[llvm] 5b40015 - [LoopVectorize] Convert some tests to opaque pointers (NFC)

Nikita Popov via llvm-commits llvm-commits at lists.llvm.org
Wed Dec 14 06:27:45 PST 2022


Author: Nikita Popov
Date: 2022-12-14T15:27:31+01:00
New Revision: 5b40015063f6ad267ba844f6592077ab8d3b90d3

URL: https://github.com/llvm/llvm-project/commit/5b40015063f6ad267ba844f6592077ab8d3b90d3
DIFF: https://github.com/llvm/llvm-project/commit/5b40015063f6ad267ba844f6592077ab8d3b90d3.diff

LOG: [LoopVectorize] Convert some tests to opaque pointers (NFC)

For these tests, update_test_checks.py had to be rerun.
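
The conversion replaces every typed pointer (e.g. i64*, float*) with the
opaque ptr type, which makes pointer bitcasts redundant; dropping them
renumbers the unnamed values, so the autogenerated CHECK lines had to be
regenerated. A minimal before/after sketch of the pattern, taken from
scalable-avoid-scalarization.ll below:

    ; Typed pointers: loading a double through an i64* needs a bitcast.
    %0 = getelementptr i64, i64* %a, i32 %indvar
    %1 = bitcast i64* %0 to double*
    %2 = load double, double* %1, align 8

    ; Opaque pointers: ptr carries no pointee type, so the bitcast
    ; disappears and the unnamed values that follow are renumbered.
    %0 = getelementptr i64, ptr %a, i32 %indvar
    %1 = load double, ptr %0, align 8

The checks were regenerated with llvm/utils/update_test_checks.py
(invocation details, e.g. the --opt-binary path, depend on the local
build).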

Added: 
    

Modified: 
    llvm/test/Transforms/LoopVectorize/AArch64/scalable-avoid-scalarization.ll
    llvm/test/Transforms/LoopVectorize/AArch64/scalable-reduction-inloop-cond.ll
    llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-inloop-reductions.ll
    llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-reductions.ll
    llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-strict-reductions.ll
    llvm/test/Transforms/LoopVectorize/AArch64/sve-inv-store.ll
    llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll
    llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll
    llvm/test/Transforms/LoopVectorize/ARM/mve-gather-scatter-tailpred.ll
    llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-types.ll
    llvm/test/Transforms/LoopVectorize/ARM/sphinx.ll
    llvm/test/Transforms/LoopVectorize/ARM/tail-folding-not-allowed.ll
    llvm/test/Transforms/LoopVectorize/PowerPC/vectorize-bswap.ll
    llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll
    llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll
    llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll
    llvm/test/Transforms/LoopVectorize/RISCV/riscv-unroll.ll
    llvm/test/Transforms/LoopVectorize/RISCV/short-trip-count.ll
    llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll
    llvm/test/Transforms/LoopVectorize/X86/conversion-cost.ll
    llvm/test/Transforms/LoopVectorize/X86/cost-model-assert.ll
    llvm/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll
    llvm/test/Transforms/LoopVectorize/X86/imprecise-through-phis.ll
    llvm/test/Transforms/LoopVectorize/X86/invariant-load-gather.ll
    llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll
    llvm/test/Transforms/LoopVectorize/X86/limit-vf-by-tripcount.ll
    llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll
    llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll
    llvm/test/Transforms/LoopVectorize/X86/metadata-enable.ll
    llvm/test/Transforms/LoopVectorize/X86/optsize.ll
    llvm/test/Transforms/LoopVectorize/X86/pr34438.ll
    llvm/test/Transforms/LoopVectorize/X86/pr35432.ll
    llvm/test/Transforms/LoopVectorize/X86/pr47437.ll
    llvm/test/Transforms/LoopVectorize/X86/pr48340.ll
    llvm/test/Transforms/LoopVectorize/X86/pr54634.ll
    llvm/test/Transforms/LoopVectorize/X86/reduction-fastmath.ll
    llvm/test/Transforms/LoopVectorize/X86/strided_load_cost.ll
    llvm/test/Transforms/LoopVectorize/X86/tail_loop_folding.ll
    llvm/test/Transforms/LoopVectorize/X86/uniform_mem_op.ll
    llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll
    llvm/test/Transforms/LoopVectorize/X86/x86-predication.ll
    llvm/test/Transforms/LoopVectorize/dont-fold-tail-for-const-TC.ll
    llvm/test/Transforms/LoopVectorize/dont-fold-tail-for-divisible-TC.ll
    llvm/test/Transforms/LoopVectorize/epilog-vectorization-reductions.ll
    llvm/test/Transforms/LoopVectorize/first-order-recurrence-multiply-recurrences.ll
    llvm/test/Transforms/LoopVectorize/float-minmax-instruction-flag.ll
    llvm/test/Transforms/LoopVectorize/if-conversion-nest.ll
    llvm/test/Transforms/LoopVectorize/if-pred-non-void.ll
    llvm/test/Transforms/LoopVectorize/if-pred-stores.ll
    llvm/test/Transforms/LoopVectorize/induction.ll
    llvm/test/Transforms/LoopVectorize/instruction-only-used-outside-of-loop.ll
    llvm/test/Transforms/LoopVectorize/invariant-store-vectorization-2.ll
    llvm/test/Transforms/LoopVectorize/invariant-store-vectorization.ll
    llvm/test/Transforms/LoopVectorize/loop-form.ll
    llvm/test/Transforms/LoopVectorize/multiple-strides-vectorization.ll
    llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization-liveout.ll
    llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization.ll
    llvm/test/Transforms/LoopVectorize/pr30654-phiscev-sext-trunc.ll
    llvm/test/Transforms/LoopVectorize/pr45259.ll
    llvm/test/Transforms/LoopVectorize/pr50686.ll
    llvm/test/Transforms/LoopVectorize/runtime-check-needed-but-empty.ll
    llvm/test/Transforms/LoopVectorize/runtime-check-small-clamped-bounds.ll

Removed: 
    


################################################################################
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-avoid-scalarization.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-avoid-scalarization.ll
index 20edd8b9eb8d..6693b9b10a65 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-avoid-scalarization.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-avoid-scalarization.ll
@@ -8,7 +8,7 @@ target triple = "aarch64-unknown-linux-gnu"
 ; the loop, preventing the gep (and consequently the loop induction
 ; update variable) from being classified as 'uniform'.
 
-define void @test_no_scalarization(i64* %a, i32 %idx, i32 %n) #0 {
+define void @test_no_scalarization(ptr %a, i32 %idx, i32 %n) #0 {
 ; CHECK-LABEL: @test_no_scalarization(
 ; CHECK-NEXT:  L.entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = add nsw i32 [[IDX:%.*]], 1
@@ -39,23 +39,21 @@ define void @test_no_scalarization(i64* %a, i32 %idx, i32 %n) #0 {
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 2 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr i64, i64* [[A:%.*]], <vscale x 2 x i32> [[VEC_IND]]
-; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <vscale x 2 x i64*> [[TMP12]], i32 0
-; CHECK-NEXT:    [[TMP14:%.*]] = bitcast i64* [[TMP13]] to double*
-; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr double, double* [[TMP14]], i32 0
-; CHECK-NEXT:    [[TMP16:%.*]] = bitcast double* [[TMP15]] to <vscale x 2 x double>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 2 x double>, <vscale x 2 x double>* [[TMP16]], align 8
-; CHECK-NEXT:    [[TMP17:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT:    [[TMP18:%.*]] = mul i32 [[TMP17]], 2
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP18]]
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr i64, ptr [[A:%.*]], <vscale x 2 x i32> [[VEC_IND]]
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <vscale x 2 x ptr> [[TMP12]], i32 0
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr double, ptr [[TMP13]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 2 x double>, ptr [[TMP14]], align 8
+; CHECK-NEXT:    [[TMP15:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    [[TMP16:%.*]] = mul i32 [[TMP15]], 2
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP16]]
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 2 x i32> [[VEC_IND]], [[DOTSPLAT2]]
-; CHECK-NEXT:    [[TMP19:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    [[TMP17:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP20:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT:    [[TMP21:%.*]] = mul i32 [[TMP20]], 2
-; CHECK-NEXT:    [[TMP22:%.*]] = sub i32 [[TMP21]], 1
-; CHECK-NEXT:    [[TMP23:%.*]] = extractelement <vscale x 2 x i64*> [[TMP12]], i32 [[TMP22]]
+; CHECK-NEXT:    [[TMP18:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    [[TMP19:%.*]] = mul i32 [[TMP18]], 2
+; CHECK-NEXT:    [[TMP20:%.*]] = sub i32 [[TMP19]], 1
+; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <vscale x 2 x ptr> [[TMP12]], i32 [[TMP20]]
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[TMP1]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[L_EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -64,14 +62,13 @@ define void @test_no_scalarization(i64* %a, i32 %idx, i32 %n) #0 {
 ; CHECK:       L.LoopBody:
 ; CHECK-NEXT:    [[INDVAR:%.*]] = phi i32 [ [[INDVAR_NEXT:%.*]], [[L_LOOPBODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; CHECK-NEXT:    [[INDVAR_NEXT]] = add nsw i32 [[INDVAR]], 1
-; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr i64, i64* [[A]], i32 [[INDVAR]]
-; CHECK-NEXT:    [[TMP25:%.*]] = bitcast i64* [[TMP24]] to double*
-; CHECK-NEXT:    [[TMP26:%.*]] = load double, double* [[TMP25]], align 8
-; CHECK-NEXT:    [[TMP27:%.*]] = icmp slt i32 [[INDVAR_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[TMP27]], label [[L_LOOPBODY]], label [[L_EXIT]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr i64, ptr [[A]], i32 [[INDVAR]]
+; CHECK-NEXT:    [[TMP23:%.*]] = load double, ptr [[TMP22]], align 8
+; CHECK-NEXT:    [[TMP24:%.*]] = icmp slt i32 [[INDVAR_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[TMP24]], label [[L_LOOPBODY]], label [[L_EXIT]], !llvm.loop [[LOOP2:![0-9]+]]
 ; CHECK:       L.exit:
-; CHECK-NEXT:    [[DOTLCSSA:%.*]] = phi i64* [ [[TMP24]], [[L_LOOPBODY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    store i64 1, i64* [[DOTLCSSA]], align 8
+; CHECK-NEXT:    [[DOTLCSSA:%.*]] = phi ptr [ [[TMP22]], [[L_LOOPBODY]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    store i64 1, ptr [[DOTLCSSA]], align 8
 ; CHECK-NEXT:    ret void
 ;
 L.entry:
@@ -80,14 +77,13 @@ L.entry:
 L.LoopBody:                                       ; preds = %L.LoopBody, %L.entry
   %indvar = phi i32 [ %indvar.next, %L.LoopBody ], [ %idx, %L.entry ]
   %indvar.next = add nsw i32 %indvar, 1
-  %0 = getelementptr i64, i64* %a, i32 %indvar
-  %1 = bitcast i64* %0 to double*
-  %2 = load double, double* %1, align 8
-  %3 = icmp slt i32 %indvar.next, %n
-  br i1 %3, label %L.LoopBody, label %L.exit
+  %0 = getelementptr i64, ptr %a, i32 %indvar
+  %1 = load double, ptr %0, align 8
+  %2 = icmp slt i32 %indvar.next, %n
+  br i1 %2, label %L.LoopBody, label %L.exit
 
 L.exit:                                       ; preds = %L.LoopBody
-  store i64 1, i64* %0, align 8
+  store i64 1, ptr %0, align 8
   ret void
 }
 

diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reduction-inloop-cond.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reduction-inloop-cond.ll
index 8e11a1c98824..52af3bfdd603 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reduction-inloop-cond.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reduction-inloop-cond.ll
@@ -2,7 +2,7 @@
 ; RUN: opt < %s -passes=loop-vectorize -prefer-predicate-over-epilogue=scalar-epilogue -mtriple aarch64-unknown-linux-gnu \
 ; RUN:   -mattr=+sve -force-vector-interleave=1 -force-vector-width=4 -prefer-inloop-reductions -S | FileCheck %s
 
-define float @cond_fadd(float* noalias nocapture readonly %a, float* noalias nocapture readonly %cond, i64 %N){
+define float @cond_fadd(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %cond, i64 %N){
 ; CHECK-LABEL: @cond_fadd(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
@@ -17,43 +17,41 @@ define float @cond_fadd(float* noalias nocapture readonly %a, float* noalias noc
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi float [ 1.000000e+00, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi float [ 1.000000e+00, [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, float* [[COND:%.*]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP5]], i32 0
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast float* [[TMP6]] to <vscale x 4 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, <vscale x 4 x float>* [[TMP7]], align 4
-; CHECK-NEXT:    [[TMP8:%.*]] = fcmp une <vscale x 4 x float> [[WIDE_LOAD]], shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 2.000000e+00, i32 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr float, float* [[A:%.*]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr float, float* [[TMP9]], i32 0
-; CHECK-NEXT:    [[TMP11:%.*]] = bitcast float* [[TMP10]] to <vscale x 4 x float>*
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0nxv4f32(<vscale x 4 x float>* [[TMP11]], i32 4, <vscale x 4 x i1> [[TMP8]], <vscale x 4 x float> poison)
-; CHECK-NEXT:    [[TMP12:%.*]] = select fast <vscale x 4 x i1> [[TMP8]], <vscale x 4 x float> [[WIDE_MASKED_LOAD]], <vscale x 4 x float> zeroinitializer
-; CHECK-NEXT:    [[TMP13:%.*]] = call fast float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, <vscale x 4 x float> [[TMP12]])
-; CHECK-NEXT:    [[TMP14]] = fadd fast float [[TMP13]], [[VEC_PHI]]
-; CHECK-NEXT:    [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP16:%.*]] = mul i64 [[TMP15]], 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP16]]
-; CHECK-NEXT:    [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, ptr [[COND:%.*]], i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP5]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP6]], align 4
+; CHECK-NEXT:    [[TMP7:%.*]] = fcmp une <vscale x 4 x float> [[WIDE_LOAD]], shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 2.000000e+00, i32 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr float, ptr [[A:%.*]], i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr float, ptr [[TMP8]], i32 0
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP9]], i32 4, <vscale x 4 x i1> [[TMP7]], <vscale x 4 x float> poison)
+; CHECK-NEXT:    [[TMP10:%.*]] = select fast <vscale x 4 x i1> [[TMP7]], <vscale x 4 x float> [[WIDE_MASKED_LOAD]], <vscale x 4 x float> zeroinitializer
+; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, <vscale x 4 x float> [[TMP10]])
+; CHECK-NEXT:    [[TMP12]] = fadd fast float [[TMP11]], [[VEC_PHI]]
+; CHECK-NEXT:    [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP14:%.*]] = mul i64 [[TMP13]], 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP14]]
+; CHECK-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ 1.000000e+00, [[ENTRY]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ 1.000000e+00, [[ENTRY]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[INDVARS:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_NEXT:%.*]], [[FOR_INC:%.*]] ]
 ; CHECK-NEXT:    [[RDX:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[RES:%.*]], [[FOR_INC]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[COND]], i64 [[INDVARS]]
-; CHECK-NEXT:    [[TMP18:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[TOBOOL:%.*]] = fcmp une float [[TMP18]], 2.000000e+00
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[COND]], i64 [[INDVARS]]
+; CHECK-NEXT:    [[TMP16:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[TOBOOL:%.*]] = fcmp une float [[TMP16]], 2.000000e+00
 ; CHECK-NEXT:    br i1 [[TOBOOL]], label [[IF_THEN:%.*]], label [[FOR_INC]]
 ; CHECK:       if.then:
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS]]
-; CHECK-NEXT:    [[TMP19:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-NEXT:    [[FADD:%.*]] = fadd fast float [[RDX]], [[TMP19]]
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS]]
+; CHECK-NEXT:    [[TMP17:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
+; CHECK-NEXT:    [[FADD:%.*]] = fadd fast float [[RDX]], [[TMP17]]
 ; CHECK-NEXT:    br label [[FOR_INC]]
 ; CHECK:       for.inc:
 ; CHECK-NEXT:    [[RES]] = phi float [ [[FADD]], [[IF_THEN]] ], [ [[RDX]], [[FOR_BODY]] ]
@@ -61,7 +59,7 @@ define float @cond_fadd(float* noalias nocapture readonly %a, float* noalias noc
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_NEXT]], [[N]]
 ; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
 ; CHECK:       for.end:
-; CHECK-NEXT:    [[RES_LCSSA:%.*]] = phi float [ [[RES]], [[FOR_INC]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[RES_LCSSA:%.*]] = phi float [ [[RES]], [[FOR_INC]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    ret float [[RES_LCSSA]]
 ;
 entry:
@@ -70,14 +68,14 @@ entry:
 for.body:
   %indvars = phi i64 [ 0, %entry ], [ %indvars.next, %for.inc ]
   %rdx = phi float [ 1.000000e+00, %entry ], [ %res, %for.inc ]
-  %arrayidx = getelementptr inbounds float, float* %cond, i64 %indvars
-  %0 = load float, float* %arrayidx
+  %arrayidx = getelementptr inbounds float, ptr %cond, i64 %indvars
+  %0 = load float, ptr %arrayidx
   %tobool = fcmp une float %0, 2.000000e+00
   br i1 %tobool, label %if.then, label %for.inc
 
 if.then:
-  %arrayidx2 = getelementptr inbounds float, float* %a, i64 %indvars
-  %1 = load float, float* %arrayidx2
+  %arrayidx2 = getelementptr inbounds float, ptr %a, i64 %indvars
+  %1 = load float, ptr %arrayidx2
   %fadd = fadd fast float %rdx, %1
   br label %for.inc
 
@@ -91,7 +89,7 @@ for.end:
   ret float %res
 }
 
-define float @cond_cmp_sel(float* noalias %a, float* noalias %cond, i64 %N) {
+define float @cond_cmp_sel(ptr noalias %a, ptr noalias %cond, i64 %N) {
 ; CHECK-LABEL: @cond_cmp_sel(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
@@ -108,24 +106,22 @@ define float @cond_cmp_sel(float* noalias %a, float* noalias %cond, i64 %N) {
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi float [ 1.000000e+00, [[VECTOR_PH]] ], [ [[RDX_MINMAX_SELECT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, float* [[COND:%.*]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP5]], i32 0
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast float* [[TMP6]] to <vscale x 4 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, <vscale x 4 x float>* [[TMP7]], align 4
-; CHECK-NEXT:    [[TMP8:%.*]] = fcmp une <vscale x 4 x float> [[WIDE_LOAD]], shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 3.000000e+00, i32 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr float, float* [[A:%.*]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr float, float* [[TMP9]], i32 0
-; CHECK-NEXT:    [[TMP11:%.*]] = bitcast float* [[TMP10]] to <vscale x 4 x float>*
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0nxv4f32(<vscale x 4 x float>* [[TMP11]], i32 4, <vscale x 4 x i1> [[TMP8]], <vscale x 4 x float> poison)
-; CHECK-NEXT:    [[TMP12:%.*]] = select fast <vscale x 4 x i1> [[TMP8]], <vscale x 4 x float> [[WIDE_MASKED_LOAD]], <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 0x7FF0000000000000, i32 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT:    [[TMP13:%.*]] = call fast float @llvm.vector.reduce.fmin.nxv4f32(<vscale x 4 x float> [[TMP12]])
-; CHECK-NEXT:    [[RDX_MINMAX_CMP:%.*]] = fcmp fast olt float [[TMP13]], [[VEC_PHI]]
-; CHECK-NEXT:    [[RDX_MINMAX_SELECT]] = select fast i1 [[RDX_MINMAX_CMP]], float [[TMP13]], float [[VEC_PHI]]
-; CHECK-NEXT:    [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP15:%.*]] = mul i64 [[TMP14]], 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP15]]
-; CHECK-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, ptr [[COND:%.*]], i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP5]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP6]], align 4
+; CHECK-NEXT:    [[TMP7:%.*]] = fcmp une <vscale x 4 x float> [[WIDE_LOAD]], shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 3.000000e+00, i32 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr float, ptr [[A:%.*]], i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr float, ptr [[TMP8]], i32 0
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP9]], i32 4, <vscale x 4 x i1> [[TMP7]], <vscale x 4 x float> poison)
+; CHECK-NEXT:    [[TMP10:%.*]] = select fast <vscale x 4 x i1> [[TMP7]], <vscale x 4 x float> [[WIDE_MASKED_LOAD]], <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 0x7FF0000000000000, i32 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.vector.reduce.fmin.nxv4f32(<vscale x 4 x float> [[TMP10]])
+; CHECK-NEXT:    [[RDX_MINMAX_CMP:%.*]] = fcmp fast olt float [[TMP11]], [[VEC_PHI]]
+; CHECK-NEXT:    [[RDX_MINMAX_SELECT]] = select fast i1 [[RDX_MINMAX_CMP]], float [[TMP11]], float [[VEC_PHI]]
+; CHECK-NEXT:    [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP13:%.*]] = mul i64 [[TMP12]], 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP13]]
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -136,15 +132,15 @@ define float @cond_cmp_sel(float* noalias %a, float* noalias %cond, i64 %N) {
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
 ; CHECK-NEXT:    [[RDX:%.*]] = phi float [ [[RES:%.*]], [[FOR_INC]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[COND]], i64 [[IV]]
-; CHECK-NEXT:    [[TMP17:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[TOBOOL:%.*]] = fcmp une float [[TMP17]], 3.000000e+00
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[COND]], i64 [[IV]]
+; CHECK-NEXT:    [[TMP15:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[TOBOOL:%.*]] = fcmp une float [[TMP15]], 3.000000e+00
 ; CHECK-NEXT:    br i1 [[TOBOOL]], label [[IF_THEN:%.*]], label [[FOR_INC]]
 ; CHECK:       if.then:
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]]
-; CHECK-NEXT:    [[TMP18:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-NEXT:    [[FCMP:%.*]] = fcmp fast olt float [[RDX]], [[TMP18]]
-; CHECK-NEXT:    [[FSEL:%.*]] = select fast i1 [[FCMP]], float [[RDX]], float [[TMP18]]
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]]
+; CHECK-NEXT:    [[TMP16:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
+; CHECK-NEXT:    [[FCMP:%.*]] = fcmp fast olt float [[RDX]], [[TMP16]]
+; CHECK-NEXT:    [[FSEL:%.*]] = select fast i1 [[FCMP]], float [[RDX]], float [[TMP16]]
 ; CHECK-NEXT:    br label [[FOR_INC]]
 ; CHECK:       for.inc:
 ; CHECK-NEXT:    [[RES]] = phi float [ [[RDX]], [[FOR_BODY]] ], [ [[FSEL]], [[IF_THEN]] ]
@@ -161,14 +157,14 @@ entry:
 for.body:
   %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.inc ]
   %rdx = phi float [ %res, %for.inc ], [ 1.000000e+00, %entry ]
-  %arrayidx = getelementptr inbounds float, float* %cond, i64 %iv
-  %0 = load float, float* %arrayidx
+  %arrayidx = getelementptr inbounds float, ptr %cond, i64 %iv
+  %0 = load float, ptr %arrayidx
   %tobool = fcmp une float %0, 3.000000e+00
   br i1 %tobool, label %if.then, label %for.inc
 
 if.then:
-  %arrayidx2 = getelementptr inbounds float, float* %a, i64 %iv
-  %1 = load float, float* %arrayidx2
+  %arrayidx2 = getelementptr inbounds float, ptr %a, i64 %iv
+  %1 = load float, ptr %arrayidx2
   %fcmp = fcmp fast olt float %rdx, %1
   %fsel = select fast i1 %fcmp, float %rdx, float %1
   br label %for.inc

diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-inloop-reductions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-inloop-reductions.ll
index fbc436fc87d1..bbeda2af2bab 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-inloop-reductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-inloop-reductions.ll
@@ -4,7 +4,7 @@
 ;
 ; In-loop integer and reduction
 ;
-define i64 @int_reduction_and(i64* noalias nocapture %a, i64 %N) {
+define i64 @int_reduction_and(ptr noalias nocapture %a, i64 %N) {
 ; CHECK-LABEL: @int_reduction_and(
 ; CHECK-NEXT:  iter.check:
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 2
@@ -22,35 +22,33 @@ define i64 @int_reduction_and(i64* noalias nocapture %a, i64 %N) {
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi i64 [ 1, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI2:%.*]] = phi i64 [ -1, [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi i64 [ 1, [[VECTOR_PH]] ], [ [[TMP17:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI2:%.*]] = phi i64 [ -1, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 0
 ; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 2
 ; CHECK-NEXT:    [[TMP7:%.*]] = add i64 [[TMP6]], 0
 ; CHECK-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 1
 ; CHECK-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]]
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i64, i64* [[A:%.*]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[TMP9]]
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i64, i64* [[TMP10]], i32 0
-; CHECK-NEXT:    [[TMP13:%.*]] = bitcast i64* [[TMP12]] to <vscale x 2 x i64>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, <vscale x 2 x i64>* [[TMP13]], align 4
-; CHECK-NEXT:    [[TMP14:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT:    [[TMP15:%.*]] = mul i32 [[TMP14]], 2
-; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i64, i64* [[TMP10]], i32 [[TMP15]]
-; CHECK-NEXT:    [[TMP17:%.*]] = bitcast i64* [[TMP16]] to <vscale x 2 x i64>*
-; CHECK-NEXT:    [[WIDE_LOAD3:%.*]] = load <vscale x 2 x i64>, <vscale x 2 x i64>* [[TMP17]], align 4
-; CHECK-NEXT:    [[TMP18:%.*]] = call i64 @llvm.vector.reduce.and.nxv2i64(<vscale x 2 x i64> [[WIDE_LOAD]])
-; CHECK-NEXT:    [[TMP19]] = and i64 [[TMP18]], [[VEC_PHI]]
-; CHECK-NEXT:    [[TMP20:%.*]] = call i64 @llvm.vector.reduce.and.nxv2i64(<vscale x 2 x i64> [[WIDE_LOAD3]])
-; CHECK-NEXT:    [[TMP21]] = and i64 [[TMP20]], [[VEC_PHI2]]
-; CHECK-NEXT:    [[TMP22:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP23:%.*]] = mul i64 [[TMP22]], 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP23]]
-; CHECK-NEXT:    [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]]
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[TMP10]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP12]], align 4
+; CHECK-NEXT:    [[TMP13:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    [[TMP14:%.*]] = mul i32 [[TMP13]], 2
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[TMP10]], i32 [[TMP14]]
+; CHECK-NEXT:    [[WIDE_LOAD3:%.*]] = load <vscale x 2 x i64>, ptr [[TMP15]], align 4
+; CHECK-NEXT:    [[TMP16:%.*]] = call i64 @llvm.vector.reduce.and.nxv2i64(<vscale x 2 x i64> [[WIDE_LOAD]])
+; CHECK-NEXT:    [[TMP17]] = and i64 [[TMP16]], [[VEC_PHI]]
+; CHECK-NEXT:    [[TMP18:%.*]] = call i64 @llvm.vector.reduce.and.nxv2i64(<vscale x 2 x i64> [[WIDE_LOAD3]])
+; CHECK-NEXT:    [[TMP19]] = and i64 [[TMP18]], [[VEC_PHI2]]
+; CHECK-NEXT:    [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP21:%.*]] = mul i64 [[TMP20]], 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP21]]
+; CHECK-NEXT:    [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[BIN_RDX:%.*]] = and i64 [[TMP21]], [[TMP19]]
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = and i64 [[TMP19]], [[TMP17]]
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
 ; CHECK:       vec.epilog.iter.check:
@@ -64,36 +62,35 @@ define i64 @int_reduction_and(i64* noalias nocapture %a, i64 %N) {
 ; CHECK-NEXT:    [[N_VEC5:%.*]] = sub i64 [[N]], [[N_MOD_VF4]]
 ; CHECK-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
 ; CHECK:       vec.epilog.vector.body:
-; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT10:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI8:%.*]] = phi i64 [ [[BC_MERGE_RDX]], [[VEC_EPILOG_PH]] ], [ [[TMP30:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP25:%.*]] = add i64 [[OFFSET_IDX]], 0
-; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[TMP25]]
-; CHECK-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i64, i64* [[TMP26]], i32 0
-; CHECK-NEXT:    [[TMP28:%.*]] = bitcast i64* [[TMP27]] to <2 x i64>*
-; CHECK-NEXT:    [[WIDE_LOAD9:%.*]] = load <2 x i64>, <2 x i64>* [[TMP28]], align 4
-; CHECK-NEXT:    [[TMP29:%.*]] = call i64 @llvm.vector.reduce.and.v2i64(<2 x i64> [[WIDE_LOAD9]])
-; CHECK-NEXT:    [[TMP30]] = and i64 [[TMP29]], [[VEC_PHI8]]
-; CHECK-NEXT:    [[INDEX_NEXT10]] = add nuw i64 [[OFFSET_IDX]], 2
-; CHECK-NEXT:    [[TMP31:%.*]] = icmp eq i64 [[INDEX_NEXT10]], [[N_VEC5]]
-; CHECK-NEXT:    br i1 [[TMP31]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK-NEXT:    [[INDEX7:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT10:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI8:%.*]] = phi i64 [ [[BC_MERGE_RDX]], [[VEC_EPILOG_PH]] ], [ [[TMP27:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP23:%.*]] = add i64 [[INDEX7]], 0
+; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP23]]
+; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i64, ptr [[TMP24]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD9:%.*]] = load <2 x i64>, ptr [[TMP25]], align 4
+; CHECK-NEXT:    [[TMP26:%.*]] = call i64 @llvm.vector.reduce.and.v2i64(<2 x i64> [[WIDE_LOAD9]])
+; CHECK-NEXT:    [[TMP27]] = and i64 [[TMP26]], [[VEC_PHI8]]
+; CHECK-NEXT:    [[INDEX_NEXT10]] = add nuw i64 [[INDEX7]], 2
+; CHECK-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT10]], [[N_VEC5]]
+; CHECK-NEXT:    br i1 [[TMP28]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
 ; CHECK:       vec.epilog.middle.block:
 ; CHECK-NEXT:    [[CMP_N6:%.*]] = icmp eq i64 [[N]], [[N_VEC5]]
 ; CHECK-NEXT:    br i1 [[CMP_N6]], label [[FOR_END]], label [[VEC_EPILOG_SCALAR_PH]]
 ; CHECK:       vec.epilog.scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC5]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX11:%.*]] = phi i64 [ 1, [[ITER_CHECK]] ], [ [[BIN_RDX]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[TMP30]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX11:%.*]] = phi i64 [ 1, [[ITER_CHECK]] ], [ [[BIN_RDX]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[TMP27]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ]
 ; CHECK-NEXT:    [[RDX:%.*]] = phi i64 [ [[AND:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX11]], [[VEC_EPILOG_SCALAR_PH]] ]
-; CHECK-NEXT:    [[L2:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[IV]]
-; CHECK-NEXT:    [[L3:%.*]] = load i64, i64* [[L2]], align 4
+; CHECK-NEXT:    [[L2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
+; CHECK-NEXT:    [[L3:%.*]] = load i64, ptr [[L2]], align 4
 ; CHECK-NEXT:    [[AND]] = and i64 [[RDX]], [[L3]]
 ; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK:       for.end:
-; CHECK-NEXT:    [[AND_LCSSA:%.*]] = phi i64 [ [[AND]], [[FOR_BODY]] ], [ [[BIN_RDX]], [[MIDDLE_BLOCK]] ], [ [[TMP30]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[AND_LCSSA:%.*]] = phi i64 [ [[AND]], [[FOR_BODY]] ], [ [[BIN_RDX]], [[MIDDLE_BLOCK]] ], [ [[TMP27]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    ret i64 [[AND_LCSSA]]
 ;
 entry:
@@ -102,8 +99,8 @@ entry:
 for.body:
   %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ]
   %rdx = phi i64 [ %and, %for.body ], [ 1, %entry ]
-  %l2 = getelementptr inbounds i64, i64* %a, i64 %iv
-  %l3 = load i64, i64* %l2
+  %l2 = getelementptr inbounds i64, ptr %a, i64 %iv
+  %l3 = load i64, ptr %l2
   %and = and i64 %rdx, %l3
   %iv.next = add i64 %iv, 1
   %exitcond = icmp eq i64 %iv.next, %N

diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-reductions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-reductions.ll
index 18e5ccae271b..82119bab1568 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-reductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-reductions.ll
@@ -4,7 +4,7 @@
 ;
 ; Integer reduction with interleaving & a start value of 5
 ;
-define i64 @int_reduction_add(i64* %a, i64 %N) {
+define i64 @int_reduction_add(ptr %a, i64 %N) {
 ; CHECK-LABEL: @int_reduction_add(
 ; CHECK-NEXT:  iter.check:
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 2
@@ -22,34 +22,32 @@ define i64 @int_reduction_add(i64* %a, i64 %N) {
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ insertelement (<vscale x 2 x i64> zeroinitializer, i64 5, i32 0), [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI2:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ insertelement (<vscale x 2 x i64> zeroinitializer, i64 5, i32 0), [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI2:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP17:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 0
 ; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 2
 ; CHECK-NEXT:    [[TMP7:%.*]] = add i64 [[TMP6]], 0
 ; CHECK-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 1
 ; CHECK-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]]
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i64, i64* [[A:%.*]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[TMP9]]
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i64, i64* [[TMP10]], i32 0
-; CHECK-NEXT:    [[TMP13:%.*]] = bitcast i64* [[TMP12]] to <vscale x 2 x i64>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, <vscale x 2 x i64>* [[TMP13]], align 4
-; CHECK-NEXT:    [[TMP14:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT:    [[TMP15:%.*]] = mul i32 [[TMP14]], 2
-; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i64, i64* [[TMP10]], i32 [[TMP15]]
-; CHECK-NEXT:    [[TMP17:%.*]] = bitcast i64* [[TMP16]] to <vscale x 2 x i64>*
-; CHECK-NEXT:    [[WIDE_LOAD3:%.*]] = load <vscale x 2 x i64>, <vscale x 2 x i64>* [[TMP17]], align 4
-; CHECK-NEXT:    [[TMP18]] = add <vscale x 2 x i64> [[WIDE_LOAD]], [[VEC_PHI]]
-; CHECK-NEXT:    [[TMP19]] = add <vscale x 2 x i64> [[WIDE_LOAD3]], [[VEC_PHI2]]
-; CHECK-NEXT:    [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP21:%.*]] = mul i64 [[TMP20]], 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP21]]
-; CHECK-NEXT:    [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]]
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[TMP10]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP12]], align 4
+; CHECK-NEXT:    [[TMP13:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    [[TMP14:%.*]] = mul i32 [[TMP13]], 2
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[TMP10]], i32 [[TMP14]]
+; CHECK-NEXT:    [[WIDE_LOAD3:%.*]] = load <vscale x 2 x i64>, ptr [[TMP15]], align 4
+; CHECK-NEXT:    [[TMP16]] = add <vscale x 2 x i64> [[WIDE_LOAD]], [[VEC_PHI]]
+; CHECK-NEXT:    [[TMP17]] = add <vscale x 2 x i64> [[WIDE_LOAD3]], [[VEC_PHI2]]
+; CHECK-NEXT:    [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP19:%.*]] = mul i64 [[TMP18]], 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP19]]
+; CHECK-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[BIN_RDX:%.*]] = add <vscale x 2 x i64> [[TMP19]], [[TMP18]]
-; CHECK-NEXT:    [[TMP23:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[BIN_RDX]])
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = add <vscale x 2 x i64> [[TMP17]], [[TMP16]]
+; CHECK-NEXT:    [[TMP21:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[BIN_RDX]])
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
 ; CHECK:       vec.epilog.iter.check:
@@ -57,43 +55,42 @@ define i64 @int_reduction_add(i64* %a, i64 %N) {
 ; CHECK-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 2
 ; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
 ; CHECK:       vec.epilog.ph:
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i64 [ 5, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ], [ [[TMP23]], [[VEC_EPILOG_ITER_CHECK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i64 [ 5, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ], [ [[TMP21]], [[VEC_EPILOG_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[N_MOD_VF4:%.*]] = urem i64 [[N]], 2
 ; CHECK-NEXT:    [[N_VEC5:%.*]] = sub i64 [[N]], [[N_MOD_VF4]]
-; CHECK-NEXT:    [[TMP24:%.*]] = insertelement <2 x i64> zeroinitializer, i64 [[BC_MERGE_RDX]], i32 0
+; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <2 x i64> zeroinitializer, i64 [[BC_MERGE_RDX]], i32 0
 ; CHECK-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
 ; CHECK:       vec.epilog.vector.body:
-; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT10:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI8:%.*]] = phi <2 x i64> [ [[TMP24]], [[VEC_EPILOG_PH]] ], [ [[TMP29:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP25:%.*]] = add i64 [[OFFSET_IDX]], 0
-; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[TMP25]]
-; CHECK-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i64, i64* [[TMP26]], i32 0
-; CHECK-NEXT:    [[TMP28:%.*]] = bitcast i64* [[TMP27]] to <2 x i64>*
-; CHECK-NEXT:    [[WIDE_LOAD9:%.*]] = load <2 x i64>, <2 x i64>* [[TMP28]], align 4
-; CHECK-NEXT:    [[TMP29]] = add <2 x i64> [[WIDE_LOAD9]], [[VEC_PHI8]]
-; CHECK-NEXT:    [[INDEX_NEXT10]] = add nuw i64 [[OFFSET_IDX]], 2
-; CHECK-NEXT:    [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT10]], [[N_VEC5]]
-; CHECK-NEXT:    br i1 [[TMP30]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK-NEXT:    [[INDEX7:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT10:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI8:%.*]] = phi <2 x i64> [ [[TMP22]], [[VEC_EPILOG_PH]] ], [ [[TMP26:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP23:%.*]] = add i64 [[INDEX7]], 0
+; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP23]]
+; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i64, ptr [[TMP24]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD9:%.*]] = load <2 x i64>, ptr [[TMP25]], align 4
+; CHECK-NEXT:    [[TMP26]] = add <2 x i64> [[WIDE_LOAD9]], [[VEC_PHI8]]
+; CHECK-NEXT:    [[INDEX_NEXT10]] = add nuw i64 [[INDEX7]], 2
+; CHECK-NEXT:    [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT10]], [[N_VEC5]]
+; CHECK-NEXT:    br i1 [[TMP27]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
 ; CHECK:       vec.epilog.middle.block:
-; CHECK-NEXT:    [[TMP31:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[TMP29]])
+; CHECK-NEXT:    [[TMP28:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[TMP26]])
 ; CHECK-NEXT:    [[CMP_N6:%.*]] = icmp eq i64 [[N]], [[N_VEC5]]
 ; CHECK-NEXT:    br i1 [[CMP_N6]], label [[FOR_END]], label [[VEC_EPILOG_SCALAR_PH]]
 ; CHECK:       vec.epilog.scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC5]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX11:%.*]] = phi i64 [ 5, [[ITER_CHECK]] ], [ [[TMP23]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[TMP31]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX11:%.*]] = phi i64 [ 5, [[ITER_CHECK]] ], [ [[TMP21]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[TMP28]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
 ; CHECK-NEXT:    [[SUM:%.*]] = phi i64 [ [[BC_MERGE_RDX11]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[IV]]
-; CHECK-NEXT:    [[TMP32:%.*]] = load i64, i64* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[ADD]] = add i64 [[TMP32]], [[SUM]]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
+; CHECK-NEXT:    [[TMP29:%.*]] = load i64, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[ADD]] = add i64 [[TMP29]], [[SUM]]
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
 ; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK:       for.end:
-; CHECK-NEXT:    [[ADD_LCSSA:%.*]] = phi i64 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ], [ [[TMP31]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[ADD_LCSSA:%.*]] = phi i64 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ], [ [[TMP28]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    ret i64 [[ADD_LCSSA]]
 ;
 entry:
@@ -102,8 +99,8 @@ entry:
 for.body:
   %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
   %sum = phi i64 [ 5, %entry ], [ %add, %for.body ]
-  %arrayidx = getelementptr inbounds i64, i64* %a, i64 %iv
-  %0 = load i64, i64* %arrayidx
+  %arrayidx = getelementptr inbounds i64, ptr %a, i64 %iv
+  %0 = load i64, ptr %arrayidx
   %add = add i64 %0, %sum
   %iv.next = add nuw nsw i64 %iv, 1
   %exitcond.not = icmp eq i64 %iv.next, %N

diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-strict-reductions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-strict-reductions.ll
index 6220034dcd78..1648a2f6bd1b 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-strict-reductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-strict-reductions.ll
@@ -4,7 +4,7 @@
 ;
 ; Strict fadd reduction with interleaving
 ;
-define float @fadd_strict(float* noalias nocapture readonly %a, i64 %n) {
+define float @fadd_strict(ptr noalias nocapture readonly %a, i64 %n) {
 ; CHECK-LABEL: @fadd_strict(
 ; CHECK-NEXT:  iter.check:
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 2
@@ -22,30 +22,28 @@ define float @fadd_strict(float* noalias nocapture readonly %a, i64 %n) {
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi float [ 0xFFFFFFFFE0000000, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi float [ 0xFFFFFFFFE0000000, [[VECTOR_PH]] ], [ [[TMP17:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 0
 ; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 4
 ; CHECK-NEXT:    [[TMP7:%.*]] = add i64 [[TMP6]], 0
 ; CHECK-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 1
 ; CHECK-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]]
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP9]]
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP10]], i32 0
-; CHECK-NEXT:    [[TMP13:%.*]] = bitcast float* [[TMP12]] to <vscale x 4 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, <vscale x 4 x float>* [[TMP13]], align 4
-; CHECK-NEXT:    [[TMP14:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT:    [[TMP15:%.*]] = mul i32 [[TMP14]], 4
-; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds float, float* [[TMP10]], i32 [[TMP15]]
-; CHECK-NEXT:    [[TMP17:%.*]] = bitcast float* [[TMP16]] to <vscale x 4 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 4 x float>, <vscale x 4 x float>* [[TMP17]], align 4
-; CHECK-NEXT:    [[TMP18:%.*]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], <vscale x 4 x float> [[WIDE_LOAD]])
-; CHECK-NEXT:    [[TMP19]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[TMP18]], <vscale x 4 x float> [[WIDE_LOAD2]])
-; CHECK-NEXT:    [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP21:%.*]] = mul i64 [[TMP20]], 8
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP21]]
-; CHECK-NEXT:    [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP9]]
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds float, ptr [[TMP10]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP12]], align 4
+; CHECK-NEXT:    [[TMP13:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    [[TMP14:%.*]] = mul i32 [[TMP13]], 4
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds float, ptr [[TMP10]], i32 [[TMP14]]
+; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 4 x float>, ptr [[TMP15]], align 4
+; CHECK-NEXT:    [[TMP16:%.*]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], <vscale x 4 x float> [[WIDE_LOAD]])
+; CHECK-NEXT:    [[TMP17]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[TMP16]], <vscale x 4 x float> [[WIDE_LOAD2]])
+; CHECK-NEXT:    [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP19:%.*]] = mul i64 [[TMP18]], 8
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP19]]
+; CHECK-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
@@ -54,41 +52,40 @@ define float @fadd_strict(float* noalias nocapture readonly %a, i64 %n) {
 ; CHECK-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 2
 ; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
 ; CHECK:       vec.epilog.ph:
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ 0xFFFFFFFFE0000000, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ], [ [[TMP19]], [[VEC_EPILOG_ITER_CHECK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ 0xFFFFFFFFE0000000, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ], [ [[TMP17]], [[VEC_EPILOG_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[N_MOD_VF3:%.*]] = urem i64 [[N]], 2
 ; CHECK-NEXT:    [[N_VEC4:%.*]] = sub i64 [[N]], [[N_MOD_VF3]]
 ; CHECK-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
 ; CHECK:       vec.epilog.vector.body:
-; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT9:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI7:%.*]] = phi float [ [[BC_MERGE_RDX]], [[VEC_EPILOG_PH]] ], [ [[TMP27:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP23:%.*]] = add i64 [[OFFSET_IDX]], 0
-; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP23]]
-; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr inbounds float, float* [[TMP24]], i32 0
-; CHECK-NEXT:    [[TMP26:%.*]] = bitcast float* [[TMP25]] to <2 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD8:%.*]] = load <2 x float>, <2 x float>* [[TMP26]], align 4
-; CHECK-NEXT:    [[TMP27]] = call float @llvm.vector.reduce.fadd.v2f32(float [[VEC_PHI7]], <2 x float> [[WIDE_LOAD8]])
-; CHECK-NEXT:    [[INDEX_NEXT9]] = add nuw i64 [[OFFSET_IDX]], 2
-; CHECK-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT9]], [[N_VEC4]]
-; CHECK-NEXT:    br i1 [[TMP28]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK-NEXT:    [[INDEX6:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT9:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI7:%.*]] = phi float [ [[BC_MERGE_RDX]], [[VEC_EPILOG_PH]] ], [ [[TMP24:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP21:%.*]] = add i64 [[INDEX6]], 0
+; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP21]]
+; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds float, ptr [[TMP22]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD8:%.*]] = load <2 x float>, ptr [[TMP23]], align 4
+; CHECK-NEXT:    [[TMP24]] = call float @llvm.vector.reduce.fadd.v2f32(float [[VEC_PHI7]], <2 x float> [[WIDE_LOAD8]])
+; CHECK-NEXT:    [[INDEX_NEXT9]] = add nuw i64 [[INDEX6]], 2
+; CHECK-NEXT:    [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT9]], [[N_VEC4]]
+; CHECK-NEXT:    br i1 [[TMP25]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
 ; CHECK:       vec.epilog.middle.block:
 ; CHECK-NEXT:    [[CMP_N5:%.*]] = icmp eq i64 [[N]], [[N_VEC4]]
 ; CHECK-NEXT:    br i1 [[CMP_N5]], label [[FOR_END]], label [[VEC_EPILOG_SCALAR_PH]]
 ; CHECK:       vec.epilog.scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC4]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX10:%.*]] = phi float [ 0xFFFFFFFFE0000000, [[ITER_CHECK]] ], [ [[TMP19]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[TMP27]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX10:%.*]] = phi float [ 0xFFFFFFFFE0000000, [[ITER_CHECK]] ], [ [[TMP17]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[TMP24]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
 ; CHECK-NEXT:    [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX10]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]]
-; CHECK-NEXT:    [[TMP29:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[ADD]] = fadd float [[TMP29]], [[SUM_07]]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]]
+; CHECK-NEXT:    [[TMP26:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[ADD]] = fadd float [[TMP26]], [[SUM_07]]
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
 ; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK:       for.end:
-; CHECK-NEXT:    [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP19]], [[MIDDLE_BLOCK]] ], [ [[TMP27]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ], [ [[TMP24]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    ret float [[ADD_LCSSA]]
 ;
 entry:
@@ -97,8 +94,8 @@ entry:
 for.body:
   %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
   %sum.07 = phi float [ 0xFFFFFFFFE0000000, %entry ], [ %add, %for.body ]
-  %arrayidx = getelementptr inbounds float, float* %a, i64 %iv
-  %0 = load float, float* %arrayidx, align 4
+  %arrayidx = getelementptr inbounds float, ptr %a, i64 %iv
+  %0 = load float, ptr %arrayidx, align 4
   %add = fadd float %0, %sum.07
   %iv.next = add nuw nsw i64 %iv, 1
   %exitcond.not = icmp eq i64 %iv.next, %n

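The sve-inv-store.ll changes below follow the standard opaque-pointer rewrite: typed pointers such as i16* all become the single ptr type, and the bitcast that previously converted a scalar pointer into a vector pointer type is now a no-op and disappears. Deleting that instruction shifts every later unnamed value down by one, which is why the CHECK lines are renumbered wholesale rather than hand-edited. A minimal before/after sketch of the pattern (value names are illustrative, taken from the diff below):

  ; typed pointers: the wide load needs a pointer of vector type
  %6 = getelementptr inbounds i16, i16* %5, i32 0
  %7 = bitcast i16* %6 to <vscale x 4 x i16>*
  %wide.load = load <vscale x 4 x i16>, <vscale x 4 x i16>* %7, align 2

  ; opaque pointers: the bitcast is gone and the load takes ptr directly
  %6 = getelementptr inbounds i16, ptr %5, i32 0
  %wide.load = load <vscale x 4 x i16>, ptr %6, align 2
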
diff  --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-inv-store.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-inv-store.ll
index 50f1f7cdc183..a110cb0e4849 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-inv-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-inv-store.ll
@@ -3,7 +3,7 @@
 
 target triple = "aarch64-unknown-linux-gnu"
 
-define void @inv_store_i16(i16* noalias %dst, i16* noalias readonly %src, i64 %N) #0 {
+define void @inv_store_i16(ptr noalias %dst, ptr noalias readonly %src, i64 %N) #0 {
 ; CHECK-LABEL: @inv_store_i16(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
@@ -19,20 +19,19 @@ define void @inv_store_i16(i16* noalias %dst, i16* noalias readonly %src, i64 %N
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i16, i16* [[SRC:%.*]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i16, i16* [[TMP5]], i32 0
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i16* [[TMP6]] to <vscale x 4 x i16>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i16>, <vscale x 4 x i16>* [[TMP7]], align 2
-; CHECK-NEXT:    [[TMP8:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT:    [[TMP9:%.*]] = mul i32 [[TMP8]], 4
-; CHECK-NEXT:    [[TMP10:%.*]] = sub i32 [[TMP9]], 1
-; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <vscale x 4 x i16> [[WIDE_LOAD]], i32 [[TMP10]]
-; CHECK-NEXT:    store i16 [[TMP11]], i16* [[DST:%.*]], align 2
-; CHECK-NEXT:    [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP13:%.*]] = mul i64 [[TMP12]], 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP13]]
-; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i16, ptr [[SRC:%.*]], i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[TMP5]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i16>, ptr [[TMP6]], align 2
+; CHECK-NEXT:    [[TMP7:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    [[TMP8:%.*]] = mul i32 [[TMP7]], 4
+; CHECK-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP8]], 1
+; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <vscale x 4 x i16> [[WIDE_LOAD]], i32 [[TMP9]]
+; CHECK-NEXT:    store i16 [[TMP10]], ptr [[DST:%.*]], align 2
+; CHECK-NEXT:    [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP12:%.*]] = mul i64 [[TMP11]], 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP12]]
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_INC24:%.*]], label [[SCALAR_PH]]
@@ -41,9 +40,9 @@ define void @inv_store_i16(i16* noalias %dst, i16* noalias readonly %src, i64 %N
 ; CHECK-NEXT:    br label [[FOR_BODY14:%.*]]
 ; CHECK:       for.body14:
 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY14]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[SRC]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[LD:%.*]] = load i16, i16* [[ARRAYIDX]], align 2
-; CHECK-NEXT:    store i16 [[LD]], i16* [[DST]], align 2
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[SRC]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[LD:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
+; CHECK-NEXT:    store i16 [[LD]], ptr [[DST]], align 2
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]]
 ; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_INC24]], label [[FOR_BODY14]], !llvm.loop [[LOOP3:![0-9]+]]
@@ -55,9 +54,9 @@ entry:
 
 for.body14:                                       ; preds = %for.body14, %entry
   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body14 ]
-  %arrayidx = getelementptr inbounds i16, i16* %src, i64 %indvars.iv
-  %ld = load i16, i16* %arrayidx
-  store i16 %ld, i16* %dst, align 2
+  %arrayidx = getelementptr inbounds i16, ptr %src, i64 %indvars.iv
+  %ld = load i16, ptr %arrayidx
+  store i16 %ld, ptr %dst, align 2
   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
   %exitcond.not = icmp eq i64 %indvars.iv.next, %N
   br i1 %exitcond.not, label %for.inc24, label %for.body14, !llvm.loop !0
@@ -67,7 +66,7 @@ for.inc24:                                        ; preds = %for.body14, %for.bo
 }
 
 
-define void @cond_inv_store_i32(i32* noalias %dst, i32* noalias readonly %src, i64 %N) #0 {
+define void @cond_inv_store_i32(ptr noalias %dst, ptr noalias readonly %src, i64 %N) #0 {
 ; CHECK-LABEL: @cond_inv_store_i32(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
@@ -79,23 +78,22 @@ define void @cond_inv_store_i32(i32* noalias %dst, i32* noalias readonly %src, i
 ; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32*> poison, i32* [[DST:%.*]], i32 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32*> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32*> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x ptr> poison, ptr [[DST:%.*]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x ptr> [[BROADCAST_SPLATINSERT]], <vscale x 4 x ptr> poison, <vscale x 4 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i32 0
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <vscale x 4 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* [[TMP7]], align 4
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp sgt <vscale x 4 x i32> [[WIDE_LOAD]], zeroinitializer
-; CHECK-NEXT:    call void @llvm.masked.scatter.nxv4i32.nxv4p0i32(<vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i32*> [[BROADCAST_SPLAT]], i32 4, <vscale x 4 x i1> [[TMP8]])
-; CHECK-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]]
-; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP6]], align 4
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp sgt <vscale x 4 x i32> [[WIDE_LOAD]], zeroinitializer
+; CHECK-NEXT:    call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x ptr> [[BROADCAST_SPLAT]], i32 4, <vscale x 4 x i1> [[TMP7]])
+; CHECK-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]]
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -104,12 +102,12 @@ define void @cond_inv_store_i32(i32* noalias %dst, i32* noalias readonly %src, i
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[I_09:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 [[I_09]]
-; CHECK-NEXT:    [[TMP12:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[CMP1:%.*]] = icmp sgt i32 [[TMP12]], 0
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[I_09]]
+; CHECK-NEXT:    [[TMP11:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp sgt i32 [[TMP11]], 0
 ; CHECK-NEXT:    br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
 ; CHECK:       if.then:
-; CHECK-NEXT:    store i32 [[TMP12]], i32* [[DST]], align 4
+; CHECK-NEXT:    store i32 [[TMP11]], ptr [[DST]], align 4
 ; CHECK-NEXT:    br label [[FOR_INC]]
 ; CHECK:       for.inc:
 ; CHECK-NEXT:    [[INC]] = add nuw nsw i64 [[I_09]], 1
@@ -123,13 +121,13 @@ entry:
 
 for.body:                                         ; preds = %entry, %for.inc
   %i.09 = phi i64 [ %inc, %for.inc ], [ 0, %entry ]
-  %arrayidx = getelementptr inbounds i32, i32* %src, i64 %i.09
-  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx = getelementptr inbounds i32, ptr %src, i64 %i.09
+  %0 = load i32, ptr %arrayidx, align 4
   %cmp1 = icmp sgt i32 %0, 0
   br i1 %cmp1, label %if.then, label %for.inc
 
 if.then:                                          ; preds = %for.body
-  store i32 %0, i32* %dst, align 4
+  store i32 %0, ptr %dst, align 4
   br label %for.inc
 
 for.inc:                                          ; preds = %for.body, %if.then

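In sve-tail-folding-forced.ll the same renumbering applies to the masked intrinsics. Overloaded pointer operands used to encode the pointee type in the mangled intrinsic name; with opaque pointers only the address space survives, so llvm.masked.store.nxv4i32.p0nxv4i32 becomes llvm.masked.store.nxv4i32.p0. A sketch of the corresponding declarations (the i32 alignment and i1 mask operands are unchanged):

  ; typed pointers: pointee type is part of the mangled suffix
  declare void @llvm.masked.store.nxv4i32.p0nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>*, i32, <vscale x 4 x i1>)
  ; opaque pointers: only the address space (p0) remains
  declare void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32>, ptr, i32, <vscale x 4 x i1>)
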
diff  --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll
index 4b0e6eea3a1f..2c186cecf080 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll
@@ -30,14 +30,14 @@ target triple = "aarch64-unknown-linux-gnu"
 ; VPLANS-NEXT:   No successors
 ; VPLANS-NEXT: }
 
-define void @simple_memset(i32 %val, i32* %ptr, i64 %n) #0 {
+define void @simple_memset(i32 %val, ptr %ptr, i64 %n) #0 {
 ; CHECK-LABEL: @simple_memset(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1)
-; CHECK-NEXT:    [[TMP2:%.*]] = sub i64 -1, [[UMAX]]
-; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; CHECK-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP2]], [[TMP1]]
+; CHECK-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[UMAX]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
 ; CHECK-NEXT:    br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
@@ -48,35 +48,46 @@ define void @simple_memset(i32 %val, i32* %ptr, i64 %n) #0 {
 ; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 [[UMAX]], [[TMP8]]
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]])
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]])
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[VAL:%.*]], i32 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK2:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK4:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX1]], 0
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i32, i32* [[PTR:%.*]], i64 [[TMP9]]
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr i32, i32* [[TMP10]], i32 0
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <vscale x 4 x i32>*
-; CHECK-NEXT:    call void @llvm.masked.store.nxv4i32.p0nxv4i32(<vscale x 4 x i32> [[BROADCAST_SPLAT]], <vscale x 4 x i32>* [[TMP12]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK2]])
-; CHECK-NEXT:    [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP14:%.*]] = mul i64 [[TMP13]], 4
-; CHECK-NEXT:    [[INDEX_NEXT3]] = add i64 [[INDEX1]], [[TMP14]]
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK4]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT3]], i64 [[UMAX]])
-; CHECK-NEXT:    [[TMP15:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK4]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <vscale x 4 x i1> [[TMP15]], i32 0
-; CHECK-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[TMP9]]
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr i32, ptr [[TMP10]], i32 0
+; CHECK-NEXT:    call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP11]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT:    [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP13:%.*]] = mul i64 [[TMP12]], 4
+; CHECK-NEXT:    [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP13]]
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT2]], i64 [[UMAX]])
+; CHECK-NEXT:    [[TMP14:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <vscale x 4 x i1> [[TMP14]], i32 0
+; CHECK-NEXT:    br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[WHILE_BODY:%.*]]
+; CHECK:       while.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i32, ptr [[PTR]], i64 [[INDEX]]
+; CHECK-NEXT:    store i32 [[VAL]], ptr [[GEP]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nsw i64 [[INDEX]], 1
+; CHECK-NEXT:    [[CMP10:%.*]] = icmp ult i64 [[INDEX_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[CMP10]], label [[WHILE_BODY]], label [[WHILE_END_LOOPEXIT]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK:       while.end.loopexit:
+; CHECK-NEXT:    ret void
 ;
 entry:
   br label %while.body
 
 while.body:                                       ; preds = %while.body, %entry
   %index = phi i64 [ %index.next, %while.body ], [ 0, %entry ]
-  %gep = getelementptr i32, i32* %ptr, i64 %index
-  store i32 %val, i32* %gep
+  %gep = getelementptr i32, ptr %ptr, i64 %index
+  store i32 %val, ptr %gep
   %index.next = add nsw i64 %index, 1
   %cmp10 = icmp ult i64 %index.next, %n
   br i1 %cmp10, label %while.body, label %while.end.loopexit, !llvm.loop !0

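sve-tail-folding.ll additionally exercises gathers and scatters, where the address operand is a vector of pointers: <vscale x 4 x i32*> becomes <vscale x 4 x ptr>, and the mangling suffix nxv4p0i32 collapses to nxv4p0 for the same reason as above. Roughly (%ptrs and %mask are placeholders; in the tests they come from the broadcast splat and the active lane mask):

  ; typed pointers
  %g = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0i32(<vscale x 4 x i32*> %ptrs, i32 4, <vscale x 4 x i1> %mask, <vscale x 4 x i32> poison)
  ; opaque pointers
  %g = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> %ptrs, i32 4, <vscale x 4 x i1> %mask, <vscale x 4 x i32> poison)
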
diff  --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll
index b6d6135ca13e..000c338b9351 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll
@@ -4,7 +4,7 @@
 target triple = "aarch64-unknown-linux-gnu"
 
 
-define void @simple_memset(i32 %val, i32* %ptr, i64 %n) #0 {
+define void @simple_memset(i32 %val, ptr %ptr, i64 %n) #0 {
 ; CHECK-LABEL: @simple_memset(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1)
@@ -30,17 +30,16 @@ define void @simple_memset(i32 %val, i32* %ptr, i64 %n) #0 {
 ; CHECK-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX1]], 0
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i32, i32* [[PTR:%.*]], i64 [[TMP9]]
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr i32, i32* [[TMP10]], i32 0
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <vscale x 4 x i32>*
-; CHECK-NEXT:    call void @llvm.masked.store.nxv4i32.p0nxv4i32(<vscale x 4 x i32> [[BROADCAST_SPLAT]], <vscale x 4 x i32>* [[TMP12]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT:    [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP14:%.*]] = mul i64 [[TMP13]], 4
-; CHECK-NEXT:    [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP14]]
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[TMP9]]
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr i32, ptr [[TMP10]], i32 0
+; CHECK-NEXT:    call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP11]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT:    [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP13:%.*]] = mul i64 [[TMP12]], 4
+; CHECK-NEXT:    [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP13]]
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT2]], i64 [[UMAX]])
-; CHECK-NEXT:    [[TMP15:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <vscale x 4 x i1> [[TMP15]], i32 0
-; CHECK-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    [[TMP14:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <vscale x 4 x i1> [[TMP14]], i32 0
+; CHECK-NEXT:    br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -48,8 +47,8 @@ define void @simple_memset(i32 %val, i32* %ptr, i64 %n) #0 {
 ; CHECK-NEXT:    br label [[WHILE_BODY:%.*]]
 ; CHECK:       while.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i32, i32* [[PTR]], i64 [[INDEX]]
-; CHECK-NEXT:    store i32 [[VAL]], i32* [[GEP]], align 4
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i32, ptr [[PTR]], i64 [[INDEX]]
+; CHECK-NEXT:    store i32 [[VAL]], ptr [[GEP]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nsw i64 [[INDEX]], 1
 ; CHECK-NEXT:    [[CMP10:%.*]] = icmp ult i64 [[INDEX_NEXT]], [[N]]
 ; CHECK-NEXT:    br i1 [[CMP10]], label [[WHILE_BODY]], label [[WHILE_END_LOOPEXIT]], !llvm.loop [[LOOP2:![0-9]+]]
@@ -61,8 +60,8 @@ entry:
 
 while.body:                                       ; preds = %while.body, %entry
   %index = phi i64 [ %index.next, %while.body ], [ 0, %entry ]
-  %gep = getelementptr i32, i32* %ptr, i64 %index
-  store i32 %val, i32* %gep
+  %gep = getelementptr i32, ptr %ptr, i64 %index
+  store i32 %val, ptr %gep
   %index.next = add nsw i64 %index, 1
   %cmp10 = icmp ult i64 %index.next, %n
   br i1 %cmp10, label %while.body, label %while.end.loopexit, !llvm.loop !0
@@ -72,7 +71,7 @@ while.end.loopexit:                               ; preds = %while.body
 }
 
 
-define void @simple_memset_v4i32(i32 %val, i32* %ptr, i64 %n) #0 {
+define void @simple_memset_v4i32(i32 %val, ptr %ptr, i64 %n) #0 {
 ; CHECK-LABEL: @simple_memset_v4i32(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1)
@@ -89,15 +88,14 @@ define void @simple_memset_v4i32(i32 %val, i32* %ptr, i64 %n) #0 {
 ; CHECK-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX1]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i32, i32* [[PTR:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i32, i32* [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
-; CHECK-NEXT:    call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> [[BROADCAST_SPLAT]], <4 x i32>* [[TMP3]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i32, ptr [[TMP1]], i32 0
+; CHECK-NEXT:    call void @llvm.masked.store.v4i32.p0(<4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP2]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]])
 ; CHECK-NEXT:    [[INDEX_NEXT2]] = add i64 [[INDEX1]], 4
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[INDEX_NEXT2]], i64 [[UMAX]])
-; CHECK-NEXT:    [[TMP4:%.*]] = xor <4 x i1> [[ACTIVE_LANE_MASK_NEXT]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i1> [[TMP4]], i32 0
-; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT:    [[TMP3:%.*]] = xor <4 x i1> [[ACTIVE_LANE_MASK_NEXT]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x i1> [[TMP3]], i32 0
+; CHECK-NEXT:    br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -105,8 +103,8 @@ define void @simple_memset_v4i32(i32 %val, i32* %ptr, i64 %n) #0 {
 ; CHECK-NEXT:    br label [[WHILE_BODY:%.*]]
 ; CHECK:       while.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i32, i32* [[PTR]], i64 [[INDEX]]
-; CHECK-NEXT:    store i32 [[VAL]], i32* [[GEP]], align 4
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i32, ptr [[PTR]], i64 [[INDEX]]
+; CHECK-NEXT:    store i32 [[VAL]], ptr [[GEP]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nsw i64 [[INDEX]], 1
 ; CHECK-NEXT:    [[CMP10:%.*]] = icmp ult i64 [[INDEX_NEXT]], [[N]]
 ; CHECK-NEXT:    br i1 [[CMP10]], label [[WHILE_BODY]], label [[WHILE_END_LOOPEXIT]], !llvm.loop [[LOOP5:![0-9]+]]
@@ -118,8 +116,8 @@ entry:
 
 while.body:                                       ; preds = %while.body, %entry
   %index = phi i64 [ %index.next, %while.body ], [ 0, %entry ]
-  %gep = getelementptr i32, i32* %ptr, i64 %index
-  store i32 %val, i32* %gep
+  %gep = getelementptr i32, ptr %ptr, i64 %index
+  store i32 %val, ptr %gep
   %index.next = add nsw i64 %index, 1
   %cmp10 = icmp ult i64 %index.next, %n
   br i1 %cmp10, label %while.body, label %while.end.loopexit, !llvm.loop !3
@@ -129,7 +127,7 @@ while.end.loopexit:                               ; preds = %while.body
 }
 
 
-define void @simple_memcpy(i32* noalias %dst, i32* noalias %src, i64 %n) #0 {
+define void @simple_memcpy(ptr noalias %dst, ptr noalias %src, i64 %n) #0 {
 ; CHECK-LABEL: @simple_memcpy(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1)
@@ -153,21 +151,19 @@ define void @simple_memcpy(i32* noalias %dst, i32* noalias %src, i64 %n) #0 {
 ; CHECK-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX1]], 0
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i32, i32* [[SRC:%.*]], i64 [[TMP9]]
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr i32, i32* [[TMP10]], i32 0
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <vscale x 4 x i32>*
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* [[TMP12]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
-; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr i32, i32* [[DST:%.*]], i64 [[TMP9]]
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr i32, i32* [[TMP13]], i32 0
-; CHECK-NEXT:    [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <vscale x 4 x i32>*
-; CHECK-NEXT:    call void @llvm.masked.store.nxv4i32.p0nxv4i32(<vscale x 4 x i32> [[WIDE_MASKED_LOAD]], <vscale x 4 x i32>* [[TMP15]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT:    [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP17:%.*]] = mul i64 [[TMP16]], 4
-; CHECK-NEXT:    [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP17]]
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i32, ptr [[SRC:%.*]], i64 [[TMP9]]
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr i32, ptr [[TMP10]], i32 0
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP11]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr i32, ptr [[DST:%.*]], i64 [[TMP9]]
+; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr i32, ptr [[TMP12]], i32 0
+; CHECK-NEXT:    call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[WIDE_MASKED_LOAD]], ptr [[TMP13]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT:    [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP15:%.*]] = mul i64 [[TMP14]], 4
+; CHECK-NEXT:    [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP15]]
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT2]], i64 [[UMAX]])
-; CHECK-NEXT:    [[TMP18:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT:    [[TMP19:%.*]] = extractelement <vscale x 4 x i1> [[TMP18]], i32 0
-; CHECK-NEXT:    br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT:    [[TMP16:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <vscale x 4 x i1> [[TMP16]], i32 0
+; CHECK-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -175,10 +171,10 @@ define void @simple_memcpy(i32* noalias %dst, i32* noalias %src, i64 %n) #0 {
 ; CHECK-NEXT:    br label [[WHILE_BODY:%.*]]
 ; CHECK:       while.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr i32, i32* [[SRC]], i64 [[INDEX]]
-; CHECK-NEXT:    [[VAL:%.*]] = load i32, i32* [[GEP1]], align 4
-; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr i32, i32* [[DST]], i64 [[INDEX]]
-; CHECK-NEXT:    store i32 [[VAL]], i32* [[GEP2]], align 4
+; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[INDEX]]
+; CHECK-NEXT:    [[VAL:%.*]] = load i32, ptr [[GEP1]], align 4
+; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr i32, ptr [[DST]], i64 [[INDEX]]
+; CHECK-NEXT:    store i32 [[VAL]], ptr [[GEP2]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nsw i64 [[INDEX]], 1
 ; CHECK-NEXT:    [[CMP10:%.*]] = icmp ult i64 [[INDEX_NEXT]], [[N]]
 ; CHECK-NEXT:    br i1 [[CMP10]], label [[WHILE_BODY]], label [[WHILE_END_LOOPEXIT]], !llvm.loop [[LOOP7:![0-9]+]]
@@ -190,10 +186,10 @@ entry:
 
 while.body:                                       ; preds = %while.body, %entry
   %index = phi i64 [ %index.next, %while.body ], [ 0, %entry ]
-  %gep1 = getelementptr i32, i32* %src, i64 %index
-  %val = load i32, i32* %gep1
-  %gep2 = getelementptr i32, i32* %dst, i64 %index
-  store i32 %val, i32* %gep2
+  %gep1 = getelementptr i32, ptr %src, i64 %index
+  %val = load i32, ptr %gep1
+  %gep2 = getelementptr i32, ptr %dst, i64 %index
+  store i32 %val, ptr %gep2
   %index.next = add nsw i64 %index, 1
   %cmp10 = icmp ult i64 %index.next, %n
   br i1 %cmp10, label %while.body, label %while.end.loopexit, !llvm.loop !0
@@ -203,7 +199,7 @@ while.end.loopexit:                               ; preds = %while.body
 }
 
 
-define void @copy_stride4(i32* noalias %dst, i32* noalias %src, i64 %n) #0 {
+define void @copy_stride4(ptr noalias %dst, ptr noalias %src, i64 %n) #0 {
 ; CHECK-LABEL: @copy_stride4(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 4)
@@ -240,10 +236,10 @@ define void @copy_stride4(i32* noalias %dst, i32* noalias %src, i64 %n) #0 {
 ; CHECK-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr i32, i32* [[SRC:%.*]], <vscale x 4 x i64> [[VEC_IND]]
-; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0i32(<vscale x 4 x i32*> [[TMP18]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
-; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr i32, i32* [[DST:%.*]], <vscale x 4 x i64> [[VEC_IND]]
-; CHECK-NEXT:    call void @llvm.masked.scatter.nxv4i32.nxv4p0i32(<vscale x 4 x i32> [[WIDE_MASKED_GATHER]], <vscale x 4 x i32*> [[TMP19]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr i32, ptr [[SRC:%.*]], <vscale x 4 x i64> [[VEC_IND]]
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP18]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
+; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr i32, ptr [[DST:%.*]], <vscale x 4 x i64> [[VEC_IND]]
+; CHECK-NEXT:    call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[WIDE_MASKED_GATHER]], <vscale x 4 x ptr> [[TMP19]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
 ; CHECK-NEXT:    [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP21:%.*]] = mul i64 [[TMP20]], 4
 ; CHECK-NEXT:    [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP21]]
@@ -259,10 +255,10 @@ define void @copy_stride4(i32* noalias %dst, i32* noalias %src, i64 %n) #0 {
 ; CHECK-NEXT:    br label [[WHILE_BODY:%.*]]
 ; CHECK:       while.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr i32, i32* [[SRC]], i64 [[INDEX]]
-; CHECK-NEXT:    [[VAL:%.*]] = load i32, i32* [[GEP1]], align 4
-; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr i32, i32* [[DST]], i64 [[INDEX]]
-; CHECK-NEXT:    store i32 [[VAL]], i32* [[GEP2]], align 4
+; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[INDEX]]
+; CHECK-NEXT:    [[VAL:%.*]] = load i32, ptr [[GEP1]], align 4
+; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr i32, ptr [[DST]], i64 [[INDEX]]
+; CHECK-NEXT:    store i32 [[VAL]], ptr [[GEP2]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nsw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[CMP10:%.*]] = icmp ult i64 [[INDEX_NEXT]], [[N]]
 ; CHECK-NEXT:    br i1 [[CMP10]], label [[WHILE_BODY]], label [[WHILE_END_LOOPEXIT]], !llvm.loop [[LOOP9:![0-9]+]]
@@ -274,10 +270,10 @@ entry:
 
 while.body:                                       ; preds = %while.body, %entry
   %index = phi i64 [ %index.next, %while.body ], [ 0, %entry ]
-  %gep1 = getelementptr i32, i32* %src, i64 %index
-  %val = load i32, i32* %gep1
-  %gep2 = getelementptr i32, i32* %dst, i64 %index
-  store i32 %val, i32* %gep2
+  %gep1 = getelementptr i32, ptr %src, i64 %index
+  %val = load i32, ptr %gep1
+  %gep2 = getelementptr i32, ptr %dst, i64 %index
+  store i32 %val, ptr %gep2
   %index.next = add nsw i64 %index, 4
   %cmp10 = icmp ult i64 %index.next, %n
   br i1 %cmp10, label %while.body, label %while.end.loopexit, !llvm.loop !0
@@ -287,7 +283,7 @@ while.end.loopexit:                               ; preds = %while.body
 }
 
 
-define void @simple_gather_scatter(i32* noalias %dst, i32* noalias %src, i32* noalias %ind, i64 %n) #0 {
+define void @simple_gather_scatter(ptr noalias %dst, ptr noalias %src, ptr noalias %ind, i64 %n) #0 {
 ; CHECK-LABEL: @simple_gather_scatter(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1)
@@ -311,21 +307,20 @@ define void @simple_gather_scatter(i32* noalias %dst, i32* noalias %src, i32* no
 ; CHECK-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX1]], 0
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i32, i32* [[IND:%.*]], i64 [[TMP9]]
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr i32, i32* [[TMP10]], i32 0
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <vscale x 4 x i32>*
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* [[TMP12]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
-; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr i32, i32* [[SRC:%.*]], <vscale x 4 x i32> [[WIDE_MASKED_LOAD]]
-; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0i32(<vscale x 4 x i32*> [[TMP13]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr i32, i32* [[DST:%.*]], <vscale x 4 x i32> [[WIDE_MASKED_LOAD]]
-; CHECK-NEXT:    call void @llvm.masked.scatter.nxv4i32.nxv4p0i32(<vscale x 4 x i32> [[WIDE_MASKED_GATHER]], <vscale x 4 x i32*> [[TMP14]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT:    [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP16:%.*]] = mul i64 [[TMP15]], 4
-; CHECK-NEXT:    [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP16]]
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i32, ptr [[IND:%.*]], i64 [[TMP9]]
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr i32, ptr [[TMP10]], i32 0
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP11]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr i32, ptr [[SRC:%.*]], <vscale x 4 x i32> [[WIDE_MASKED_LOAD]]
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP12]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
+; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr i32, ptr [[DST:%.*]], <vscale x 4 x i32> [[WIDE_MASKED_LOAD]]
+; CHECK-NEXT:    call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[WIDE_MASKED_GATHER]], <vscale x 4 x ptr> [[TMP13]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT:    [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP15:%.*]] = mul i64 [[TMP14]], 4
+; CHECK-NEXT:    [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP15]]
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT2]], i64 [[UMAX]])
-; CHECK-NEXT:    [[TMP17:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <vscale x 4 x i1> [[TMP17]], i32 0
-; CHECK-NEXT:    br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK-NEXT:    [[TMP16:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <vscale x 4 x i1> [[TMP16]], i32 0
+; CHECK-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -333,12 +328,12 @@ define void @simple_gather_scatter(i32* noalias %dst, i32* noalias %src, i32* no
 ; CHECK-NEXT:    br label [[WHILE_BODY:%.*]]
 ; CHECK:       while.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr i32, i32* [[IND]], i64 [[INDEX]]
-; CHECK-NEXT:    [[IND_VAL:%.*]] = load i32, i32* [[GEP1]], align 4
-; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr i32, i32* [[SRC]], i32 [[IND_VAL]]
-; CHECK-NEXT:    [[VAL:%.*]] = load i32, i32* [[GEP2]], align 4
-; CHECK-NEXT:    [[GEP3:%.*]] = getelementptr i32, i32* [[DST]], i32 [[IND_VAL]]
-; CHECK-NEXT:    store i32 [[VAL]], i32* [[GEP3]], align 4
+; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr i32, ptr [[IND]], i64 [[INDEX]]
+; CHECK-NEXT:    [[IND_VAL:%.*]] = load i32, ptr [[GEP1]], align 4
+; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr i32, ptr [[SRC]], i32 [[IND_VAL]]
+; CHECK-NEXT:    [[VAL:%.*]] = load i32, ptr [[GEP2]], align 4
+; CHECK-NEXT:    [[GEP3:%.*]] = getelementptr i32, ptr [[DST]], i32 [[IND_VAL]]
+; CHECK-NEXT:    store i32 [[VAL]], ptr [[GEP3]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nsw i64 [[INDEX]], 1
 ; CHECK-NEXT:    [[CMP10:%.*]] = icmp ult i64 [[INDEX_NEXT]], [[N]]
 ; CHECK-NEXT:    br i1 [[CMP10]], label [[WHILE_BODY]], label [[WHILE_END_LOOPEXIT]], !llvm.loop [[LOOP11:![0-9]+]]
@@ -350,12 +345,12 @@ entry:
 
 while.body:                                       ; preds = %while.body, %entry
   %index = phi i64 [ %index.next, %while.body ], [ 0, %entry ]
-  %gep1 = getelementptr i32, i32* %ind, i64 %index
-  %ind_val = load i32, i32* %gep1
-  %gep2 = getelementptr i32, i32* %src, i32 %ind_val
-  %val = load i32, i32* %gep2
-  %gep3 = getelementptr i32, i32* %dst, i32 %ind_val
-  store i32 %val, i32* %gep3
+  %gep1 = getelementptr i32, ptr %ind, i64 %index
+  %ind_val = load i32, ptr %gep1
+  %gep2 = getelementptr i32, ptr %src, i32 %ind_val
+  %val = load i32, ptr %gep2
+  %gep3 = getelementptr i32, ptr %dst, i32 %ind_val
+  store i32 %val, ptr %gep3
   %index.next = add nsw i64 %index, 1
   %cmp10 = icmp ult i64 %index.next, %n
   br i1 %cmp10, label %while.body, label %while.end.loopexit, !llvm.loop !0
@@ -367,7 +362,7 @@ while.end.loopexit:                               ; preds = %while.body
 
 ; The original loop had an unconditional uniform load. Let's make sure
 ; we don't artificially create new predicated blocks for the load.
-define void @uniform_load(i32* noalias %dst, i32* noalias readonly %src, i64 %n) #0 {
+define void @uniform_load(ptr noalias %dst, ptr noalias readonly %src, i64 %n) #0 {
 ; CHECK-LABEL: @uniform_load(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
@@ -390,20 +385,19 @@ define void @uniform_load(i32* noalias %dst, i32* noalias readonly %src, i64 %n)
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP10:%.*]] = load i32, i32* [[SRC:%.*]], align 4
+; CHECK-NEXT:    [[TMP10:%.*]] = load i32, ptr [[SRC:%.*]], align 4
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP10]], i32 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 [[TMP9]]
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP11]], i32 0
-; CHECK-NEXT:    [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <vscale x 4 x i32>*
-; CHECK-NEXT:    call void @llvm.masked.store.nxv4i32.p0nxv4i32(<vscale x 4 x i32> [[BROADCAST_SPLAT]], <vscale x 4 x i32>* [[TMP13]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT:    [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP15:%.*]] = mul i64 [[TMP14]], 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP15]]
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 [[TMP9]]
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i32 0
+; CHECK-NEXT:    call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP12]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT:    [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP14:%.*]] = mul i64 [[TMP13]], 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP14]]
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT]], i64 [[N]])
-; CHECK-NEXT:    [[TMP16:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <vscale x 4 x i1> [[TMP16]], i32 0
-; CHECK-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK-NEXT:    [[TMP15:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <vscale x 4 x i1> [[TMP15]], i32 0
+; CHECK-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -411,9 +405,9 @@ define void @uniform_load(i32* noalias %dst, i32* noalias readonly %src, i64 %n)
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[VAL:%.*]] = load i32, i32* [[SRC]], align 4
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    store i32 [[VAL]], i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[VAL:%.*]] = load i32, ptr [[SRC]], align 4
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    store i32 [[VAL]], ptr [[ARRAYIDX]], align 4
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]]
 ; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
@@ -426,9 +420,9 @@ entry:
 
 for.body:                                         ; preds = %entry, %for.body
   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
-  %val = load i32, i32* %src, align 4
-  %arrayidx = getelementptr inbounds i32, i32* %dst, i64 %indvars.iv
-  store i32 %val, i32* %arrayidx, align 4
+  %val = load i32, ptr %src, align 4
+  %arrayidx = getelementptr inbounds i32, ptr %dst, i64 %indvars.iv
+  store i32 %val, ptr %arrayidx, align 4
   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
   %exitcond.not = icmp eq i64 %indvars.iv.next, %n
   br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
@@ -442,7 +436,7 @@ for.end:                                          ; preds = %for.body, %entry
 ; do need to perform conditional loads and so we end up using a gather instead.
 ; However, we at least ensure the mask is the overlap of the loop predicate
 ; and the original condition.
-define void @cond_uniform_load(i32* noalias %dst, i32* noalias readonly %src, i32* noalias readonly %cond, i64 %n) #0 {
+define void @cond_uniform_load(ptr noalias %dst, ptr noalias readonly %src, ptr noalias readonly %cond, i64 %n) #0 {
 ; CHECK-LABEL: @cond_uniform_load(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
@@ -460,35 +454,33 @@ define void @cond_uniform_load(i32* noalias %dst, i32* noalias readonly %src, i3
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]])
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32*> poison, i32* [[SRC:%.*]], i32 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32*> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32*> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x ptr> poison, ptr [[SRC:%.*]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x ptr> [[BROADCAST_SPLATINSERT]], <vscale x 4 x ptr> poison, <vscale x 4 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX1]], 0
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[COND:%.*]], i64 [[TMP9]]
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP10]], i32 0
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <vscale x 4 x i32>*
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* [[TMP12]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
-; CHECK-NEXT:    [[TMP13:%.*]] = icmp eq <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], zeroinitializer
-; CHECK-NEXT:    [[TMP14:%.*]] = xor <vscale x 4 x i1> [[TMP13]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT:    [[TMP15:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i1> [[TMP14]], <vscale x 4 x i1> zeroinitializer
-; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0i32(<vscale x 4 x i32*> [[BROADCAST_SPLAT]], i32 4, <vscale x 4 x i1> [[TMP15]], <vscale x 4 x i32> poison)
-; CHECK-NEXT:    [[TMP16:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i1> [[TMP13]], <vscale x 4 x i1> zeroinitializer
-; CHECK-NEXT:    [[PREDPHI:%.*]] = select <vscale x 4 x i1> [[TMP16]], <vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32> [[WIDE_MASKED_GATHER]]
-; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 [[TMP9]]
-; CHECK-NEXT:    [[TMP18:%.*]] = or <vscale x 4 x i1> [[TMP15]], [[TMP16]]
-; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i32, i32* [[TMP17]], i32 0
-; CHECK-NEXT:    [[TMP20:%.*]] = bitcast i32* [[TMP19]] to <vscale x 4 x i32>*
-; CHECK-NEXT:    call void @llvm.masked.store.nxv4i32.p0nxv4i32(<vscale x 4 x i32> [[PREDPHI]], <vscale x 4 x i32>* [[TMP20]], i32 4, <vscale x 4 x i1> [[TMP18]])
-; CHECK-NEXT:    [[TMP21:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP22:%.*]] = mul i64 [[TMP21]], 4
-; CHECK-NEXT:    [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP22]]
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[COND:%.*]], i64 [[TMP9]]
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 0
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP11]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], zeroinitializer
+; CHECK-NEXT:    [[TMP13:%.*]] = xor <vscale x 4 x i1> [[TMP12]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP14:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i1> [[TMP13]], <vscale x 4 x i1> zeroinitializer
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[BROADCAST_SPLAT]], i32 4, <vscale x 4 x i1> [[TMP14]], <vscale x 4 x i32> poison)
+; CHECK-NEXT:    [[TMP15:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i1> [[TMP12]], <vscale x 4 x i1> zeroinitializer
+; CHECK-NEXT:    [[PREDPHI:%.*]] = select <vscale x 4 x i1> [[TMP15]], <vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32> [[WIDE_MASKED_GATHER]]
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 [[TMP9]]
+; CHECK-NEXT:    [[TMP17:%.*]] = or <vscale x 4 x i1> [[TMP14]], [[TMP15]]
+; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 0
+; CHECK-NEXT:    call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[PREDPHI]], ptr [[TMP18]], i32 4, <vscale x 4 x i1> [[TMP17]])
+; CHECK-NEXT:    [[TMP19:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP20:%.*]] = mul i64 [[TMP19]], 4
+; CHECK-NEXT:    [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP20]]
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT2]], i64 [[N]])
-; CHECK-NEXT:    [[TMP23:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <vscale x 4 x i1> [[TMP23]], i32 0
-; CHECK-NEXT:    br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK-NEXT:    [[TMP21:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <vscale x 4 x i1> [[TMP21]], i32 0
+; CHECK-NEXT:    br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -496,17 +488,17 @@ define void @cond_uniform_load(i32* noalias %dst, i32* noalias readonly %src, i3
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[IF_END:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[COND]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP25:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i32 [[TMP25]], 0
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[COND]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP23:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i32 [[TMP23]], 0
 ; CHECK-NEXT:    br i1 [[TOBOOL_NOT]], label [[IF_END]], label [[IF_THEN:%.*]]
 ; CHECK:       if.then:
-; CHECK-NEXT:    [[TMP26:%.*]] = load i32, i32* [[SRC]], align 4
+; CHECK-NEXT:    [[TMP24:%.*]] = load i32, ptr [[SRC]], align 4
 ; CHECK-NEXT:    br label [[IF_END]]
 ; CHECK:       if.end:
-; CHECK-NEXT:    [[VAL_0:%.*]] = phi i32 [ [[TMP26]], [[IF_THEN]] ], [ 0, [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 [[INDEX]]
-; CHECK-NEXT:    store i32 [[VAL_0]], i32* [[ARRAYIDX1]], align 4
+; CHECK-NEXT:    [[VAL_0:%.*]] = phi i32 [ [[TMP24]], [[IF_THEN]] ], [ 0, [[FOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[INDEX]]
+; CHECK-NEXT:    store i32 [[VAL_0]], ptr [[ARRAYIDX1]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 1
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N]]
 ; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
@@ -519,19 +511,19 @@ entry:
 
 for.body:                                         ; preds = %entry, %if.end
   %index = phi i64 [ %index.next, %if.end ], [ 0, %entry ]
-  %arrayidx = getelementptr inbounds i32, i32* %cond, i64 %index
-  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx = getelementptr inbounds i32, ptr %cond, i64 %index
+  %0 = load i32, ptr %arrayidx, align 4
   %tobool.not = icmp eq i32 %0, 0
   br i1 %tobool.not, label %if.end, label %if.then
 
 if.then:                                          ; preds = %for.body
-  %1 = load i32, i32* %src, align 4
+  %1 = load i32, ptr %src, align 4
   br label %if.end
 
 if.end:                                           ; preds = %if.then, %for.body
   %val.0 = phi i32 [ %1, %if.then ], [ 0, %for.body ]
-  %arrayidx1 = getelementptr inbounds i32, i32* %dst, i64 %index
-  store i32 %val.0, i32* %arrayidx1, align 4
+  %arrayidx1 = getelementptr inbounds i32, ptr %dst, i64 %index
+  store i32 %val.0, ptr %arrayidx1, align 4
   %index.next = add nuw i64 %index, 1
   %exitcond.not = icmp eq i64 %index.next, %n
   br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
@@ -543,7 +535,7 @@ for.end:                                          ; preds = %for.inc, %entry
 
 ; The original loop had an unconditional uniform store. Let's make sure
 ; we don't artificially create new predicated blocks for the store.
-define void @uniform_store(i32* noalias %dst, i32* noalias readonly %src, i64 %n) #0 {
+define void @uniform_store(ptr noalias %dst, ptr noalias readonly %src, i64 %n) #0 {
 ; CHECK-LABEL: @uniform_store(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
@@ -561,25 +553,24 @@ define void @uniform_store(i32* noalias %dst, i32* noalias readonly %src, i64 %n
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]])
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32*> poison, i32* [[DST:%.*]], i32 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32*> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32*> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x ptr> poison, ptr [[DST:%.*]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x ptr> [[BROADCAST_SPLATINSERT]], <vscale x 4 x ptr> poison, <vscale x 4 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 [[TMP9]]
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP10]], i32 0
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <vscale x 4 x i32>*
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* [[TMP12]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
-; CHECK-NEXT:    call void @llvm.masked.scatter.nxv4i32.nxv4p0i32(<vscale x 4 x i32> [[WIDE_MASKED_LOAD]], <vscale x 4 x i32*> [[BROADCAST_SPLAT]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT:    [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP14:%.*]] = mul i64 [[TMP13]], 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP14]]
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 [[TMP9]]
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 0
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP11]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
+; CHECK-NEXT:    call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[WIDE_MASKED_LOAD]], <vscale x 4 x ptr> [[BROADCAST_SPLAT]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT:    [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP13:%.*]] = mul i64 [[TMP12]], 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP13]]
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT]], i64 [[N]])
-; CHECK-NEXT:    [[TMP15:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <vscale x 4 x i1> [[TMP15]], i32 0
-; CHECK-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; CHECK-NEXT:    [[TMP14:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <vscale x 4 x i1> [[TMP14]], i32 0
+; CHECK-NEXT:    br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -587,9 +578,9 @@ define void @uniform_store(i32* noalias %dst, i32* noalias readonly %src, i64 %n
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[VAL:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    store i32 [[VAL]], i32* [[DST]], align 4
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[VAL:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    store i32 [[VAL]], ptr [[DST]], align 4
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]]
 ; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
@@ -602,9 +593,9 @@ entry:
 
 for.body:                                         ; preds = %entry, %for.body
   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
-  %arrayidx = getelementptr inbounds i32, i32* %src, i64 %indvars.iv
-  %val = load i32, i32* %arrayidx, align 4
-  store i32 %val, i32* %dst, align 4
+  %arrayidx = getelementptr inbounds i32, ptr %src, i64 %indvars.iv
+  %val = load i32, ptr %arrayidx, align 4
+  store i32 %val, ptr %dst, align 4
   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
   %exitcond.not = icmp eq i64 %indvars.iv.next, %n
   br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
@@ -614,7 +605,7 @@ for.end:                                          ; preds = %for.body, %entry
 }
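; A minimal sketch of the uniform-store idiom the @uniform_store checks above
; verify: the single destination pointer is broadcast to a splat of lane
; pointers and the store becomes a masked scatter, so no new predicated
; blocks are created for the load. %vals and %mask are illustrative stand-ins
; for the loaded vector and the active-lane mask.
;
;   %ins = insertelement <vscale x 4 x ptr> poison, ptr %dst, i32 0
;   %splat = shufflevector <vscale x 4 x ptr> %ins, <vscale x 4 x ptr> poison, <vscale x 4 x i32> zeroinitializer
;   call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> %vals, <vscale x 4 x ptr> %splat, i32 4, <vscale x 4 x i1> %mask)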
 
 
-define void @simple_fdiv(float* noalias %dst, float* noalias %src, i64 %n) #0 {
+define void @simple_fdiv(ptr noalias %dst, ptr noalias %src, i64 %n) #0 {
 ; CHECK-LABEL: @simple_fdiv(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1)
@@ -638,24 +629,21 @@ define void @simple_fdiv(float* noalias %dst, float* noalias %src, i64 %n) #0 {
 ; CHECK-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX1]], 0
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr float, float* [[SRC:%.*]], i64 [[TMP9]]
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr float, float* [[DST:%.*]], i64 [[TMP9]]
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr float, float* [[TMP10]], i32 0
-; CHECK-NEXT:    [[TMP13:%.*]] = bitcast float* [[TMP12]] to <vscale x 4 x float>*
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0nxv4f32(<vscale x 4 x float>* [[TMP13]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> poison)
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr float, float* [[TMP11]], i32 0
-; CHECK-NEXT:    [[TMP15:%.*]] = bitcast float* [[TMP14]] to <vscale x 4 x float>*
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD2:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0nxv4f32(<vscale x 4 x float>* [[TMP15]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> poison)
-; CHECK-NEXT:    [[TMP16:%.*]] = fdiv <vscale x 4 x float> [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_LOAD2]]
-; CHECK-NEXT:    [[TMP17:%.*]] = bitcast float* [[TMP14]] to <vscale x 4 x float>*
-; CHECK-NEXT:    call void @llvm.masked.store.nxv4f32.p0nxv4f32(<vscale x 4 x float> [[TMP16]], <vscale x 4 x float>* [[TMP17]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT:    [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP19:%.*]] = mul i64 [[TMP18]], 4
-; CHECK-NEXT:    [[INDEX_NEXT3]] = add i64 [[INDEX1]], [[TMP19]]
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr float, ptr [[SRC:%.*]], i64 [[TMP9]]
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr float, ptr [[DST:%.*]], i64 [[TMP9]]
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr float, ptr [[TMP10]], i32 0
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP12]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> poison)
+; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr float, ptr [[TMP11]], i32 0
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD2:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP13]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> poison)
+; CHECK-NEXT:    [[TMP14:%.*]] = fdiv <vscale x 4 x float> [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_LOAD2]]
+; CHECK-NEXT:    call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[TMP14]], ptr [[TMP13]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT:    [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP16:%.*]] = mul i64 [[TMP15]], 4
+; CHECK-NEXT:    [[INDEX_NEXT3]] = add i64 [[INDEX1]], [[TMP16]]
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT3]], i64 [[UMAX]])
-; CHECK-NEXT:    [[TMP20:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <vscale x 4 x i1> [[TMP20]], i32 0
-; CHECK-NEXT:    br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
+; CHECK-NEXT:    [[TMP17:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <vscale x 4 x i1> [[TMP17]], i32 0
+; CHECK-NEXT:    br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -663,12 +651,12 @@ define void @simple_fdiv(float* noalias %dst, float* noalias %src, i64 %n) #0 {
 ; CHECK-NEXT:    br label [[WHILE_BODY:%.*]]
 ; CHECK:       while.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr float, float* [[SRC]], i64 [[INDEX]]
-; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr float, float* [[DST]], i64 [[INDEX]]
-; CHECK-NEXT:    [[VAL1:%.*]] = load float, float* [[GEP1]], align 4
-; CHECK-NEXT:    [[VAL2:%.*]] = load float, float* [[GEP2]], align 4
+; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr float, ptr [[SRC]], i64 [[INDEX]]
+; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr float, ptr [[DST]], i64 [[INDEX]]
+; CHECK-NEXT:    [[VAL1:%.*]] = load float, ptr [[GEP1]], align 4
+; CHECK-NEXT:    [[VAL2:%.*]] = load float, ptr [[GEP2]], align 4
 ; CHECK-NEXT:    [[RES:%.*]] = fdiv float [[VAL1]], [[VAL2]]
-; CHECK-NEXT:    store float [[RES]], float* [[GEP2]], align 4
+; CHECK-NEXT:    store float [[RES]], ptr [[GEP2]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nsw i64 [[INDEX]], 1
 ; CHECK-NEXT:    [[CMP10:%.*]] = icmp ult i64 [[INDEX_NEXT]], [[N]]
 ; CHECK-NEXT:    br i1 [[CMP10]], label [[WHILE_BODY]], label [[WHILE_END_LOOPEXIT]], !llvm.loop [[LOOP19:![0-9]+]]
@@ -680,12 +668,12 @@ entry:
 
 while.body:                                       ; preds = %while.body, %entry
   %index = phi i64 [ %index.next, %while.body ], [ 0, %entry ]
-  %gep1 = getelementptr float, float* %src, i64 %index
-  %gep2 = getelementptr float, float* %dst, i64 %index
-  %val1 = load float, float* %gep1
-  %val2 = load float, float* %gep2
+  %gep1 = getelementptr float, ptr %src, i64 %index
+  %gep2 = getelementptr float, ptr %dst, i64 %index
+  %val1 = load float, ptr %gep1
+  %val2 = load float, ptr %gep2
   %res = fdiv float %val1, %val2
-  store float %res, float* %gep2
+  store float %res, ptr %gep2
   %index.next = add nsw i64 %index, 1
   %cmp10 = icmp ult i64 %index.next, %n
   br i1 %cmp10, label %while.body, label %while.end.loopexit, !llvm.loop !0
@@ -696,7 +684,7 @@ while.end.loopexit:                               ; preds = %while.body
 
 ; Integer divides can throw exceptions; if we vectorize, we must ensure
 ; that speculated lanes don't fault.
-define void @simple_idiv(i32* noalias %dst, i32* noalias %src, i64 %n) #0 {
+define void @simple_idiv(ptr noalias %dst, ptr noalias %src, i64 %n) #0 {
 ; CHECK-LABEL: @simple_idiv(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1)
@@ -720,25 +708,22 @@ define void @simple_idiv(i32* noalias %dst, i32* noalias %src, i64 %n) #0 {
 ; CHECK-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX1]], 0
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i32, i32* [[SRC:%.*]], i64 [[TMP9]]
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr i32, i32* [[DST:%.*]], i64 [[TMP9]]
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr i32, i32* [[TMP10]], i32 0
-; CHECK-NEXT:    [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <vscale x 4 x i32>*
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* [[TMP13]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr i32, i32* [[TMP11]], i32 0
-; CHECK-NEXT:    [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <vscale x 4 x i32>*
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD2:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* [[TMP15]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
-; CHECK-NEXT:    [[TMP16:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> [[WIDE_MASKED_LOAD2]], <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT:    [[TMP17:%.*]] = udiv <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], [[TMP16]]
-; CHECK-NEXT:    [[TMP18:%.*]] = bitcast i32* [[TMP14]] to <vscale x 4 x i32>*
-; CHECK-NEXT:    call void @llvm.masked.store.nxv4i32.p0nxv4i32(<vscale x 4 x i32> [[TMP17]], <vscale x 4 x i32>* [[TMP18]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT:    [[TMP19:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP20:%.*]] = mul i64 [[TMP19]], 4
-; CHECK-NEXT:    [[INDEX_NEXT3]] = add i64 [[INDEX1]], [[TMP20]]
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i32, ptr [[SRC:%.*]], i64 [[TMP9]]
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr i32, ptr [[DST:%.*]], i64 [[TMP9]]
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr i32, ptr [[TMP10]], i32 0
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP12]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
+; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr i32, ptr [[TMP11]], i32 0
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD2:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP13]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
+; CHECK-NEXT:    [[TMP14:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> [[WIDE_MASKED_LOAD2]], <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP15:%.*]] = udiv <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], [[TMP14]]
+; CHECK-NEXT:    call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP15]], ptr [[TMP13]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT:    [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP17:%.*]] = mul i64 [[TMP16]], 4
+; CHECK-NEXT:    [[INDEX_NEXT3]] = add i64 [[INDEX1]], [[TMP17]]
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT3]], i64 [[UMAX]])
-; CHECK-NEXT:    [[TMP21:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <vscale x 4 x i1> [[TMP21]], i32 0
-; CHECK-NEXT:    br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; CHECK-NEXT:    [[TMP18:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP19:%.*]] = extractelement <vscale x 4 x i1> [[TMP18]], i32 0
+; CHECK-NEXT:    br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -746,12 +731,12 @@ define void @simple_idiv(i32* noalias %dst, i32* noalias %src, i64 %n) #0 {
 ; CHECK-NEXT:    br label [[WHILE_BODY:%.*]]
 ; CHECK:       while.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr i32, i32* [[SRC]], i64 [[INDEX]]
-; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr i32, i32* [[DST]], i64 [[INDEX]]
-; CHECK-NEXT:    [[VAL1:%.*]] = load i32, i32* [[GEP1]], align 4
-; CHECK-NEXT:    [[VAL2:%.*]] = load i32, i32* [[GEP2]], align 4
+; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[INDEX]]
+; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr i32, ptr [[DST]], i64 [[INDEX]]
+; CHECK-NEXT:    [[VAL1:%.*]] = load i32, ptr [[GEP1]], align 4
+; CHECK-NEXT:    [[VAL2:%.*]] = load i32, ptr [[GEP2]], align 4
 ; CHECK-NEXT:    [[RES:%.*]] = udiv i32 [[VAL1]], [[VAL2]]
-; CHECK-NEXT:    store i32 [[RES]], i32* [[GEP2]], align 4
+; CHECK-NEXT:    store i32 [[RES]], ptr [[GEP2]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nsw i64 [[INDEX]], 1
 ; CHECK-NEXT:    [[CMP10:%.*]] = icmp ult i64 [[INDEX_NEXT]], [[N]]
 ; CHECK-NEXT:    br i1 [[CMP10]], label [[WHILE_BODY]], label [[WHILE_END_LOOPEXIT]], !llvm.loop [[LOOP21:![0-9]+]]
@@ -763,12 +748,12 @@ entry:
 
 while.body:                                       ; preds = %while.body, %entry
   %index = phi i64 [ %index.next, %while.body ], [ 0, %entry ]
-  %gep1 = getelementptr i32, i32* %src, i64 %index
-  %gep2 = getelementptr i32, i32* %dst, i64 %index
-  %val1 = load i32, i32* %gep1
-  %val2 = load i32, i32* %gep2
+  %gep1 = getelementptr i32, ptr %src, i64 %index
+  %gep2 = getelementptr i32, ptr %dst, i64 %index
+  %val1 = load i32, ptr %gep1
+  %val2 = load i32, ptr %gep2
   %res = udiv i32 %val1, %val2
-  store i32 %res, i32* %gep2
+  store i32 %res, ptr %gep2
   %index.next = add nsw i64 %index, 1
   %cmp10 = icmp ult i64 %index.next, %n
   br i1 %cmp10, label %while.body, label %while.end.loopexit, !llvm.loop !0
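; A minimal sketch of the divisor guard the @simple_idiv checks above verify:
; lanes switched off by the mask select a divisor of 1 before the udiv, so no
; speculated lane can fault on a division by zero. %num, %den and %mask are
; illustrative stand-ins for the loaded operands and the active-lane mask.
;
;   %safe = select <vscale x 4 x i1> %mask, <vscale x 4 x i32> %den, <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
;   %res = udiv <vscale x 4 x i32> %num, %safe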

diff  --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-gather-scatter-tailpred.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-gather-scatter-tailpred.ll
index 1485ff609f20..c4ba0839a59e 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/mve-gather-scatter-tailpred.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-gather-scatter-tailpred.ll
@@ -3,7 +3,7 @@
 
 target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
 
-define void @test_stride1_4i32(i32* readonly %data, i32* noalias nocapture %dst, i32 %n) {
+define void @test_stride1_4i32(ptr readonly %data, ptr noalias nocapture %dst, i32 %n) {
 ; CHECK-LABEL: @test_stride1_4i32(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
@@ -18,18 +18,16 @@ define void @test_stride1_4i32(i32* readonly %data, i32* noalias nocapture %dst,
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[TMP0]], i32 [[N]])
 ; CHECK-NEXT:    [[TMP1:%.*]] = mul nuw nsw i32 [[TMP0]], 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = add nuw nsw i32 [[TMP1]], 2
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[DATA:%.*]], i32 [[TMP2]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP3]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP5]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
-; CHECK-NEXT:    [[TMP6:%.*]] = add nsw <4 x i32> <i32 5, i32 5, i32 5, i32 5>, [[WIDE_MASKED_LOAD]]
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i32 [[TMP0]]
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP7]], i32 0
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>*
-; CHECK-NEXT:    call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> [[TMP6]], <4 x i32>* [[TMP9]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i32 [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 0
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP4]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
+; CHECK-NEXT:    [[TMP5:%.*]] = add nsw <4 x i32> <i32 5, i32 5, i32 5, i32 5>, [[WIDE_MASKED_LOAD]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i32 [[TMP0]]
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0
+; CHECK-NEXT:    call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP5]], ptr [[TMP7]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]])
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -39,11 +37,11 @@ define void @test_stride1_4i32(i32* readonly %data, i32* noalias nocapture %dst,
 ; CHECK-NEXT:    [[I_023:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; CHECK-NEXT:    [[MUL:%.*]] = mul nuw nsw i32 [[I_023]], 1
 ; CHECK-NEXT:    [[ADD5:%.*]] = add nuw nsw i32 [[MUL]], 2
-; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, i32* [[DATA]], i32 [[ADD5]]
-; CHECK-NEXT:    [[TMP11:%.*]] = load i32, i32* [[ARRAYIDX6]], align 4
-; CHECK-NEXT:    [[ADD7:%.*]] = add nsw i32 5, [[TMP11]]
-; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[DST]], i32 [[I_023]]
-; CHECK-NEXT:    store i32 [[ADD7]], i32* [[ARRAYIDX9]], align 4
+; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i32 [[ADD5]]
+; CHECK-NEXT:    [[TMP9:%.*]] = load i32, ptr [[ARRAYIDX6]], align 4
+; CHECK-NEXT:    [[ADD7:%.*]] = add nsw i32 5, [[TMP9]]
+; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[I_023]]
+; CHECK-NEXT:    store i32 [[ADD7]], ptr [[ARRAYIDX9]], align 4
 ; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[I_023]], 1
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
 ; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[END]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
@@ -56,11 +54,11 @@ for.body:                                         ; preds = %for.body.preheader,
   %i.023 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
   %mul = mul nuw nsw i32 %i.023, 1
   %add5 = add nuw nsw i32 %mul, 2
-  %arrayidx6 = getelementptr inbounds i32, i32* %data, i32 %add5
-  %0 = load i32, i32* %arrayidx6, align 4
+  %arrayidx6 = getelementptr inbounds i32, ptr %data, i32 %add5
+  %0 = load i32, ptr %arrayidx6, align 4
   %add7 = add nsw i32 5, %0
-  %arrayidx9 = getelementptr inbounds i32, i32* %dst, i32 %i.023
-  store i32 %add7, i32* %arrayidx9, align 4
+  %arrayidx9 = getelementptr inbounds i32, ptr %dst, i32 %i.023
+  store i32 %add7, ptr %arrayidx9, align 4
   %inc = add nuw nsw i32 %i.023, 1
   %exitcond.not = icmp eq i32 %inc, %n
   br i1 %exitcond.not, label %end, label %for.body
@@ -68,7 +66,7 @@ end:                                 ; preds = %end, %entry
   ret void
 }
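; For reference, the rename applied throughout these hunks: under opaque
; pointers the masked intrinsics drop the pointee type from their name
; mangling. Typed-pointer form versus opaque form, taken from the
; @test_stride1_4i32 checks above (%p and %m stand in for the operands):
;
;   call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %p, i32 4, <4 x i1> %m, <4 x i32> poison)
;   call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %p, i32 4, <4 x i1> %m, <4 x i32> poison)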
 
-define void @test_stride-1_4i32(i32* readonly %data, i32* noalias nocapture %dst, i32 %n) {
+define void @test_stride-1_4i32(ptr readonly %data, ptr noalias nocapture %dst, i32 %n) {
 ; CHECK-LABEL: @test_stride-1_4i32(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N:%.*]], 4
@@ -82,20 +80,18 @@ define void @test_stride-1_4i32(i32* readonly %data, i32* noalias nocapture %dst
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = mul nuw nsw i32 [[TMP0]], -1
 ; CHECK-NEXT:    [[TMP2:%.*]] = add nuw nsw i32 [[TMP1]], 2
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[DATA:%.*]], i32 [[TMP2]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP3]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 -3
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP6]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i32 [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 -3
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP5]], align 4
 ; CHECK-NEXT:    [[REVERSE:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:    [[TMP7:%.*]] = add nsw <4 x i32> <i32 5, i32 5, i32 5, i32 5>, [[REVERSE]]
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i32 [[TMP0]]
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP8]], i32 0
-; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>*
-; CHECK-NEXT:    store <4 x i32> [[TMP7]], <4 x i32>* [[TMP10]], align 4
+; CHECK-NEXT:    [[TMP6:%.*]] = add nsw <4 x i32> <i32 5, i32 5, i32 5, i32 5>, [[REVERSE]]
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i32 [[TMP0]]
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
+; CHECK-NEXT:    store <4 x i32> [[TMP6]], ptr [[TMP8]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[END:%.*]], label [[SCALAR_PH]]
@@ -106,11 +102,11 @@ define void @test_stride-1_4i32(i32* readonly %data, i32* noalias nocapture %dst
 ; CHECK-NEXT:    [[I_023:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; CHECK-NEXT:    [[MUL:%.*]] = mul nuw nsw i32 [[I_023]], -1
 ; CHECK-NEXT:    [[ADD5:%.*]] = add nuw nsw i32 [[MUL]], 2
-; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, i32* [[DATA]], i32 [[ADD5]]
-; CHECK-NEXT:    [[TMP12:%.*]] = load i32, i32* [[ARRAYIDX6]], align 4
-; CHECK-NEXT:    [[ADD7:%.*]] = add nsw i32 5, [[TMP12]]
-; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[DST]], i32 [[I_023]]
-; CHECK-NEXT:    store i32 [[ADD7]], i32* [[ARRAYIDX9]], align 4
+; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i32 [[ADD5]]
+; CHECK-NEXT:    [[TMP10:%.*]] = load i32, ptr [[ARRAYIDX6]], align 4
+; CHECK-NEXT:    [[ADD7:%.*]] = add nsw i32 5, [[TMP10]]
+; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[I_023]]
+; CHECK-NEXT:    store i32 [[ADD7]], ptr [[ARRAYIDX9]], align 4
 ; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[I_023]], 1
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
 ; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
@@ -123,11 +119,11 @@ for.body:                                         ; preds = %for.body.preheader,
   %i.023 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
   %mul = mul nuw nsw i32 %i.023, -1
   %add5 = add nuw nsw i32 %mul, 2
-  %arrayidx6 = getelementptr inbounds i32, i32* %data, i32 %add5
-  %0 = load i32, i32* %arrayidx6, align 4
+  %arrayidx6 = getelementptr inbounds i32, ptr %data, i32 %add5
+  %0 = load i32, ptr %arrayidx6, align 4
   %add7 = add nsw i32 5, %0
-  %arrayidx9 = getelementptr inbounds i32, i32* %dst, i32 %i.023
-  store i32 %add7, i32* %arrayidx9, align 4
+  %arrayidx9 = getelementptr inbounds i32, ptr %dst, i32 %i.023
+  store i32 %add7, ptr %arrayidx9, align 4
   %inc = add nuw nsw i32 %i.023, 1
   %exitcond.not = icmp eq i32 %inc, %n
   br i1 %exitcond.not, label %end, label %for.body
@@ -135,7 +131,7 @@ end:                                 ; preds = %end, %entry
   ret void
 }
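; A minimal sketch of the reverse-access idiom checked in @test_stride-1_4i32
; above: a stride of -1 is still consecutive, so the vectorizer loads from the
; lowest address (offset -3 from the lane-0 element) and reverses the lanes.
; %base is an illustrative stand-in for the lane-0 address.
;
;   %lo = getelementptr inbounds i32, ptr %base, i32 -3
;   %wide = load <4 x i32>, ptr %lo, align 4
;   %rev = shufflevector <4 x i32> %wide, <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>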
 
-define void @test_stride2_4i32(i32* readonly %data, i32* noalias nocapture %dst, i32 %n) {
+define void @test_stride2_4i32(ptr readonly %data, ptr noalias nocapture %dst, i32 %n) {
 ;
 ; CHECK-LABEL: @test_stride2_4i32(
 ; CHECK-NEXT:  entry:
@@ -152,19 +148,17 @@ define void @test_stride2_4i32(i32* readonly %data, i32* noalias nocapture %dst,
 ; CHECK-NEXT:    [[TMP2:%.*]] = add i32 [[INDEX]], 0
 ; CHECK-NEXT:    [[TMP3:%.*]] = mul nuw nsw i32 [[TMP2]], 2
 ; CHECK-NEXT:    [[TMP4:%.*]] = add nuw nsw i32 [[TMP3]], 2
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[DATA:%.*]], i32 [[TMP4]]
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i32 0
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <8 x i32>*
-; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i32>, <8 x i32>* [[TMP7]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i32 [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0
+; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i32>, ptr [[TMP6]], align 4
 ; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-; CHECK-NEXT:    [[TMP8:%.*]] = add nsw <4 x i32> <i32 5, i32 5, i32 5, i32 5>, [[STRIDED_VEC]]
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i32 [[TMP2]]
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP9]], i32 0
-; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <4 x i32>*
-; CHECK-NEXT:    store <4 x i32> [[TMP8]], <4 x i32>* [[TMP11]], align 4
+; CHECK-NEXT:    [[TMP7:%.*]] = add nsw <4 x i32> <i32 5, i32 5, i32 5, i32 5>, [[STRIDED_VEC]]
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i32 [[TMP2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0
+; CHECK-NEXT:    store <4 x i32> [[TMP7]], ptr [[TMP9]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -174,11 +168,11 @@ define void @test_stride2_4i32(i32* readonly %data, i32* noalias nocapture %dst,
 ; CHECK-NEXT:    [[I_023:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; CHECK-NEXT:    [[MUL:%.*]] = mul nuw nsw i32 [[I_023]], 2
 ; CHECK-NEXT:    [[ADD5:%.*]] = add nuw nsw i32 [[MUL]], 2
-; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, i32* [[DATA]], i32 [[ADD5]]
-; CHECK-NEXT:    [[TMP13:%.*]] = load i32, i32* [[ARRAYIDX6]], align 4
-; CHECK-NEXT:    [[ADD7:%.*]] = add nsw i32 5, [[TMP13]]
-; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[DST]], i32 [[I_023]]
-; CHECK-NEXT:    store i32 [[ADD7]], i32* [[ARRAYIDX9]], align 4
+; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i32 [[ADD5]]
+; CHECK-NEXT:    [[TMP11:%.*]] = load i32, ptr [[ARRAYIDX6]], align 4
+; CHECK-NEXT:    [[ADD7:%.*]] = add nsw i32 5, [[TMP11]]
+; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[I_023]]
+; CHECK-NEXT:    store i32 [[ADD7]], ptr [[ARRAYIDX9]], align 4
 ; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[I_023]], 1
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
 ; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
@@ -191,11 +185,11 @@ for.body:                                         ; preds = %for.body.preheader,
   %i.023 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
   %mul = mul nuw nsw i32 %i.023, 2
   %add5 = add nuw nsw i32 %mul, 2
-  %arrayidx6 = getelementptr inbounds i32, i32* %data, i32 %add5
-  %0 = load i32, i32* %arrayidx6, align 4
+  %arrayidx6 = getelementptr inbounds i32, ptr %data, i32 %add5
+  %0 = load i32, ptr %arrayidx6, align 4
   %add7 = add nsw i32 5, %0
-  %arrayidx9 = getelementptr inbounds i32, i32* %dst, i32 %i.023
-  store i32 %add7, i32* %arrayidx9, align 4
+  %arrayidx9 = getelementptr inbounds i32, ptr %dst, i32 %i.023
+  store i32 %add7, ptr %arrayidx9, align 4
   %inc = add nuw nsw i32 %i.023, 1
   %exitcond.not = icmp eq i32 %inc, %n
   br i1 %exitcond.not, label %end, label %for.body
@@ -203,7 +197,7 @@ end:                                 ; preds = %end, %entry
   ret void
 }
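; A minimal sketch of the interleaved-access idiom checked in
; @test_stride2_4i32 above: for a stride of 2 the vectorizer loads a
; double-width vector and keeps the even lanes. %base is an illustrative
; stand-in for the address of the first loaded element.
;
;   %wide = load <8 x i32>, ptr %base, align 4
;   %even = shufflevector <8 x i32> %wide, <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>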
 
-define void @test_stride3_4i32(i32* readonly %data, i32* noalias nocapture %dst, i32 %n) {
+define void @test_stride3_4i32(ptr readonly %data, ptr noalias nocapture %dst, i32 %n) {
 ; CHECK-LABEL: @test_stride3_4i32(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
@@ -219,17 +213,16 @@ define void @test_stride3_4i32(i32* readonly %data, i32* noalias nocapture %dst,
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[TMP0]], i32 [[N]])
 ; CHECK-NEXT:    [[TMP1:%.*]] = mul nuw nsw <4 x i32> [[VEC_IND]], <i32 3, i32 3, i32 3, i32 3>
 ; CHECK-NEXT:    [[TMP2:%.*]] = add nuw nsw <4 x i32> [[TMP1]], <i32 2, i32 2, i32 2, i32 2>
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[DATA:%.*]], <4 x i32> [[TMP2]]
-; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP3]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], <4 x i32> [[TMP2]]
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[TMP3]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
 ; CHECK-NEXT:    [[TMP4:%.*]] = add nsw <4 x i32> <i32 5, i32 5, i32 5, i32 5>, [[WIDE_MASKED_GATHER]]
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i32 [[TMP0]]
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i32 0
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
-; CHECK-NEXT:    call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> [[TMP4]], <4 x i32>* [[TMP7]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i32 [[TMP0]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0
+; CHECK-NEXT:    call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP4]], ptr [[TMP6]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]])
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -239,11 +232,11 @@ define void @test_stride3_4i32(i32* readonly %data, i32* noalias nocapture %dst,
 ; CHECK-NEXT:    [[I_023:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; CHECK-NEXT:    [[MUL:%.*]] = mul nuw nsw i32 [[I_023]], 3
 ; CHECK-NEXT:    [[ADD5:%.*]] = add nuw nsw i32 [[MUL]], 2
-; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, i32* [[DATA]], i32 [[ADD5]]
-; CHECK-NEXT:    [[TMP9:%.*]] = load i32, i32* [[ARRAYIDX6]], align 4
-; CHECK-NEXT:    [[ADD7:%.*]] = add nsw i32 5, [[TMP9]]
-; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[DST]], i32 [[I_023]]
-; CHECK-NEXT:    store i32 [[ADD7]], i32* [[ARRAYIDX9]], align 4
+; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i32 [[ADD5]]
+; CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr [[ARRAYIDX6]], align 4
+; CHECK-NEXT:    [[ADD7:%.*]] = add nsw i32 5, [[TMP8]]
+; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[I_023]]
+; CHECK-NEXT:    store i32 [[ADD7]], ptr [[ARRAYIDX9]], align 4
 ; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[I_023]], 1
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
 ; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[END]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
@@ -256,11 +249,11 @@ for.body:                                         ; preds = %for.body.preheader,
   %i.023 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
   %mul = mul nuw nsw i32 %i.023, 3
   %add5 = add nuw nsw i32 %mul, 2
-  %arrayidx6 = getelementptr inbounds i32, i32* %data, i32 %add5
-  %0 = load i32, i32* %arrayidx6, align 4
+  %arrayidx6 = getelementptr inbounds i32, ptr %data, i32 %add5
+  %0 = load i32, ptr %arrayidx6, align 4
   %add7 = add nsw i32 5, %0
-  %arrayidx9 = getelementptr inbounds i32, i32* %dst, i32 %i.023
-  store i32 %add7, i32* %arrayidx9, align 4
+  %arrayidx9 = getelementptr inbounds i32, ptr %dst, i32 %i.023
+  store i32 %add7, ptr %arrayidx9, align 4
   %inc = add nuw nsw i32 %i.023, 1
   %exitcond.not = icmp eq i32 %inc, %n
   br i1 %exitcond.not, label %end, label %for.body
@@ -268,7 +261,7 @@ end:                                 ; preds = %end, %entry
   ret void
 }
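; A minimal sketch of the gather idiom checked in @test_stride3_4i32 above: a
; stride the target cannot cover with contiguous loads becomes a vector GEP
; feeding a masked gather. %idx and %mask are illustrative stand-ins for the
; index vector and the lane mask.
;
;   %ptrs = getelementptr inbounds i32, ptr %data, <4 x i32> %idx
;   %vals = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> poison)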
 
-define void @test_stride4_4i32(i32* readonly %data, i32* noalias nocapture %dst, i32 %n) {
+define void @test_stride4_4i32(ptr readonly %data, ptr noalias nocapture %dst, i32 %n) {
 ; CHECK-LABEL: @test_stride4_4i32(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
@@ -284,17 +277,16 @@ define void @test_stride4_4i32(i32* readonly %data, i32* noalias nocapture %dst,
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[TMP0]], i32 [[N]])
 ; CHECK-NEXT:    [[TMP1:%.*]] = mul nuw nsw <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
 ; CHECK-NEXT:    [[TMP2:%.*]] = add nuw nsw <4 x i32> [[TMP1]], <i32 2, i32 2, i32 2, i32 2>
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[DATA:%.*]], <4 x i32> [[TMP2]]
-; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP3]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], <4 x i32> [[TMP2]]
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[TMP3]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
 ; CHECK-NEXT:    [[TMP4:%.*]] = add nsw <4 x i32> <i32 5, i32 5, i32 5, i32 5>, [[WIDE_MASKED_GATHER]]
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i32 [[TMP0]]
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i32 0
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
-; CHECK-NEXT:    call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> [[TMP4]], <4 x i32>* [[TMP7]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i32 [[TMP0]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0
+; CHECK-NEXT:    call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP4]], ptr [[TMP6]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]])
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -304,11 +296,11 @@ define void @test_stride4_4i32(i32* readonly %data, i32* noalias nocapture %dst,
 ; CHECK-NEXT:    [[I_023:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; CHECK-NEXT:    [[MUL:%.*]] = mul nuw nsw i32 [[I_023]], 4
 ; CHECK-NEXT:    [[ADD5:%.*]] = add nuw nsw i32 [[MUL]], 2
-; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, i32* [[DATA]], i32 [[ADD5]]
-; CHECK-NEXT:    [[TMP9:%.*]] = load i32, i32* [[ARRAYIDX6]], align 4
-; CHECK-NEXT:    [[ADD7:%.*]] = add nsw i32 5, [[TMP9]]
-; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[DST]], i32 [[I_023]]
-; CHECK-NEXT:    store i32 [[ADD7]], i32* [[ARRAYIDX9]], align 4
+; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i32 [[ADD5]]
+; CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr [[ARRAYIDX6]], align 4
+; CHECK-NEXT:    [[ADD7:%.*]] = add nsw i32 5, [[TMP8]]
+; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[I_023]]
+; CHECK-NEXT:    store i32 [[ADD7]], ptr [[ARRAYIDX9]], align 4
 ; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[I_023]], 1
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
 ; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[END]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
@@ -321,11 +313,11 @@ for.body:                                         ; preds = %for.body.preheader,
   %i.023 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
   %mul = mul nuw nsw i32 %i.023, 4
   %add5 = add nuw nsw i32 %mul, 2
-  %arrayidx6 = getelementptr inbounds i32, i32* %data, i32 %add5
-  %0 = load i32, i32* %arrayidx6, align 4
+  %arrayidx6 = getelementptr inbounds i32, ptr %data, i32 %add5
+  %0 = load i32, ptr %arrayidx6, align 4
   %add7 = add nsw i32 5, %0
-  %arrayidx9 = getelementptr inbounds i32, i32* %dst, i32 %i.023
-  store i32 %add7, i32* %arrayidx9, align 4
+  %arrayidx9 = getelementptr inbounds i32, ptr %dst, i32 %i.023
+  store i32 %add7, ptr %arrayidx9, align 4
   %inc = add nuw nsw i32 %i.023, 1
   %exitcond.not = icmp eq i32 %inc, %n
   br i1 %exitcond.not, label %end, label %for.body
@@ -333,7 +325,7 @@ end:                                 ; preds = %end, %entry
   ret void
 }
 
-define void @test_stride_loopinvar_4i32(i32* readonly %data, i32* noalias nocapture %dst, i32 %n, i32 %stride) {
+define void @test_stride_loopinvar_4i32(ptr readonly %data, ptr noalias nocapture %dst, i32 %n, i32 %stride) {
 ; CHECK-LABEL: @test_stride_loopinvar_4i32(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
@@ -351,18 +343,16 @@ define void @test_stride_loopinvar_4i32(i32* readonly %data, i32* noalias nocapt
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[TMP0]], i32 [[N]])
 ; CHECK-NEXT:    [[TMP1:%.*]] = mul nuw nsw i32 [[TMP0]], [[STRIDE]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = add nuw nsw i32 [[TMP1]], 2
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[DATA:%.*]], i32 [[TMP2]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP3]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP5]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
-; CHECK-NEXT:    [[TMP6:%.*]] = add nsw <4 x i32> <i32 5, i32 5, i32 5, i32 5>, [[WIDE_MASKED_LOAD]]
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i32 [[TMP0]]
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP7]], i32 0
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>*
-; CHECK-NEXT:    call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> [[TMP6]], <4 x i32>* [[TMP9]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i32 [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 0
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP4]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
+; CHECK-NEXT:    [[TMP5:%.*]] = add nsw <4 x i32> <i32 5, i32 5, i32 5, i32 5>, [[WIDE_MASKED_LOAD]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i32 [[TMP0]]
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0
+; CHECK-NEXT:    call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP5]], ptr [[TMP7]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]])
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -372,11 +362,11 @@ define void @test_stride_loopinvar_4i32(i32* readonly %data, i32* noalias nocapt
 ; CHECK-NEXT:    [[I_023:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; CHECK-NEXT:    [[MUL:%.*]] = mul nuw nsw i32 [[I_023]], [[STRIDE]]
 ; CHECK-NEXT:    [[ADD5:%.*]] = add nuw nsw i32 [[MUL]], 2
-; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, i32* [[DATA]], i32 [[ADD5]]
-; CHECK-NEXT:    [[TMP11:%.*]] = load i32, i32* [[ARRAYIDX6]], align 4
-; CHECK-NEXT:    [[ADD7:%.*]] = add nsw i32 5, [[TMP11]]
-; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[DST]], i32 [[I_023]]
-; CHECK-NEXT:    store i32 [[ADD7]], i32* [[ARRAYIDX9]], align 4
+; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i32 [[ADD5]]
+; CHECK-NEXT:    [[TMP9:%.*]] = load i32, ptr [[ARRAYIDX6]], align 4
+; CHECK-NEXT:    [[ADD7:%.*]] = add nsw i32 5, [[TMP9]]
+; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[I_023]]
+; CHECK-NEXT:    store i32 [[ADD7]], ptr [[ARRAYIDX9]], align 4
 ; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[I_023]], 1
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
 ; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[END]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
@@ -389,11 +379,11 @@ for.body:                                         ; preds = %for.body.preheader,
   %i.023 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
   %mul = mul nuw nsw i32 %i.023, %stride
   %add5 = add nuw nsw i32 %mul, 2
-  %arrayidx6 = getelementptr inbounds i32, i32* %data, i32 %add5
-  %0 = load i32, i32* %arrayidx6, align 4
+  %arrayidx6 = getelementptr inbounds i32, ptr %data, i32 %add5
+  %0 = load i32, ptr %arrayidx6, align 4
   %add7 = add nsw i32 5, %0
-  %arrayidx9 = getelementptr inbounds i32, i32* %dst, i32 %i.023
-  store i32 %add7, i32* %arrayidx9, align 4
+  %arrayidx9 = getelementptr inbounds i32, ptr %dst, i32 %i.023
+  store i32 %add7, ptr %arrayidx9, align 4
   %inc = add nuw nsw i32 %i.023, 1
   %exitcond.not = icmp eq i32 %inc, %n
   br i1 %exitcond.not, label %end, label %for.body
@@ -401,7 +391,7 @@ end:                                 ; preds = %end, %entry
   ret void
 }
 
-define void @test_stride_noninvar_4i32(i32* readonly %data, i32* noalias nocapture %dst, i32 %n) {
+define void @test_stride_noninvar_4i32(ptr readonly %data, ptr noalias nocapture %dst, i32 %n) {
 ; CHECK-LABEL: @test_stride_noninvar_4i32(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N:%.*]], 4
@@ -419,18 +409,17 @@ define void @test_stride_noninvar_4i32(i32* readonly %data, i32* noalias nocaptu
 ; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[INDEX]], 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = mul nuw nsw <4 x i32> [[VEC_IND]], [[VEC_IND2]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = add nuw nsw <4 x i32> [[TMP2]], <i32 2, i32 2, i32 2, i32 2>
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[DATA:%.*]], <4 x i32> [[TMP3]]
-; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP4]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> poison)
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], <4 x i32> [[TMP3]]
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[TMP4]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> poison)
 ; CHECK-NEXT:    [[TMP5:%.*]] = add nsw <4 x i32> <i32 5, i32 5, i32 5, i32 5>, [[WIDE_MASKED_GATHER]]
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i32 [[TMP1]]
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 0
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i32* [[TMP7]] to <4 x i32>*
-; CHECK-NEXT:    store <4 x i32> [[TMP5]], <4 x i32>* [[TMP8]], align 4
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i32 [[TMP1]]
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0
+; CHECK-NEXT:    store <4 x i32> [[TMP5]], ptr [[TMP7]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
 ; CHECK-NEXT:    [[VEC_IND_NEXT3]] = add <4 x i32> [[VEC_IND2]], <i32 32, i32 32, i32 32, i32 32>
-; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[END:%.*]], label [[SCALAR_PH]]
@@ -443,11 +432,11 @@ define void @test_stride_noninvar_4i32(i32* readonly %data, i32* noalias nocaptu
 ; CHECK-NEXT:    [[STRIDE:%.*]] = phi i32 [ [[NEXT_STRIDE:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ]
 ; CHECK-NEXT:    [[MUL:%.*]] = mul nuw nsw i32 [[I_023]], [[STRIDE]]
 ; CHECK-NEXT:    [[ADD5:%.*]] = add nuw nsw i32 [[MUL]], 2
-; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, i32* [[DATA]], i32 [[ADD5]]
-; CHECK-NEXT:    [[TMP10:%.*]] = load i32, i32* [[ARRAYIDX6]], align 4
-; CHECK-NEXT:    [[ADD7:%.*]] = add nsw i32 5, [[TMP10]]
-; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[DST]], i32 [[I_023]]
-; CHECK-NEXT:    store i32 [[ADD7]], i32* [[ARRAYIDX9]], align 4
+; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i32 [[ADD5]]
+; CHECK-NEXT:    [[TMP9:%.*]] = load i32, ptr [[ARRAYIDX6]], align 4
+; CHECK-NEXT:    [[ADD7:%.*]] = add nsw i32 5, [[TMP9]]
+; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[I_023]]
+; CHECK-NEXT:    store i32 [[ADD7]], ptr [[ARRAYIDX9]], align 4
 ; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[I_023]], 1
 ; CHECK-NEXT:    [[NEXT_STRIDE]] = add nuw nsw i32 [[STRIDE]], 8
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
@@ -462,11 +451,11 @@ for.body:                                         ; preds = %for.body.preheader,
   %stride = phi i32 [ %next.stride, %for.body ], [ 3, %entry ]
   %mul = mul nuw nsw i32 %i.023, %stride
   %add5 = add nuw nsw i32 %mul, 2
-  %arrayidx6 = getelementptr inbounds i32, i32* %data, i32 %add5
-  %0 = load i32, i32* %arrayidx6, align 4
+  %arrayidx6 = getelementptr inbounds i32, ptr %data, i32 %add5
+  %0 = load i32, ptr %arrayidx6, align 4
   %add7 = add nsw i32 5, %0
-  %arrayidx9 = getelementptr inbounds i32, i32* %dst, i32 %i.023
-  store i32 %add7, i32* %arrayidx9, align 4
+  %arrayidx9 = getelementptr inbounds i32, ptr %dst, i32 %i.023
+  store i32 %add7, ptr %arrayidx9, align 4
   %inc = add nuw nsw i32 %i.023, 1
   %next.stride = add nuw nsw i32 %stride, 8
   %exitcond.not = icmp eq i32 %inc, %n
@@ -475,7 +464,7 @@ end:                                 ; preds = %end, %entry
   ret void
 }
 
-define void @test_stride_noninvar2_4i32(i32* readonly %data, i32* noalias nocapture %dst, i32 %n) {
+define void @test_stride_noninvar2_4i32(ptr readonly %data, ptr noalias nocapture %dst, i32 %n) {
 ; CHECK-LABEL: @test_stride_noninvar2_4i32(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
@@ -484,11 +473,11 @@ define void @test_stride_noninvar2_4i32(i32* readonly %data, i32* noalias nocapt
 ; CHECK-NEXT:    [[STRIDE:%.*]] = phi i32 [ [[NEXT_STRIDE:%.*]], [[FOR_BODY]] ], [ 3, [[ENTRY]] ]
 ; CHECK-NEXT:    [[MUL:%.*]] = mul nuw nsw i32 [[I_023]], [[STRIDE]]
 ; CHECK-NEXT:    [[ADD5:%.*]] = add nuw nsw i32 [[MUL]], 2
-; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, i32* [[DATA:%.*]], i32 [[ADD5]]
-; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX6]], align 4
+; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i32 [[ADD5]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX6]], align 4
 ; CHECK-NEXT:    [[ADD7:%.*]] = add nsw i32 5, [[TMP0]]
-; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i32 [[I_023]]
-; CHECK-NEXT:    store i32 [[ADD7]], i32* [[ARRAYIDX9]], align 4
+; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i32 [[I_023]]
+; CHECK-NEXT:    store i32 [[ADD7]], ptr [[ARRAYIDX9]], align 4
 ; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[I_023]], 1
 ; CHECK-NEXT:    [[NEXT_STRIDE]] = mul nuw nsw i32 [[STRIDE]], 8
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[N:%.*]]
@@ -503,11 +492,11 @@ for.body:                                         ; preds = %for.body.preheader,
   %stride = phi i32 [ %next.stride, %for.body ], [ 3, %entry ]
   %mul = mul nuw nsw i32 %i.023, %stride
   %add5 = add nuw nsw i32 %mul, 2
-  %arrayidx6 = getelementptr inbounds i32, i32* %data, i32 %add5
-  %0 = load i32, i32* %arrayidx6, align 4
+  %arrayidx6 = getelementptr inbounds i32, ptr %data, i32 %add5
+  %0 = load i32, ptr %arrayidx6, align 4
   %add7 = add nsw i32 5, %0
-  %arrayidx9 = getelementptr inbounds i32, i32* %dst, i32 %i.023
-  store i32 %add7, i32* %arrayidx9, align 4
+  %arrayidx9 = getelementptr inbounds i32, ptr %dst, i32 %i.023
+  store i32 %add7, ptr %arrayidx9, align 4
   %inc = add nuw nsw i32 %i.023, 1
   %next.stride = mul nuw nsw i32 %stride, 8
   %exitcond.not = icmp eq i32 %inc, %n
@@ -516,7 +505,7 @@ end:                                 ; preds = %end, %entry
   ret void
 }
 
-define void @test_stride_noninvar3_4i32(i32* readonly %data, i32* noalias nocapture %dst, i32 %n, i32 %x) {
+define void @test_stride_noninvar3_4i32(ptr readonly %data, ptr noalias nocapture %dst, i32 %n, i32 %x) {
 ; CHECK-LABEL: @test_stride_noninvar3_4i32(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N:%.*]], 4
@@ -541,18 +530,17 @@ define void @test_stride_noninvar3_4i32(i32* readonly %data, i32* noalias nocapt
 ; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[INDEX]], 0
 ; CHECK-NEXT:    [[TMP4:%.*]] = mul nuw nsw <4 x i32> [[VEC_IND]], [[VEC_IND4]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = add nuw nsw <4 x i32> [[TMP4]], <i32 2, i32 2, i32 2, i32 2>
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[DATA:%.*]], <4 x i32> [[TMP5]]
-; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP6]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> poison)
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], <4 x i32> [[TMP5]]
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[TMP6]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> poison)
 ; CHECK-NEXT:    [[TMP7:%.*]] = add nsw <4 x i32> <i32 5, i32 5, i32 5, i32 5>, [[WIDE_MASKED_GATHER]]
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i32 [[TMP3]]
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP8]], i32 0
-; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>*
-; CHECK-NEXT:    store <4 x i32> [[TMP7]], <4 x i32>* [[TMP10]], align 4
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i32 [[TMP3]]
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0
+; CHECK-NEXT:    store <4 x i32> [[TMP7]], ptr [[TMP9]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
 ; CHECK-NEXT:    [[VEC_IND_NEXT5]] = add <4 x i32> [[VEC_IND4]], [[DOTSPLAT3]]
-; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[END:%.*]], label [[SCALAR_PH]]
@@ -565,11 +553,11 @@ define void @test_stride_noninvar3_4i32(i32* readonly %data, i32* noalias nocapt
 ; CHECK-NEXT:    [[STRIDE:%.*]] = phi i32 [ [[NEXT_STRIDE:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ]
 ; CHECK-NEXT:    [[MUL:%.*]] = mul nuw nsw i32 [[I_023]], [[STRIDE]]
 ; CHECK-NEXT:    [[ADD5:%.*]] = add nuw nsw i32 [[MUL]], 2
-; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, i32* [[DATA]], i32 [[ADD5]]
-; CHECK-NEXT:    [[TMP12:%.*]] = load i32, i32* [[ARRAYIDX6]], align 4
-; CHECK-NEXT:    [[ADD7:%.*]] = add nsw i32 5, [[TMP12]]
-; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[DST]], i32 [[I_023]]
-; CHECK-NEXT:    store i32 [[ADD7]], i32* [[ARRAYIDX9]], align 4
+; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i32 [[ADD5]]
+; CHECK-NEXT:    [[TMP11:%.*]] = load i32, ptr [[ARRAYIDX6]], align 4
+; CHECK-NEXT:    [[ADD7:%.*]] = add nsw i32 5, [[TMP11]]
+; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[I_023]]
+; CHECK-NEXT:    store i32 [[ADD7]], ptr [[ARRAYIDX9]], align 4
 ; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[I_023]], 1
 ; CHECK-NEXT:    [[NEXT_STRIDE]] = add nuw nsw i32 [[STRIDE]], [[X]]
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
@@ -584,11 +572,11 @@ for.body:                                         ; preds = %for.body.preheader,
   %stride = phi i32 [ %next.stride, %for.body ], [ 3, %entry ]
   %mul = mul nuw nsw i32 %i.023, %stride
   %add5 = add nuw nsw i32 %mul, 2
-  %arrayidx6 = getelementptr inbounds i32, i32* %data, i32 %add5
-  %0 = load i32, i32* %arrayidx6, align 4
+  %arrayidx6 = getelementptr inbounds i32, ptr %data, i32 %add5
+  %0 = load i32, ptr %arrayidx6, align 4
   %add7 = add nsw i32 5, %0
-  %arrayidx9 = getelementptr inbounds i32, i32* %dst, i32 %i.023
-  store i32 %add7, i32* %arrayidx9, align 4
+  %arrayidx9 = getelementptr inbounds i32, ptr %dst, i32 %i.023
+  store i32 %add7, ptr %arrayidx9, align 4
   %inc = add nuw nsw i32 %i.023, 1
   %next.stride = add nuw nsw i32 %stride, %x
   %exitcond.not = icmp eq i32 %inc, %n
@@ -598,7 +586,7 @@ end:                                 ; preds = %end, %entry
 }
 
 declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
-declare <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*>, i32, <4 x i1>, <4 x i32>)
+declare <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x i32>)
 declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32)
-declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32, <4 x i1>, <4 x i32>)
-declare void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32>, <4 x i32*>, i32, <4 x i1>)
+declare <4 x i32> @llvm.masked.load.v4i32.p0(ptr, i32, <4 x i1>, <4 x i32>)
+declare void @llvm.masked.scatter.v4i32.v4p0(<4 x i32>, <4 x ptr>, i32, <4 x i1>)

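The hunks above all apply one mechanical pattern: typed pointers such as i32* and <4 x i32>* become the opaque ptr, the bitcast that materialized the vector pointer for each wide load or store disappears, and pointer-parameterized intrinsics drop the pointee type from their mangled suffix (@llvm.masked.load.v4i32.p0v4i32 becomes @llvm.masked.load.v4i32.p0). Because an instruction is removed, the regenerated CHECK lines renumber the unnamed [[TMP...]] values downward. A minimal standalone sketch of the converted form, using illustrative names (%base, %idx, %mask) that do not come from any test above:

; Typed-pointer form (before):
;   %gep = getelementptr inbounds i32, i32* %base, i32 %idx
;   %vp  = bitcast i32* %gep to <4 x i32>*
;   %v   = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %vp, i32 4, <4 x i1> %mask, <4 x i32> poison)
; Opaque-pointer form (after): no bitcast, one fewer unnamed value.
define <4 x i32> @example(ptr %base, i32 %idx, <4 x i1> %mask) {
  %gep = getelementptr inbounds i32, ptr %base, i32 %idx
  %v = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %gep, i32 4, <4 x i1> %mask, <4 x i32> poison)
  ret <4 x i32> %v
}
declare <4 x i32> @llvm.masked.load.v4i32.p0(ptr, i32, <4 x i1>, <4 x i32>)
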
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-types.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-types.ll
index 74b47292b31d..3f1320d90c64 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-types.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-types.ll
@@ -4,7 +4,7 @@
 target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
 target triple = "thumbv8.1m.main-none-none-eabi"
 
-define i32 @mla_i32(i8* noalias nocapture readonly %A, i8* noalias nocapture readonly %B, i32 %n) #0 {
+define i32 @mla_i32(ptr noalias nocapture readonly %A, ptr noalias nocapture readonly %B, i32 %n) #0 {
 ; CHECK-LABEL: @mla_i32(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
@@ -18,48 +18,46 @@ define i32 @mla_i32(i8* noalias nocapture readonly %A, i8* noalias nocapture rea
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 [[TMP0]], i32 [[N]])
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, i8* [[A:%.*]], i32 [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <16 x i8>*
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP3]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison)
-; CHECK-NEXT:    [[TMP4:%.*]] = sext <16 x i8> [[WIDE_MASKED_LOAD]] to <16 x i32>
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[B:%.*]], i32 [[TMP0]]
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[TMP5]], i32 0
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i8* [[TMP6]] to <16 x i8>*
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD1:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP7]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison)
-; CHECK-NEXT:    [[TMP8:%.*]] = sext <16 x i8> [[WIDE_MASKED_LOAD1]] to <16 x i32>
-; CHECK-NEXT:    [[TMP9:%.*]] = mul nsw <16 x i32> [[TMP8]], [[TMP4]]
-; CHECK-NEXT:    [[TMP10:%.*]] = select <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i32> [[TMP9]], <16 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP10]])
-; CHECK-NEXT:    [[TMP12]] = add i32 [[TMP11]], [[VEC_PHI]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i32 [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP2]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison)
+; CHECK-NEXT:    [[TMP3:%.*]] = sext <16 x i8> [[WIDE_MASKED_LOAD]] to <16 x i32>
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[B:%.*]], i32 [[TMP0]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD1:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP5]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison)
+; CHECK-NEXT:    [[TMP6:%.*]] = sext <16 x i8> [[WIDE_MASKED_LOAD1]] to <16 x i32>
+; CHECK-NEXT:    [[TMP7:%.*]] = mul nsw <16 x i32> [[TMP6]], [[TMP3]]
+; CHECK-NEXT:    [[TMP8:%.*]] = select <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i32> [[TMP7]], <16 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP8]])
+; CHECK-NEXT:    [[TMP10]] = add i32 [[TMP9]], [[VEC_PHI]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 16
-; CHECK-NEXT:    [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[I_011:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; CHECK-NEXT:    [[RES_010:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[A]], i32 [[I_011]]
-; CHECK-NEXT:    [[TMP14:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
-; CHECK-NEXT:    [[CONV:%.*]] = sext i8 [[TMP14]] to i32
-; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, i8* [[B]], i32 [[I_011]]
-; CHECK-NEXT:    [[TMP15:%.*]] = load i8, i8* [[ARRAYIDX1]], align 1
-; CHECK-NEXT:    [[CONV2:%.*]] = sext i8 [[TMP15]] to i32
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i32 [[I_011]]
+; CHECK-NEXT:    [[TMP12:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[CONV:%.*]] = sext i8 [[TMP12]] to i32
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[B]], i32 [[I_011]]
+; CHECK-NEXT:    [[TMP13:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
+; CHECK-NEXT:    [[CONV2:%.*]] = sext i8 [[TMP13]] to i32
 ; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i32 [[CONV2]], [[CONV]]
 ; CHECK-NEXT:    [[ADD]] = add nsw i32 [[MUL]], [[RES_010]]
 ; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[I_011]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
 ; CHECK:       for.cond.cleanup.loopexit:
-; CHECK-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
 ; CHECK:       for.cond.cleanup:
 ; CHECK-NEXT:    [[RES_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
@@ -72,11 +70,11 @@ entry:
 for.body:                                         ; preds = %entry, %for.body
   %i.011 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
   %res.010 = phi i32 [ %add, %for.body ], [ 0, %entry ]
-  %arrayidx = getelementptr inbounds i8, i8* %A, i32 %i.011
-  %0 = load i8, i8* %arrayidx, align 1
+  %arrayidx = getelementptr inbounds i8, ptr %A, i32 %i.011
+  %0 = load i8, ptr %arrayidx, align 1
   %conv = sext i8 %0 to i32
-  %arrayidx1 = getelementptr inbounds i8, i8* %B, i32 %i.011
-  %1 = load i8, i8* %arrayidx1, align 1
+  %arrayidx1 = getelementptr inbounds i8, ptr %B, i32 %i.011
+  %1 = load i8, ptr %arrayidx1, align 1
   %conv2 = sext i8 %1 to i32
   %mul = mul nsw i32 %conv2, %conv
   %add = add nsw i32 %mul, %res.010
@@ -89,7 +87,7 @@ for.cond.cleanup:                                 ; preds = %for.body, %entry
   ret i32 %res.0.lcssa
 }
 
-define i32 @mla_i8(i8* noalias nocapture readonly %A, i8* noalias nocapture readonly %B, i32 %n) #0 {
+define i32 @mla_i8(ptr noalias nocapture readonly %A, ptr noalias nocapture readonly %B, i32 %n) #0 {
 ; CHECK-LABEL: @mla_i8(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
@@ -103,48 +101,46 @@ define i32 @mla_i8(i8* noalias nocapture readonly %A, i8* noalias nocapture read
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 [[TMP0]], i32 [[N]])
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, i8* [[A:%.*]], i32 [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <16 x i8>*
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP3]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison)
-; CHECK-NEXT:    [[TMP4:%.*]] = sext <16 x i8> [[WIDE_MASKED_LOAD]] to <16 x i32>
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[B:%.*]], i32 [[TMP0]]
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[TMP5]], i32 0
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i8* [[TMP6]] to <16 x i8>*
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD1:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP7]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison)
-; CHECK-NEXT:    [[TMP8:%.*]] = sext <16 x i8> [[WIDE_MASKED_LOAD1]] to <16 x i32>
-; CHECK-NEXT:    [[TMP9:%.*]] = mul nsw <16 x i32> [[TMP8]], [[TMP4]]
-; CHECK-NEXT:    [[TMP10:%.*]] = select <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i32> [[TMP9]], <16 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP10]])
-; CHECK-NEXT:    [[TMP12]] = add i32 [[TMP11]], [[VEC_PHI]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i32 [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP2]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison)
+; CHECK-NEXT:    [[TMP3:%.*]] = sext <16 x i8> [[WIDE_MASKED_LOAD]] to <16 x i32>
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[B:%.*]], i32 [[TMP0]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD1:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP5]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison)
+; CHECK-NEXT:    [[TMP6:%.*]] = sext <16 x i8> [[WIDE_MASKED_LOAD1]] to <16 x i32>
+; CHECK-NEXT:    [[TMP7:%.*]] = mul nsw <16 x i32> [[TMP6]], [[TMP3]]
+; CHECK-NEXT:    [[TMP8:%.*]] = select <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i32> [[TMP7]], <16 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP8]])
+; CHECK-NEXT:    [[TMP10]] = add i32 [[TMP9]], [[VEC_PHI]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 16
-; CHECK-NEXT:    [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[I_011:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; CHECK-NEXT:    [[RES_010:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[A]], i32 [[I_011]]
-; CHECK-NEXT:    [[TMP14:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
-; CHECK-NEXT:    [[CONV:%.*]] = sext i8 [[TMP14]] to i32
-; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, i8* [[B]], i32 [[I_011]]
-; CHECK-NEXT:    [[TMP15:%.*]] = load i8, i8* [[ARRAYIDX1]], align 1
-; CHECK-NEXT:    [[CONV2:%.*]] = sext i8 [[TMP15]] to i32
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i32 [[I_011]]
+; CHECK-NEXT:    [[TMP12:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[CONV:%.*]] = sext i8 [[TMP12]] to i32
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[B]], i32 [[I_011]]
+; CHECK-NEXT:    [[TMP13:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
+; CHECK-NEXT:    [[CONV2:%.*]] = sext i8 [[TMP13]] to i32
 ; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i32 [[CONV2]], [[CONV]]
 ; CHECK-NEXT:    [[ADD]] = add nsw i32 [[MUL]], [[RES_010]]
 ; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[I_011]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; CHECK:       for.cond.cleanup.loopexit:
-; CHECK-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
 ; CHECK:       for.cond.cleanup:
 ; CHECK-NEXT:    [[RES_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
@@ -157,11 +153,11 @@ entry:
 for.body:                                         ; preds = %entry, %for.body
   %i.011 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
   %res.010 = phi i32 [ %add, %for.body ], [ 0, %entry ]
-  %arrayidx = getelementptr inbounds i8, i8* %A, i32 %i.011
-  %0 = load i8, i8* %arrayidx, align 1
+  %arrayidx = getelementptr inbounds i8, ptr %A, i32 %i.011
+  %0 = load i8, ptr %arrayidx, align 1
   %conv = sext i8 %0 to i32
-  %arrayidx1 = getelementptr inbounds i8, i8* %B, i32 %i.011
-  %1 = load i8, i8* %arrayidx1, align 1
+  %arrayidx1 = getelementptr inbounds i8, ptr %B, i32 %i.011
+  %1 = load i8, ptr %arrayidx1, align 1
   %conv2 = sext i8 %1 to i32
   %mul = mul nsw i32 %conv2, %conv
   %add = add nsw i32 %mul, %res.010
@@ -174,7 +170,7 @@ for.cond.cleanup:                                 ; preds = %for.body, %entry
   ret i32 %res.0.lcssa
 }
 
-define i32 @add_i32(i32* nocapture readonly %x, i32 %n) #0 {
+define i32 @add_i32(ptr nocapture readonly %x, i32 %n) #0 {
 ; CHECK-LABEL: @add_i32(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0
@@ -188,36 +184,35 @@ define i32 @add_i32(i32* nocapture readonly %x, i32 %n) #0 {
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[TMP0]], i32 [[N]])
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[X:%.*]], i32 [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP3]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
-; CHECK-NEXT:    [[TMP4:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[WIDE_MASKED_LOAD]], <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP4]])
-; CHECK-NEXT:    [[TMP6]] = add i32 [[TMP5]], [[VEC_PHI]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[X:%.*]], i32 [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP2]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
+; CHECK-NEXT:    [[TMP3:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[WIDE_MASKED_LOAD]], <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP3]])
+; CHECK-NEXT:    [[TMP5]] = add i32 [[TMP4]], [[VEC_PHI]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[TMP5]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; CHECK-NEXT:    [[R_07:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[X]], i32 [[I_08]]
-; CHECK-NEXT:    [[TMP8:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[ADD]] = add nsw i32 [[TMP8]], [[R_07]]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[X]], i32 [[I_08]]
+; CHECK-NEXT:    [[TMP7:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[ADD]] = add nsw i32 [[TMP7]], [[R_07]]
 ; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[I_08]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
 ; CHECK:       for.cond.cleanup.loopexit:
-; CHECK-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP5]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
 ; CHECK:       for.cond.cleanup:
 ; CHECK-NEXT:    [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
@@ -230,8 +225,8 @@ entry:
 for.body:                                         ; preds = %entry, %for.body
   %i.08 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
   %r.07 = phi i32 [ %add, %for.body ], [ 0, %entry ]
-  %arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08
-  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx = getelementptr inbounds i32, ptr %x, i32 %i.08
+  %0 = load i32, ptr %arrayidx, align 4
   %add = add nsw i32 %0, %r.07
   %inc = add nuw nsw i32 %i.08, 1
   %exitcond = icmp eq i32 %inc, %n
@@ -242,7 +237,7 @@ for.cond.cleanup:                                 ; preds = %for.body, %entry
   ret i32 %r.0.lcssa
 }
 
-define i32 @mul_i32(i32* nocapture readonly %x, i32 %n) #0 {
+define i32 @mul_i32(ptr nocapture readonly %x, i32 %n) #0 {
 ; CHECK-LABEL: @mul_i32(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0
@@ -256,36 +251,35 @@ define i32 @mul_i32(i32* nocapture readonly %x, i32 %n) #0 {
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 1, i32 1, i32 1, i32 1>, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 1, i32 1, i32 1, i32 1>, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[TMP0]], i32 [[N]])
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[X:%.*]], i32 [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP3]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
-; CHECK-NEXT:    [[TMP4:%.*]] = mul <4 x i32> [[WIDE_MASKED_LOAD]], [[VEC_PHI]]
-; CHECK-NEXT:    [[TMP5]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[TMP4]], <4 x i32> [[VEC_PHI]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[X:%.*]], i32 [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP2]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
+; CHECK-NEXT:    [[TMP3:%.*]] = mul <4 x i32> [[WIDE_MASKED_LOAD]], [[VEC_PHI]]
+; CHECK-NEXT:    [[TMP4]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[TMP3]], <4 x i32> [[VEC_PHI]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP7:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP5]])
+; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP4]])
 ; CHECK-NEXT:    br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 1, [[FOR_BODY_PREHEADER]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 1, [[FOR_BODY_PREHEADER]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; CHECK-NEXT:    [[R_07:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[X]], i32 [[I_08]]
-; CHECK-NEXT:    [[TMP8:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[ADD]] = mul nsw i32 [[TMP8]], [[R_07]]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[X]], i32 [[I_08]]
+; CHECK-NEXT:    [[TMP7:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[ADD]] = mul nsw i32 [[TMP7]], [[R_07]]
 ; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[I_08]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
 ; CHECK:       for.cond.cleanup.loopexit:
-; CHECK-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
 ; CHECK:       for.cond.cleanup:
 ; CHECK-NEXT:    [[R_0_LCSSA:%.*]] = phi i32 [ 1, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
@@ -298,8 +292,8 @@ entry:
 for.body:                                         ; preds = %entry, %for.body
   %i.08 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
   %r.07 = phi i32 [ %add, %for.body ], [ 1, %entry ]
-  %arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08
-  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx = getelementptr inbounds i32, ptr %x, i32 %i.08
+  %0 = load i32, ptr %arrayidx, align 4
   %add = mul nsw i32 %0, %r.07
   %inc = add nuw nsw i32 %i.08, 1
   %exitcond = icmp eq i32 %inc, %n
@@ -310,7 +304,7 @@ for.cond.cleanup:                                 ; preds = %for.body, %entry
   ret i32 %r.0.lcssa
 }
 
-define i32 @and_i32(i32* nocapture readonly %x, i32 %n) #0 {
+define i32 @and_i32(ptr nocapture readonly %x, i32 %n) #0 {
 ; CHECK-LABEL: @and_i32(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0
@@ -324,36 +318,35 @@ define i32 @and_i32(i32* nocapture readonly %x, i32 %n) #0 {
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 -1, i32 -1, i32 -1, i32 -1>, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 -1, i32 -1, i32 -1, i32 -1>, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[TMP0]], i32 [[N]])
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[X:%.*]], i32 [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP3]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
-; CHECK-NEXT:    [[TMP4:%.*]] = and <4 x i32> [[WIDE_MASKED_LOAD]], [[VEC_PHI]]
-; CHECK-NEXT:    [[TMP5]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[TMP4]], <4 x i32> [[VEC_PHI]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[X:%.*]], i32 [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP2]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
+; CHECK-NEXT:    [[TMP3:%.*]] = and <4 x i32> [[WIDE_MASKED_LOAD]], [[VEC_PHI]]
+; CHECK-NEXT:    [[TMP4]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[TMP3]], <4 x i32> [[VEC_PHI]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP7:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[TMP5]])
+; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[TMP4]])
 ; CHECK-NEXT:    br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; CHECK-NEXT:    [[R_07:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[X]], i32 [[I_08]]
-; CHECK-NEXT:    [[TMP8:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[ADD]] = and i32 [[TMP8]], [[R_07]]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[X]], i32 [[I_08]]
+; CHECK-NEXT:    [[TMP7:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[ADD]] = and i32 [[TMP7]], [[R_07]]
 ; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[I_08]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
 ; CHECK:       for.cond.cleanup.loopexit:
-; CHECK-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
 ; CHECK:       for.cond.cleanup:
 ; CHECK-NEXT:    [[R_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
@@ -366,8 +359,8 @@ entry:
 for.body:                                         ; preds = %entry, %for.body
   %i.08 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
   %r.07 = phi i32 [ %add, %for.body ], [ 4294967295, %entry ]
-  %arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08
-  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx = getelementptr inbounds i32, ptr %x, i32 %i.08
+  %0 = load i32, ptr %arrayidx, align 4
   %add = and i32 %0, %r.07
   %inc = add nuw nsw i32 %i.08, 1
   %exitcond = icmp eq i32 %inc, %n
@@ -378,7 +371,7 @@ for.cond.cleanup:                                 ; preds = %for.body, %entry
   ret i32 %r.0.lcssa
 }
 
-define i32 @or_i32(i32* nocapture readonly %x, i32 %n) #0 {
+define i32 @or_i32(ptr nocapture readonly %x, i32 %n) #0 {
 ; CHECK-LABEL: @or_i32(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0
@@ -392,36 +385,35 @@ define i32 @or_i32(i32* nocapture readonly %x, i32 %n) #0 {
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[TMP0]], i32 [[N]])
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[X:%.*]], i32 [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP3]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
-; CHECK-NEXT:    [[TMP4:%.*]] = or <4 x i32> [[WIDE_MASKED_LOAD]], [[VEC_PHI]]
-; CHECK-NEXT:    [[TMP5]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[TMP4]], <4 x i32> [[VEC_PHI]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[X:%.*]], i32 [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP2]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
+; CHECK-NEXT:    [[TMP3:%.*]] = or <4 x i32> [[WIDE_MASKED_LOAD]], [[VEC_PHI]]
+; CHECK-NEXT:    [[TMP4]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[TMP3]], <4 x i32> [[VEC_PHI]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP7:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP5]])
+; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP4]])
 ; CHECK-NEXT:    br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; CHECK-NEXT:    [[R_07:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[X]], i32 [[I_08]]
-; CHECK-NEXT:    [[TMP8:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[ADD]] = or i32 [[TMP8]], [[R_07]]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[X]], i32 [[I_08]]
+; CHECK-NEXT:    [[TMP7:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[ADD]] = or i32 [[TMP7]], [[R_07]]
 ; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[I_08]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
 ; CHECK:       for.cond.cleanup.loopexit:
-; CHECK-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
 ; CHECK:       for.cond.cleanup:
 ; CHECK-NEXT:    [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
@@ -434,8 +426,8 @@ entry:
 for.body:                                         ; preds = %entry, %for.body
   %i.08 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
   %r.07 = phi i32 [ %add, %for.body ], [ 0, %entry ]
-  %arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08
-  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx = getelementptr inbounds i32, ptr %x, i32 %i.08
+  %0 = load i32, ptr %arrayidx, align 4
   %add = or i32 %0, %r.07
   %inc = add nuw nsw i32 %i.08, 1
   %exitcond = icmp eq i32 %inc, %n
@@ -446,7 +438,7 @@ for.cond.cleanup:                                 ; preds = %for.body, %entry
   ret i32 %r.0.lcssa
 }
 
-define i32 @xor_i32(i32* nocapture readonly %x, i32 %n) #0 {
+define i32 @xor_i32(ptr nocapture readonly %x, i32 %n) #0 {
 ; CHECK-LABEL: @xor_i32(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0
@@ -460,36 +452,35 @@ define i32 @xor_i32(i32* nocapture readonly %x, i32 %n) #0 {
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[TMP0]], i32 [[N]])
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[X:%.*]], i32 [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP3]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
-; CHECK-NEXT:    [[TMP4:%.*]] = xor <4 x i32> [[WIDE_MASKED_LOAD]], [[VEC_PHI]]
-; CHECK-NEXT:    [[TMP5]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[TMP4]], <4 x i32> [[VEC_PHI]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[X:%.*]], i32 [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP2]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
+; CHECK-NEXT:    [[TMP3:%.*]] = xor <4 x i32> [[WIDE_MASKED_LOAD]], [[VEC_PHI]]
+; CHECK-NEXT:    [[TMP4]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[TMP3]], <4 x i32> [[VEC_PHI]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP7:%.*]] = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> [[TMP5]])
+; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> [[TMP4]])
 ; CHECK-NEXT:    br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; CHECK-NEXT:    [[R_07:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[X]], i32 [[I_08]]
-; CHECK-NEXT:    [[TMP8:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[ADD]] = xor i32 [[TMP8]], [[R_07]]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[X]], i32 [[I_08]]
+; CHECK-NEXT:    [[TMP7:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[ADD]] = xor i32 [[TMP7]], [[R_07]]
 ; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[I_08]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
 ; CHECK:       for.cond.cleanup.loopexit:
-; CHECK-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
 ; CHECK:       for.cond.cleanup:
 ; CHECK-NEXT:    [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
@@ -502,8 +493,8 @@ entry:
 for.body:                                         ; preds = %entry, %for.body
   %i.08 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
   %r.07 = phi i32 [ %add, %for.body ], [ 0, %entry ]
-  %arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08
-  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx = getelementptr inbounds i32, ptr %x, i32 %i.08
+  %0 = load i32, ptr %arrayidx, align 4
   %add = xor i32 %0, %r.07
   %inc = add nuw nsw i32 %i.08, 1
   %exitcond = icmp eq i32 %inc, %n
@@ -514,7 +505,7 @@ for.cond.cleanup:                                 ; preds = %for.body, %entry
   ret i32 %r.0.lcssa
 }
 
-define float @fadd_f32(float* nocapture readonly %x, i32 %n) #0 {
+define float @fadd_f32(ptr nocapture readonly %x, i32 %n) #0 {
 ; CHECK-LABEL: @fadd_f32(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0
@@ -528,36 +519,35 @@ define float @fadd_f32(float* nocapture readonly %x, i32 %n) #0 {
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x float> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x float> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[TMP0]], i32 [[N]])
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i32 [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, float* [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float* [[TMP2]] to <4 x float>*
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* [[TMP3]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x float> poison)
-; CHECK-NEXT:    [[TMP4:%.*]] = fadd fast <4 x float> [[WIDE_MASKED_LOAD]], [[VEC_PHI]]
-; CHECK-NEXT:    [[TMP5]] = select fast <4 x i1> [[ACTIVE_LANE_MASK]], <4 x float> [[TMP4]], <4 x float> [[VEC_PHI]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds float, ptr [[X:%.*]], i32 [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[TMP2]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x float> poison)
+; CHECK-NEXT:    [[TMP3:%.*]] = fadd fast <4 x float> [[WIDE_MASKED_LOAD]], [[VEC_PHI]]
+; CHECK-NEXT:    [[TMP4]] = select fast <4 x i1> [[ACTIVE_LANE_MASK]], <4 x float> [[TMP3]], <4 x float> [[VEC_PHI]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP5]])
+; CHECK-NEXT:    [[TMP6:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP4]])
 ; CHECK-NEXT:    br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; CHECK-NEXT:    [[R_07:%.*]] = phi float [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[X]], i32 [[I_08]]
-; CHECK-NEXT:    [[TMP8:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[ADD]] = fadd fast float [[TMP8]], [[R_07]]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[X]], i32 [[I_08]]
+; CHECK-NEXT:    [[TMP7:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[ADD]] = fadd fast float [[TMP7]], [[R_07]]
 ; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[I_08]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
 ; CHECK:       for.cond.cleanup.loopexit:
-; CHECK-NEXT:    [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
 ; CHECK:       for.cond.cleanup:
 ; CHECK-NEXT:    [[R_0_LCSSA:%.*]] = phi float [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
@@ -570,8 +560,8 @@ entry:
 for.body:                                         ; preds = %entry, %for.body
   %i.08 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
   %r.07 = phi float [ %add, %for.body ], [ 0.0, %entry ]
-  %arrayidx = getelementptr inbounds float, float* %x, i32 %i.08
-  %0 = load float, float* %arrayidx, align 4
+  %arrayidx = getelementptr inbounds float, ptr %x, i32 %i.08
+  %0 = load float, ptr %arrayidx, align 4
   %add = fadd fast float %0, %r.07
   %inc = add nuw nsw i32 %i.08, 1
   %exitcond = icmp eq i32 %inc, %n
@@ -582,7 +572,7 @@ for.cond.cleanup:                                 ; preds = %for.body, %entry
   ret float %r.0.lcssa
 }
 
-define float @fmul_f32(float* nocapture readonly %x, i32 %n) #0 {
+define float @fmul_f32(ptr nocapture readonly %x, i32 %n) #0 {
 ; CHECK-LABEL: @fmul_f32(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0
@@ -596,36 +586,35 @@ define float @fmul_f32(float* nocapture readonly %x, i32 %n) #0 {
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x float> [ <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x float> [ <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[TMP0]], i32 [[N]])
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i32 [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, float* [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float* [[TMP2]] to <4 x float>*
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* [[TMP3]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x float> poison)
-; CHECK-NEXT:    [[TMP4:%.*]] = fmul fast <4 x float> [[WIDE_MASKED_LOAD]], [[VEC_PHI]]
-; CHECK-NEXT:    [[TMP5]] = select fast <4 x i1> [[ACTIVE_LANE_MASK]], <4 x float> [[TMP4]], <4 x float> [[VEC_PHI]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds float, ptr [[X:%.*]], i32 [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[TMP2]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x float> poison)
+; CHECK-NEXT:    [[TMP3:%.*]] = fmul fast <4 x float> [[WIDE_MASKED_LOAD]], [[VEC_PHI]]
+; CHECK-NEXT:    [[TMP4]] = select fast <4 x i1> [[ACTIVE_LANE_MASK]], <4 x float> [[TMP3]], <4 x float> [[VEC_PHI]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.vector.reduce.fmul.v4f32(float 1.000000e+00, <4 x float> [[TMP5]])
+; CHECK-NEXT:    [[TMP6:%.*]] = call fast float @llvm.vector.reduce.fmul.v4f32(float 1.000000e+00, <4 x float> [[TMP4]])
 ; CHECK-NEXT:    br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; CHECK-NEXT:    [[R_07:%.*]] = phi float [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[X]], i32 [[I_08]]
-; CHECK-NEXT:    [[TMP8:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[ADD]] = fmul fast float [[TMP8]], [[R_07]]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[X]], i32 [[I_08]]
+; CHECK-NEXT:    [[TMP7:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[ADD]] = fmul fast float [[TMP7]], [[R_07]]
 ; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[I_08]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
 ; CHECK:       for.cond.cleanup.loopexit:
-; CHECK-NEXT:    [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
 ; CHECK:       for.cond.cleanup:
 ; CHECK-NEXT:    [[R_0_LCSSA:%.*]] = phi float [ 1.000000e+00, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
@@ -638,8 +627,8 @@ entry:
 for.body:                                         ; preds = %entry, %for.body
   %i.08 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
   %r.07 = phi float [ %add, %for.body ], [ 1.0, %entry ]
-  %arrayidx = getelementptr inbounds float, float* %x, i32 %i.08
-  %0 = load float, float* %arrayidx, align 4
+  %arrayidx = getelementptr inbounds float, ptr %x, i32 %i.08
+  %0 = load float, ptr %arrayidx, align 4
   %add = fmul fast float %0, %r.07
   %inc = add nuw nsw i32 %i.08, 1
   %exitcond = icmp eq i32 %inc, %n
@@ -650,7 +639,7 @@ for.cond.cleanup:                                 ; preds = %for.body, %entry
   ret float %r.0.lcssa
 }
 
-define i32 @smin_i32(i32* nocapture readonly %x, i32 %n) #0 {
+define i32 @smin_i32(ptr nocapture readonly %x, i32 %n) #0 {
 ; CHECK-LABEL: @smin_i32(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0
@@ -664,37 +653,36 @@ define i32 @smin_i32(i32* nocapture readonly %x, i32 %n) #0 {
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647>, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647>, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[X:%.*]], i32 [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = icmp slt <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]]
-; CHECK-NEXT:    [[TMP5]] = select <4 x i1> [[TMP4]], <4 x i32> [[VEC_PHI]], <4 x i32> [[WIDE_LOAD]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[X:%.*]], i32 [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp slt <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]]
+; CHECK-NEXT:    [[TMP4]] = select <4 x i1> [[TMP3]], <4 x i32> [[VEC_PHI]], <4 x i32> [[WIDE_LOAD]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP7:%.*]] = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> [[TMP5]])
+; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> [[TMP4]])
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 2147483647, [[FOR_BODY_PREHEADER]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 2147483647, [[FOR_BODY_PREHEADER]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; CHECK-NEXT:    [[R_07:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[X]], i32 [[I_08]]
-; CHECK-NEXT:    [[TMP8:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[C:%.*]] = icmp slt i32 [[R_07]], [[TMP8]]
-; CHECK-NEXT:    [[ADD]] = select i1 [[C]], i32 [[R_07]], i32 [[TMP8]]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[X]], i32 [[I_08]]
+; CHECK-NEXT:    [[TMP7:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[C:%.*]] = icmp slt i32 [[R_07]], [[TMP7]]
+; CHECK-NEXT:    [[ADD]] = select i1 [[C]], i32 [[R_07]], i32 [[TMP7]]
 ; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[I_08]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
 ; CHECK:       for.cond.cleanup.loopexit:
-; CHECK-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
 ; CHECK:       for.cond.cleanup:
 ; CHECK-NEXT:    [[R_0_LCSSA:%.*]] = phi i32 [ 2147483647, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
@@ -707,8 +695,8 @@ entry:
 for.body:                                         ; preds = %entry, %for.body
   %i.08 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
   %r.07 = phi i32 [ %add, %for.body ], [ 2147483647, %entry ]
-  %arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08
-  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx = getelementptr inbounds i32, ptr %x, i32 %i.08
+  %0 = load i32, ptr %arrayidx, align 4
   %c = icmp slt i32 %r.07, %0
   %add = select i1 %c, i32 %r.07, i32 %0
   %inc = add nuw nsw i32 %i.08, 1
@@ -720,7 +708,7 @@ for.cond.cleanup:                                 ; preds = %for.body, %entry
   ret i32 %r.0.lcssa
 }
 
-define i32 @smax_i32(i32* nocapture readonly %x, i32 %n) #0 {
+define i32 @smax_i32(ptr nocapture readonly %x, i32 %n) #0 {
 ; CHECK-LABEL: @smax_i32(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0
@@ -734,37 +722,36 @@ define i32 @smax_i32(i32* nocapture readonly %x, i32 %n) #0 {
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[X:%.*]], i32 [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = icmp sgt <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]]
-; CHECK-NEXT:    [[TMP5]] = select <4 x i1> [[TMP4]], <4 x i32> [[VEC_PHI]], <4 x i32> [[WIDE_LOAD]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[X:%.*]], i32 [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp sgt <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]]
+; CHECK-NEXT:    [[TMP4]] = select <4 x i1> [[TMP3]], <4 x i32> [[VEC_PHI]], <4 x i32> [[WIDE_LOAD]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP7:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP5]])
+; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP4]])
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ -2147483648, [[FOR_BODY_PREHEADER]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ -2147483648, [[FOR_BODY_PREHEADER]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; CHECK-NEXT:    [[R_07:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[X]], i32 [[I_08]]
-; CHECK-NEXT:    [[TMP8:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[C:%.*]] = icmp sgt i32 [[R_07]], [[TMP8]]
-; CHECK-NEXT:    [[ADD]] = select i1 [[C]], i32 [[R_07]], i32 [[TMP8]]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[X]], i32 [[I_08]]
+; CHECK-NEXT:    [[TMP7:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[C:%.*]] = icmp sgt i32 [[R_07]], [[TMP7]]
+; CHECK-NEXT:    [[ADD]] = select i1 [[C]], i32 [[R_07]], i32 [[TMP7]]
 ; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[I_08]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
 ; CHECK:       for.cond.cleanup.loopexit:
-; CHECK-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
 ; CHECK:       for.cond.cleanup:
 ; CHECK-NEXT:    [[R_0_LCSSA:%.*]] = phi i32 [ -2147483648, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
@@ -777,8 +764,8 @@ entry:
 for.body:                                         ; preds = %entry, %for.body
   %i.08 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
   %r.07 = phi i32 [ %add, %for.body ], [ -2147483648, %entry ]
-  %arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08
-  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx = getelementptr inbounds i32, ptr %x, i32 %i.08
+  %0 = load i32, ptr %arrayidx, align 4
   %c = icmp sgt i32 %r.07, %0
   %add = select i1 %c, i32 %r.07, i32 %0
   %inc = add nuw nsw i32 %i.08, 1
@@ -790,7 +777,7 @@ for.cond.cleanup:                                 ; preds = %for.body, %entry
   ret i32 %r.0.lcssa
 }
 
-define i32 @umin_i32(i32* nocapture readonly %x, i32 %n) #0 {
+define i32 @umin_i32(ptr nocapture readonly %x, i32 %n) #0 {
 ; CHECK-LABEL: @umin_i32(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0
@@ -804,37 +791,36 @@ define i32 @umin_i32(i32* nocapture readonly %x, i32 %n) #0 {
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 -1, i32 -1, i32 -1, i32 -1>, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 -1, i32 -1, i32 -1, i32 -1>, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[X:%.*]], i32 [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = icmp ult <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]]
-; CHECK-NEXT:    [[TMP5]] = select <4 x i1> [[TMP4]], <4 x i32> [[VEC_PHI]], <4 x i32> [[WIDE_LOAD]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[X:%.*]], i32 [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ult <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]]
+; CHECK-NEXT:    [[TMP4]] = select <4 x i1> [[TMP3]], <4 x i32> [[VEC_PHI]], <4 x i32> [[WIDE_LOAD]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP7:%.*]] = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> [[TMP5]])
+; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> [[TMP4]])
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; CHECK-NEXT:    [[R_07:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[X]], i32 [[I_08]]
-; CHECK-NEXT:    [[TMP8:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[C:%.*]] = icmp ult i32 [[R_07]], [[TMP8]]
-; CHECK-NEXT:    [[ADD]] = select i1 [[C]], i32 [[R_07]], i32 [[TMP8]]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[X]], i32 [[I_08]]
+; CHECK-NEXT:    [[TMP7:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[C:%.*]] = icmp ult i32 [[R_07]], [[TMP7]]
+; CHECK-NEXT:    [[ADD]] = select i1 [[C]], i32 [[R_07]], i32 [[TMP7]]
 ; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[I_08]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]]
 ; CHECK:       for.cond.cleanup.loopexit:
-; CHECK-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
 ; CHECK:       for.cond.cleanup:
 ; CHECK-NEXT:    [[R_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
@@ -847,8 +833,8 @@ entry:
 for.body:                                         ; preds = %entry, %for.body
   %i.08 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
   %r.07 = phi i32 [ %add, %for.body ], [ 4294967295, %entry ]
-  %arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08
-  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx = getelementptr inbounds i32, ptr %x, i32 %i.08
+  %0 = load i32, ptr %arrayidx, align 4
   %c = icmp ult i32 %r.07, %0
   %add = select i1 %c, i32 %r.07, i32 %0
   %inc = add nuw nsw i32 %i.08, 1
@@ -860,7 +846,7 @@ for.cond.cleanup:                                 ; preds = %for.body, %entry
   ret i32 %r.0.lcssa
 }
 
-define i32 @umax_i32(i32* nocapture readonly %x, i32 %n) #0 {
+define i32 @umax_i32(ptr nocapture readonly %x, i32 %n) #0 {
 ; CHECK-LABEL: @umax_i32(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0
@@ -874,37 +860,36 @@ define i32 @umax_i32(i32* nocapture readonly %x, i32 %n) #0 {
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[X:%.*]], i32 [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = icmp ugt <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]]
-; CHECK-NEXT:    [[TMP5]] = select <4 x i1> [[TMP4]], <4 x i32> [[VEC_PHI]], <4 x i32> [[WIDE_LOAD]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[X:%.*]], i32 [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ugt <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]]
+; CHECK-NEXT:    [[TMP4]] = select <4 x i1> [[TMP3]], <4 x i32> [[VEC_PHI]], <4 x i32> [[WIDE_LOAD]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP7:%.*]] = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> [[TMP5]])
+; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> [[TMP4]])
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; CHECK-NEXT:    [[R_07:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[X]], i32 [[I_08]]
-; CHECK-NEXT:    [[TMP8:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[C:%.*]] = icmp ugt i32 [[R_07]], [[TMP8]]
-; CHECK-NEXT:    [[ADD]] = select i1 [[C]], i32 [[R_07]], i32 [[TMP8]]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[X]], i32 [[I_08]]
+; CHECK-NEXT:    [[TMP7:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[C:%.*]] = icmp ugt i32 [[R_07]], [[TMP7]]
+; CHECK-NEXT:    [[ADD]] = select i1 [[C]], i32 [[R_07]], i32 [[TMP7]]
 ; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[I_08]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP27:![0-9]+]]
 ; CHECK:       for.cond.cleanup.loopexit:
-; CHECK-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
 ; CHECK:       for.cond.cleanup:
 ; CHECK-NEXT:    [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
@@ -917,8 +902,8 @@ entry:
 for.body:                                         ; preds = %entry, %for.body
   %i.08 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
   %r.07 = phi i32 [ %add, %for.body ], [ 0, %entry ]
-  %arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08
-  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx = getelementptr inbounds i32, ptr %x, i32 %i.08
+  %0 = load i32, ptr %arrayidx, align 4
   %c = icmp ugt i32 %r.07, %0
   %add = select i1 %c, i32 %r.07, i32 %0
   %inc = add nuw nsw i32 %i.08, 1
@@ -930,7 +915,7 @@ for.cond.cleanup:                                 ; preds = %for.body, %entry
   ret i32 %r.0.lcssa
 }
 
-define float @fmin_f32(float* nocapture readonly %x, i32 %n) #0 {
+define float @fmin_f32(ptr nocapture readonly %x, i32 %n) #0 {
 ; CHECK-LABEL: @fmin_f32(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0
@@ -940,8 +925,8 @@ define float @fmin_f32(float* nocapture readonly %x, i32 %n) #0 {
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
 ; CHECK-NEXT:    [[R_07:%.*]] = phi float [ [[ADD:%.*]], [[FOR_BODY]] ], [ 1.000000e+03, [[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i32 [[I_08]]
-; CHECK-NEXT:    [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[X:%.*]], i32 [[I_08]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4
 ; CHECK-NEXT:    [[C:%.*]] = fcmp ult float [[R_07]], [[TMP0]]
 ; CHECK-NEXT:    [[ADD]] = select i1 [[C]], float [[R_07]], float [[TMP0]]
 ; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[I_08]], 1
@@ -961,8 +946,8 @@ entry:
 for.body:                                         ; preds = %entry, %for.body
   %i.08 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
   %r.07 = phi float [ %add, %for.body ], [ 1000.0, %entry ]
-  %arrayidx = getelementptr inbounds float, float* %x, i32 %i.08
-  %0 = load float, float* %arrayidx, align 4
+  %arrayidx = getelementptr inbounds float, ptr %x, i32 %i.08
+  %0 = load float, ptr %arrayidx, align 4
   %c = fcmp ult float %r.07, %0
   %add = select i1 %c, float %r.07, float %0
   %inc = add nuw nsw i32 %i.08, 1
@@ -974,7 +959,7 @@ for.cond.cleanup:                                 ; preds = %for.body, %entry
   ret float %r.0.lcssa
 }
 
-define float @fmax_f32(float* nocapture readonly %x, i32 %n) #0 {
+define float @fmax_f32(ptr nocapture readonly %x, i32 %n) #0 {
 ; CHECK-LABEL: @fmax_f32(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0
@@ -984,8 +969,8 @@ define float @fmax_f32(float* nocapture readonly %x, i32 %n) #0 {
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
 ; CHECK-NEXT:    [[R_07:%.*]] = phi float [ [[ADD:%.*]], [[FOR_BODY]] ], [ 0.000000e+00, [[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i32 [[I_08]]
-; CHECK-NEXT:    [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[X:%.*]], i32 [[I_08]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4
 ; CHECK-NEXT:    [[C:%.*]] = fcmp fast ugt float [[R_07]], [[TMP0]]
 ; CHECK-NEXT:    [[ADD]] = select i1 [[C]], float [[R_07]], float [[TMP0]]
 ; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[I_08]], 1
@@ -1005,8 +990,8 @@ entry:
 for.body:                                         ; preds = %entry, %for.body
   %i.08 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
   %r.07 = phi float [ %add, %for.body ], [ 0.0, %entry ]
-  %arrayidx = getelementptr inbounds float, float* %x, i32 %i.08
-  %0 = load float, float* %arrayidx, align 4
+  %arrayidx = getelementptr inbounds float, ptr %x, i32 %i.08
+  %0 = load float, ptr %arrayidx, align 4
   %c = fcmp fast ugt float %r.07, %0
   %add = select i1 %c, float %r.07, float %0
   %inc = add nuw nsw i32 %i.08, 1
@@ -1018,7 +1003,7 @@ for.cond.cleanup:                                 ; preds = %for.body, %entry
   ret float %r.0.lcssa
 }
 
-define i64 @loopinvariant_mla(i32* nocapture readonly %x, i32 %y, i32 %n) #0 {
+define i64 @loopinvariant_mla(ptr nocapture readonly %x, i32 %y, i32 %n) #0 {
 ; CHECK-LABEL: @loopinvariant_mla(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[CMP7:%.*]] = icmp sgt i32 [[N:%.*]], 0
@@ -1035,8 +1020,8 @@ define i64 @loopinvariant_mla(i32* nocapture readonly %x, i32 %y, i32 %n) #0 {
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[I_09:%.*]] = phi i32 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
 ; CHECK-NEXT:    [[S_08:%.*]] = phi i64 [ 0, [[FOR_BODY_LR_PH]] ], [ [[ADD]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[X:%.*]], i32 [[I_09]]
-; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[X:%.*]], i32 [[I_09]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
 ; CHECK-NEXT:    [[CONV:%.*]] = sext i32 [[TMP0]] to i64
 ; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i64 [[CONV]], [[CONV1]]
 ; CHECK-NEXT:    [[ADD]] = add nsw i64 [[MUL]], [[S_08]]
@@ -1059,8 +1044,8 @@ for.cond.cleanup:                                 ; preds = %for.body, %entry
 for.body:                                         ; preds = %for.body.lr.ph, %for.body
   %i.09 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
   %s.08 = phi i64 [ 0, %for.body.lr.ph ], [ %add, %for.body ]
-  %arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.09
-  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx = getelementptr inbounds i32, ptr %x, i32 %i.09
+  %0 = load i32, ptr %arrayidx, align 4
   %conv = sext i32 %0 to i64
   %mul = mul nsw i64 %conv, %conv1
   %add = add nsw i64 %mul, %s.08

diff --git a/llvm/test/Transforms/LoopVectorize/ARM/sphinx.ll b/llvm/test/Transforms/LoopVectorize/ARM/sphinx.ll
index bba064213b87..3ffd424f0cb8 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/sphinx.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/sphinx.ll
@@ -10,29 +10,29 @@ target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
 
 @a = external global i32
 @v = external global i32
-@mm = external global float**
-@vv = external global float**
-@ll = external global float*
+@mm = external global ptr
+@vv = external global ptr
+@ll = external global ptr
 
-define i32 @test(float* nocapture readonly %x) {
+define i32 @test(ptr nocapture readonly %x) {
 ; CHECK-LABEL: @test(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[T:%.*]] = load i32, i32* @v, align 8
-; CHECK-NEXT:    [[T1:%.*]] = load i32, i32* @a, align 4
+; CHECK-NEXT:    [[T:%.*]] = load i32, ptr @v, align 8
+; CHECK-NEXT:    [[T1:%.*]] = load i32, ptr @a, align 4
 ; CHECK-NEXT:    br label [[OUTERLOOP:%.*]]
 ; CHECK:       outerloop:
 ; CHECK-NEXT:    [[T2:%.*]] = phi i32 [ [[V17:%.*]], [[OUTEREND:%.*]] ], [ [[T1]], [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    [[J_0136:%.*]] = phi i32 [ [[INC144:%.*]], [[OUTEREND]] ], [ 0, [[ENTRY]] ]
 ; CHECK-NEXT:    [[SCORE_1135:%.*]] = phi i32 [ [[CALL142:%.*]], [[OUTEREND]] ], [ -939524096, [[ENTRY]] ]
-; CHECK-NEXT:    [[T3:%.*]] = load float**, float*** @mm, align 4
-; CHECK-NEXT:    [[ARRAYIDX109:%.*]] = getelementptr inbounds float*, float** [[T3]], i32 [[T2]]
-; CHECK-NEXT:    [[T4:%.*]] = load float*, float** [[ARRAYIDX109]], align 4
-; CHECK-NEXT:    [[T5:%.*]] = load float**, float*** @vv, align 4
-; CHECK-NEXT:    [[ARRAYIDX111:%.*]] = getelementptr inbounds float*, float** [[T5]], i32 [[T2]]
-; CHECK-NEXT:    [[T6:%.*]] = load float*, float** [[ARRAYIDX111]], align 4
-; CHECK-NEXT:    [[T7:%.*]] = load float*, float** @ll, align 4
-; CHECK-NEXT:    [[ARRAYIDX113:%.*]] = getelementptr inbounds float, float* [[T7]], i32 [[T2]]
-; CHECK-NEXT:    [[T8:%.*]] = load float, float* [[ARRAYIDX113]], align 4
+; CHECK-NEXT:    [[T3:%.*]] = load ptr, ptr @mm, align 4
+; CHECK-NEXT:    [[ARRAYIDX109:%.*]] = getelementptr inbounds ptr, ptr [[T3]], i32 [[T2]]
+; CHECK-NEXT:    [[T4:%.*]] = load ptr, ptr [[ARRAYIDX109]], align 4
+; CHECK-NEXT:    [[T5:%.*]] = load ptr, ptr @vv, align 4
+; CHECK-NEXT:    [[ARRAYIDX111:%.*]] = getelementptr inbounds ptr, ptr [[T5]], i32 [[T2]]
+; CHECK-NEXT:    [[T6:%.*]] = load ptr, ptr [[ARRAYIDX111]], align 4
+; CHECK-NEXT:    [[T7:%.*]] = load ptr, ptr @ll, align 4
+; CHECK-NEXT:    [[ARRAYIDX113:%.*]] = getelementptr inbounds float, ptr [[T7]], i32 [[T2]]
+; CHECK-NEXT:    [[T8:%.*]] = load float, ptr [[ARRAYIDX113]], align 4
 ; CHECK-NEXT:    [[CONV114:%.*]] = fpext float [[T8]] to double
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[T]], 2
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
@@ -43,100 +43,97 @@ define i32 @test(float* nocapture readonly %x) {
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <2 x double> [ [[TMP0]], [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <2 x double> [ [[TMP0]], [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i32 [[TMP1]]
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast float* [[TMP3]] to <2 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x float>, <2 x float>* [[TMP4]], align 4
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, float* [[T4]], i32 [[TMP1]]
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP5]], i32 0
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast float* [[TMP6]] to <2 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <2 x float>, <2 x float>* [[TMP7]], align 4
-; CHECK-NEXT:    [[TMP8:%.*]] = fsub fast <2 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]]
-; CHECK-NEXT:    [[TMP9:%.*]] = fpext <2 x float> [[TMP8]] to <2 x double>
-; CHECK-NEXT:    [[TMP10:%.*]] = fmul fast <2 x double> [[TMP9]], [[TMP9]]
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds float, float* [[T6]], i32 [[TMP1]]
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP11]], i32 0
-; CHECK-NEXT:    [[TMP13:%.*]] = bitcast float* [[TMP12]] to <2 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <2 x float>, <2 x float>* [[TMP13]], align 4
-; CHECK-NEXT:    [[TMP14:%.*]] = fpext <2 x float> [[WIDE_LOAD2]] to <2 x double>
-; CHECK-NEXT:    [[TMP15:%.*]] = fmul fast <2 x double> [[TMP10]], [[TMP14]]
-; CHECK-NEXT:    [[TMP16]] = fsub fast <2 x double> [[VEC_PHI]], [[TMP15]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, ptr [[X:%.*]], i32 [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x float>, ptr [[TMP3]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, ptr [[T4]], i32 [[TMP1]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP4]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <2 x float>, ptr [[TMP5]], align 4
+; CHECK-NEXT:    [[TMP6:%.*]] = fsub fast <2 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]]
+; CHECK-NEXT:    [[TMP7:%.*]] = fpext <2 x float> [[TMP6]] to <2 x double>
+; CHECK-NEXT:    [[TMP8:%.*]] = fmul fast <2 x double> [[TMP7]], [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds float, ptr [[T6]], i32 [[TMP1]]
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, ptr [[TMP9]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <2 x float>, ptr [[TMP10]], align 4
+; CHECK-NEXT:    [[TMP11:%.*]] = fpext <2 x float> [[WIDE_LOAD2]] to <2 x double>
+; CHECK-NEXT:    [[TMP12:%.*]] = fmul fast <2 x double> [[TMP8]], [[TMP11]]
+; CHECK-NEXT:    [[TMP13]] = fsub fast <2 x double> [[VEC_PHI]], [[TMP12]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP17:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]]
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP18:%.*]] = call fast double @llvm.vector.reduce.fadd.v2f64(double -0.000000e+00, <2 x double> [[TMP16]])
+; CHECK-NEXT:    [[TMP15:%.*]] = call fast double @llvm.vector.reduce.fadd.v2f64(double -0.000000e+00, <2 x double> [[TMP13]])
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[T]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[OUTEREND]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[OUTERLOOP]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi double [ [[CONV114]], [[OUTERLOOP]] ], [ [[TMP18]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi double [ [[CONV114]], [[OUTERLOOP]] ], [ [[TMP15]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[INNERLOOP:%.*]]
 ; CHECK:       innerloop:
 ; CHECK-NEXT:    [[I_2132:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC129:%.*]], [[INNERLOOP]] ]
 ; CHECK-NEXT:    [[DVAL1_4131:%.*]] = phi double [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SUB127:%.*]], [[INNERLOOP]] ]
-; CHECK-NEXT:    [[ARRAYIDX119:%.*]] = getelementptr inbounds float, float* [[X]], i32 [[I_2132]]
-; CHECK-NEXT:    [[T9:%.*]] = load float, float* [[ARRAYIDX119]], align 4
-; CHECK-NEXT:    [[ARRAYIDX120:%.*]] = getelementptr inbounds float, float* [[T4]], i32 [[I_2132]]
-; CHECK-NEXT:    [[T10:%.*]] = load float, float* [[ARRAYIDX120]], align 4
+; CHECK-NEXT:    [[ARRAYIDX119:%.*]] = getelementptr inbounds float, ptr [[X]], i32 [[I_2132]]
+; CHECK-NEXT:    [[T9:%.*]] = load float, ptr [[ARRAYIDX119]], align 4
+; CHECK-NEXT:    [[ARRAYIDX120:%.*]] = getelementptr inbounds float, ptr [[T4]], i32 [[I_2132]]
+; CHECK-NEXT:    [[T10:%.*]] = load float, ptr [[ARRAYIDX120]], align 4
 ; CHECK-NEXT:    [[SUB121:%.*]] = fsub fast float [[T9]], [[T10]]
 ; CHECK-NEXT:    [[CONV122:%.*]] = fpext float [[SUB121]] to double
 ; CHECK-NEXT:    [[MUL123:%.*]] = fmul fast double [[CONV122]], [[CONV122]]
-; CHECK-NEXT:    [[ARRAYIDX124:%.*]] = getelementptr inbounds float, float* [[T6]], i32 [[I_2132]]
-; CHECK-NEXT:    [[T11:%.*]] = load float, float* [[ARRAYIDX124]], align 4
+; CHECK-NEXT:    [[ARRAYIDX124:%.*]] = getelementptr inbounds float, ptr [[T6]], i32 [[I_2132]]
+; CHECK-NEXT:    [[T11:%.*]] = load float, ptr [[ARRAYIDX124]], align 4
 ; CHECK-NEXT:    [[CONV125:%.*]] = fpext float [[T11]] to double
 ; CHECK-NEXT:    [[MUL126:%.*]] = fmul fast double [[MUL123]], [[CONV125]]
 ; CHECK-NEXT:    [[SUB127]] = fsub fast double [[DVAL1_4131]], [[MUL126]]
 ; CHECK-NEXT:    [[INC129]] = add nuw nsw i32 [[I_2132]], 1
 ; CHECK-NEXT:    [[EXITCOND143:%.*]] = icmp eq i32 [[INC129]], [[T]]
-; CHECK-NEXT:    br i1 [[EXITCOND143]], label [[OUTEREND]], label [[INNERLOOP]], [[LOOP2:!llvm.loop !.*]]
+; CHECK-NEXT:    br i1 [[EXITCOND143]], label [[OUTEREND]], label [[INNERLOOP]], !llvm.loop [[LOOP2:![0-9]+]]
 ; CHECK:       outerend:
-; CHECK-NEXT:    [[SUB127_LCSSA:%.*]] = phi double [ [[SUB127]], [[INNERLOOP]] ], [ [[TMP18]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[SUB127_LCSSA:%.*]] = phi double [ [[SUB127]], [[INNERLOOP]] ], [ [[TMP15]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    [[CONV138:%.*]] = fptosi double [[SUB127_LCSSA]] to i32
 ; CHECK-NEXT:    [[CALL142]] = add nuw nsw i32 [[SCORE_1135]], [[CONV138]]
 ; CHECK-NEXT:    [[INC144]] = add nuw nsw i32 [[J_0136]], 1
-; CHECK-NEXT:    [[ARRAYIDX102:%.*]] = getelementptr inbounds i32, i32* @a, i32 [[INC144]]
-; CHECK-NEXT:    [[V17]] = load i32, i32* [[ARRAYIDX102]], align 4
+; CHECK-NEXT:    [[ARRAYIDX102:%.*]] = getelementptr inbounds i32, ptr @a, i32 [[INC144]]
+; CHECK-NEXT:    [[V17]] = load i32, ptr [[ARRAYIDX102]], align 4
 ; CHECK-NEXT:    [[CMP103:%.*]] = icmp sgt i32 [[V17]], -1
 ; CHECK-NEXT:    br i1 [[CMP103]], label [[OUTERLOOP]], label [[EXIT:%.*]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret i32 [[CALL142]]
 ;
 entry:
-  %t = load i32, i32* @v, align 8
-  %t1 = load i32, i32* @a, align 4
+  %t = load i32, ptr @v, align 8
+  %t1 = load i32, ptr @a, align 4
   br label %outerloop
 
 outerloop:
   %t2 = phi i32 [ %v17, %outerend ], [ %t1, %entry ]
   %j.0136 = phi i32 [ %inc144, %outerend ], [ 0, %entry ]
   %score.1135 = phi i32 [ %call142, %outerend ], [ -939524096, %entry ]
-  %t3 = load float**, float*** @mm, align 4
-  %arrayidx109 = getelementptr inbounds float*, float** %t3, i32 %t2
-  %t4 = load float*, float** %arrayidx109, align 4
-  %t5 = load float**, float*** @vv, align 4
-  %arrayidx111 = getelementptr inbounds float*, float** %t5, i32 %t2
-  %t6 = load float*, float** %arrayidx111, align 4
-  %t7 = load float*, float** @ll, align 4
-  %arrayidx113 = getelementptr inbounds float, float* %t7, i32 %t2
-  %t8 = load float, float* %arrayidx113, align 4
+  %t3 = load ptr, ptr @mm, align 4
+  %arrayidx109 = getelementptr inbounds ptr, ptr %t3, i32 %t2
+  %t4 = load ptr, ptr %arrayidx109, align 4
+  %t5 = load ptr, ptr @vv, align 4
+  %arrayidx111 = getelementptr inbounds ptr, ptr %t5, i32 %t2
+  %t6 = load ptr, ptr %arrayidx111, align 4
+  %t7 = load ptr, ptr @ll, align 4
+  %arrayidx113 = getelementptr inbounds float, ptr %t7, i32 %t2
+  %t8 = load float, ptr %arrayidx113, align 4
   %conv114 = fpext float %t8 to double
   br label %innerloop
 
 innerloop:
   %i.2132 = phi i32 [ 0, %outerloop ], [ %inc129, %innerloop ]
   %dval1.4131 = phi double [ %conv114, %outerloop ], [ %sub127, %innerloop ]
-  %arrayidx119 = getelementptr inbounds float, float* %x, i32 %i.2132
-  %t9 = load float, float* %arrayidx119, align 4
-  %arrayidx120 = getelementptr inbounds float, float* %t4, i32 %i.2132
-  %t10 = load float, float* %arrayidx120, align 4
+  %arrayidx119 = getelementptr inbounds float, ptr %x, i32 %i.2132
+  %t9 = load float, ptr %arrayidx119, align 4
+  %arrayidx120 = getelementptr inbounds float, ptr %t4, i32 %i.2132
+  %t10 = load float, ptr %arrayidx120, align 4
   %sub121 = fsub fast float %t9, %t10
   %conv122 = fpext float %sub121 to double
   %mul123 = fmul fast double %conv122, %conv122
-  %arrayidx124 = getelementptr inbounds float, float* %t6, i32 %i.2132
-  %t11 = load float, float* %arrayidx124, align 4
+  %arrayidx124 = getelementptr inbounds float, ptr %t6, i32 %i.2132
+  %t11 = load float, ptr %arrayidx124, align 4
   %conv125 = fpext float %t11 to double
   %mul126 = fmul fast double %mul123, %conv125
   %sub127 = fsub fast double %dval1.4131, %mul126
@@ -149,8 +146,8 @@ outerend:
   %conv138 = fptosi double %sub127.lcssa to i32
   %call142 = add nuw nsw i32 %score.1135, %conv138
   %inc144 = add nuw nsw i32 %j.0136, 1
-  %arrayidx102 = getelementptr inbounds i32, i32* @a, i32 %inc144
-  %v17 = load i32, i32* %arrayidx102, align 4
+  %arrayidx102 = getelementptr inbounds i32, ptr @a, i32 %inc144
+  %v17 = load i32, ptr %arrayidx102, align 4
   %cmp103 = icmp sgt i32 %v17, -1
   br i1 %cmp103, label %outerloop, label %exit
 

diff --git a/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-not-allowed.ll b/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-not-allowed.ll
index 77a154e629fb..6d3ffab0a598 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-not-allowed.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-not-allowed.ll
@@ -5,7 +5,7 @@
 
 target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
 
-define void @trunc_not_allowed_different_vec_elemns(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i16* noalias nocapture %D) #0 {
+define void @trunc_not_allowed_different_vec_elemns(ptr noalias nocapture %A, ptr noalias nocapture readonly %B, ptr noalias nocapture readonly %C, ptr noalias nocapture %D) #0 {
 ; CHECK-LABEL: @trunc_not_allowed_different_vec_elemns(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
@@ -14,28 +14,24 @@ define void @trunc_not_allowed_different_vec_elemns(i32* noalias nocapture %A, i
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i32 [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i32 [[TMP0]]
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP6]], align 4
-; CHECK-NEXT:    [[TMP7:%.*]] = add nsw <4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]]
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[TMP0]]
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP8]], i32 0
-; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>*
-; CHECK-NEXT:    store <4 x i32> [[TMP7]], <4 x i32>* [[TMP10]], align 4
-; CHECK-NEXT:    [[TMP11:%.*]] = trunc <4 x i32> [[TMP7]] to <4 x i16>
-; CHECK-NEXT:    [[TMP12:%.*]] = shl <4 x i16> [[TMP11]], <i16 1, i16 1, i16 1, i16 1>
-; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i16, i16* [[D:%.*]], i32 [[TMP0]]
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i16, i16* [[TMP13]], i32 0
-; CHECK-NEXT:    [[TMP15:%.*]] = bitcast i16* [[TMP14]] to <4 x i16>*
-; CHECK-NEXT:    store <4 x i16> [[TMP12]], <4 x i16>* [[TMP15]], align 2
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i32 [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[C:%.*]], i32 [[TMP0]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[TMP0]]
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0
+; CHECK-NEXT:    store <4 x i32> [[TMP5]], ptr [[TMP7]], align 4
+; CHECK-NEXT:    [[TMP8:%.*]] = trunc <4 x i32> [[TMP5]] to <4 x i16>
+; CHECK-NEXT:    [[TMP9:%.*]] = shl <4 x i16> [[TMP8]], <i16 1, i16 1, i16 1, i16 1>
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i16, ptr [[D:%.*]], i32 [[TMP0]]
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i16, ptr [[TMP10]], i32 0
+; CHECK-NEXT:    store <4 x i16> [[TMP9]], ptr [[TMP11]], align 2
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP16:%.*]] = icmp eq i32 [[INDEX_NEXT]], 428
-; CHECK-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], 428
+; CHECK-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 431, 428
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
@@ -46,17 +42,17 @@ define void @trunc_not_allowed_different_vec_elemns(i32* noalias nocapture %A, i
 ; CHECK-NEXT:    ret void
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[I_021:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[ADD9:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 [[I_021]]
-; CHECK-NEXT:    [[TMP17:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[C]], i32 [[I_021]]
-; CHECK-NEXT:    [[TMP18:%.*]] = load i32, i32* [[ARRAYIDX1]], align 4
-; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP18]], [[TMP17]]
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[I_021]]
-; CHECK-NEXT:    store i32 [[ADD]], i32* [[ARRAYIDX2]], align 4
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[I_021]]
+; CHECK-NEXT:    [[TMP13:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[C]], i32 [[I_021]]
+; CHECK-NEXT:    [[TMP14:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP14]], [[TMP13]]
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[I_021]]
+; CHECK-NEXT:    store i32 [[ADD]], ptr [[ARRAYIDX2]], align 4
 ; CHECK-NEXT:    [[ADD_TR:%.*]] = trunc i32 [[ADD]] to i16
 ; CHECK-NEXT:    [[CONV7:%.*]] = shl i16 [[ADD_TR]], 1
-; CHECK-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds i16, i16* [[D]], i32 [[I_021]]
-; CHECK-NEXT:    store i16 [[CONV7]], i16* [[ARRAYIDX8]], align 2
+; CHECK-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds i16, ptr [[D]], i32 [[I_021]]
+; CHECK-NEXT:    store i16 [[CONV7]], ptr [[ARRAYIDX8]], align 2
 ; CHECK-NEXT:    [[ADD9]] = add nuw nsw i32 [[I_021]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[ADD9]], 431
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
@@ -69,23 +65,23 @@ for.cond.cleanup:
 
 for.body:
   %i.021 = phi i32 [ 0, %entry ], [ %add9, %for.body ]
-  %arrayidx = getelementptr inbounds i32, i32* %B, i32 %i.021
-  %0 = load i32, i32* %arrayidx, align 4
-  %arrayidx1 = getelementptr inbounds i32, i32* %C, i32 %i.021
-  %1 = load i32, i32* %arrayidx1, align 4
+  %arrayidx = getelementptr inbounds i32, ptr %B, i32 %i.021
+  %0 = load i32, ptr %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds i32, ptr %C, i32 %i.021
+  %1 = load i32, ptr %arrayidx1, align 4
   %add = add nsw i32 %1, %0
-  %arrayidx2 = getelementptr inbounds i32, i32* %A, i32 %i.021
-  store i32 %add, i32* %arrayidx2, align 4
+  %arrayidx2 = getelementptr inbounds i32, ptr %A, i32 %i.021
+  store i32 %add, ptr %arrayidx2, align 4
   %add.tr = trunc i32 %add to i16
   %conv7 = shl i16 %add.tr, 1
-  %arrayidx8 = getelementptr inbounds i16, i16* %D, i32 %i.021
-  store i16 %conv7, i16* %arrayidx8, align 2
+  %arrayidx8 = getelementptr inbounds i16, ptr %D, i32 %i.021
+  store i16 %conv7, ptr %arrayidx8, align 2
   %add9 = add nuw nsw i32 %i.021, 1
   %exitcond = icmp eq i32 %add9, 431
   br i1 %exitcond, label %for.cond.cleanup, label %for.body
 }
 
-define void @unsupported_i64_type(i64* noalias nocapture %A, i64* noalias nocapture readonly %B, i64* noalias nocapture readonly %C) #0 {
+define void @unsupported_i64_type(ptr noalias nocapture %A, ptr noalias nocapture readonly %B, ptr noalias nocapture readonly %C) #0 {
 ; CHECK-LABEL: @unsupported_i64_type(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
@@ -93,13 +89,13 @@ define void @unsupported_i64_type(i64* noalias nocapture %A, i64* noalias nocapt
 ; CHECK-NEXT:    ret void
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[I_09:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD3:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, i64* [[B:%.*]], i32 [[I_09]]
-; CHECK-NEXT:    [[TMP0:%.*]] = load i64, i64* [[ARRAYIDX]], align 8
-; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, i64* [[C:%.*]], i32 [[I_09]]
-; CHECK-NEXT:    [[TMP1:%.*]] = load i64, i64* [[ARRAYIDX1]], align 8
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i32 [[I_09]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[C:%.*]], i32 [[I_09]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[ARRAYIDX1]], align 8
 ; CHECK-NEXT:    [[ADD:%.*]] = add nsw i64 [[TMP1]], [[TMP0]]
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, i64* [[A:%.*]], i32 [[I_09]]
-; CHECK-NEXT:    store i64 [[ADD]], i64* [[ARRAYIDX2]], align 8
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i32 [[I_09]]
+; CHECK-NEXT:    store i64 [[ADD]], ptr [[ARRAYIDX2]], align 8
 ; CHECK-NEXT:    [[ADD3]] = add nuw nsw i32 [[I_09]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[ADD3]], 431
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]]
@@ -112,19 +108,19 @@ for.cond.cleanup:
 
 for.body:
   %i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
-  %arrayidx = getelementptr inbounds i64, i64* %B, i32 %i.09
-  %0 = load i64, i64* %arrayidx, align 8
-  %arrayidx1 = getelementptr inbounds i64, i64* %C, i32 %i.09
-  %1 = load i64, i64* %arrayidx1, align 8
+  %arrayidx = getelementptr inbounds i64, ptr %B, i32 %i.09
+  %0 = load i64, ptr %arrayidx, align 8
+  %arrayidx1 = getelementptr inbounds i64, ptr %C, i32 %i.09
+  %1 = load i64, ptr %arrayidx1, align 8
   %add = add nsw i64 %1, %0
-  %arrayidx2 = getelementptr inbounds i64, i64* %A, i32 %i.09
-  store i64 %add, i64* %arrayidx2, align 8
+  %arrayidx2 = getelementptr inbounds i64, ptr %A, i32 %i.09
+  store i64 %add, ptr %arrayidx2, align 8
   %add3 = add nuw nsw i32 %i.09, 1
   %exitcond = icmp eq i32 %add3, 431
   br i1 %exitcond, label %for.cond.cleanup, label %for.body
 }
 
-define void @narrowing_load_not_allowed(i8* noalias nocapture %A, i8* noalias nocapture readonly %B, i16* noalias nocapture readonly %C) #0 {
+define void @narrowing_load_not_allowed(ptr noalias nocapture %A, ptr noalias nocapture readonly %B, ptr noalias nocapture readonly %C) #0 {
 ; CHECK-LABEL: @narrowing_load_not_allowed(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
@@ -133,23 +129,20 @@ define void @narrowing_load_not_allowed(i8* noalias nocapture %A, i8* noalias no
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i16, i16* [[C:%.*]], i32 [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i16, i16* [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i16* [[TMP2]] to <8 x i16>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i16>, <8 x i16>* [[TMP3]], align 2
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[B:%.*]], i32 [[TMP0]]
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i8* [[TMP5]] to <8 x i8>*
-; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <8 x i8>, <8 x i8>* [[TMP6]], align 1
-; CHECK-NEXT:    [[TMP7:%.*]] = trunc <8 x i16> [[WIDE_LOAD]] to <8 x i8>
-; CHECK-NEXT:    [[TMP8:%.*]] = add <8 x i8> [[WIDE_LOAD1]], [[TMP7]]
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, i8* [[A:%.*]], i32 [[TMP0]]
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, i8* [[TMP9]], i32 0
-; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i8* [[TMP10]] to <8 x i8>*
-; CHECK-NEXT:    store <8 x i8> [[TMP8]], <8 x i8>* [[TMP11]], align 1
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i16, ptr [[C:%.*]], i32 [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i16, ptr [[TMP1]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[TMP2]], align 2
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[B:%.*]], i32 [[TMP0]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <8 x i8>, ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP5:%.*]] = trunc <8 x i16> [[WIDE_LOAD]] to <8 x i8>
+; CHECK-NEXT:    [[TMP6:%.*]] = add <8 x i8> [[WIDE_LOAD1]], [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i32 [[TMP0]]
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0
+; CHECK-NEXT:    store <8 x i8> [[TMP6]], ptr [[TMP8]], align 1
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
-; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], 424
-; CHECK-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], 424
+; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 431, 424
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
@@ -160,14 +153,14 @@ define void @narrowing_load_not_allowed(i8* noalias nocapture %A, i8* noalias no
 ; CHECK-NEXT:    ret void
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[I_012:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[ADD6:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[C]], i32 [[I_012]]
-; CHECK-NEXT:    [[TMP13:%.*]] = load i16, i16* [[ARRAYIDX]], align 2
-; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, i8* [[B]], i32 [[I_012]]
-; CHECK-NEXT:    [[TMP14:%.*]] = load i8, i8* [[ARRAYIDX1]], align 1
-; CHECK-NEXT:    [[CONV3:%.*]] = trunc i16 [[TMP13]] to i8
-; CHECK-NEXT:    [[ADD:%.*]] = add i8 [[TMP14]], [[CONV3]]
-; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i8, i8* [[A]], i32 [[I_012]]
-; CHECK-NEXT:    store i8 [[ADD]], i8* [[ARRAYIDX5]], align 1
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[C]], i32 [[I_012]]
+; CHECK-NEXT:    [[TMP10:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[B]], i32 [[I_012]]
+; CHECK-NEXT:    [[TMP11:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
+; CHECK-NEXT:    [[CONV3:%.*]] = trunc i16 [[TMP10]] to i8
+; CHECK-NEXT:    [[ADD:%.*]] = add i8 [[TMP11]], [[CONV3]]
+; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i8, ptr [[A]], i32 [[I_012]]
+; CHECK-NEXT:    store i8 [[ADD]], ptr [[ARRAYIDX5]], align 1
 ; CHECK-NEXT:    [[ADD6]] = add nuw nsw i32 [[I_012]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[ADD6]], 431
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
@@ -180,14 +173,14 @@ for.cond.cleanup:                                 ; preds = %for.body
 
 for.body:                                         ; preds = %for.body, %entry
   %i.012 = phi i32 [ 0, %entry ], [ %add6, %for.body ]
-  %arrayidx = getelementptr inbounds i16, i16* %C, i32 %i.012
-  %0 = load i16, i16* %arrayidx, align 2
-  %arrayidx1 = getelementptr inbounds i8, i8* %B, i32 %i.012
-  %1 = load i8, i8* %arrayidx1, align 1
+  %arrayidx = getelementptr inbounds i16, ptr %C, i32 %i.012
+  %0 = load i16, ptr %arrayidx, align 2
+  %arrayidx1 = getelementptr inbounds i8, ptr %B, i32 %i.012
+  %1 = load i8, ptr %arrayidx1, align 1
   %conv3 = trunc i16 %0 to i8
   %add = add i8 %1, %conv3
-  %arrayidx5 = getelementptr inbounds i8, i8* %A, i32 %i.012
-  store i8 %add, i8* %arrayidx5, align 1
+  %arrayidx5 = getelementptr inbounds i8, ptr %A, i32 %i.012
+  store i8 %add, ptr %arrayidx5, align 1
   %add6 = add nuw nsw i32 %i.012, 1
   %exitcond = icmp eq i32 %add6, 431
   br i1 %exitcond, label %for.cond.cleanup, label %for.body
@@ -198,7 +191,7 @@ for.body:                                         ; preds = %for.body, %entry
 ; loop control statements, and thus not affecting element sizes, so
 ; we could allow this case.
 ;
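 ; Concretely (see the function body below), the only use of the trunc is
 ; the exit test in the loop latch:
 ;
 ;   %add.iv = trunc i32 %add3 to i16
 ;   %exitcond = icmp eq i16 %add.iv, 431
 ;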
-define void @trunc_not_allowed(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) #0 {
+define void @trunc_not_allowed(ptr noalias nocapture %A, ptr noalias nocapture readonly %B, ptr noalias nocapture readonly %C) #0 {
 ; CHECK-LABEL: @trunc_not_allowed(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
@@ -207,22 +200,19 @@ define void @trunc_not_allowed(i32* noalias nocapture %A, i32* noalias nocapture
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i32 [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i32 [[TMP0]]
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP6]], align 4
-; CHECK-NEXT:    [[TMP7:%.*]] = add nsw <4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]]
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[TMP0]]
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP8]], i32 0
-; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>*
-; CHECK-NEXT:    store <4 x i32> [[TMP7]], <4 x i32>* [[TMP10]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i32 [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[C:%.*]], i32 [[TMP0]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[TMP0]]
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0
+; CHECK-NEXT:    store <4 x i32> [[TMP5]], ptr [[TMP7]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i32 [[INDEX_NEXT]], 428
-; CHECK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], 428
+; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 431, 428
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
@@ -233,13 +223,13 @@ define void @trunc_not_allowed(i32* noalias nocapture %A, i32* noalias nocapture
 ; CHECK-NEXT:    ret void
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[I_09:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[ADD3:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 [[I_09]]
-; CHECK-NEXT:    [[TMP12:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[C]], i32 [[I_09]]
-; CHECK-NEXT:    [[TMP13:%.*]] = load i32, i32* [[ARRAYIDX1]], align 4
-; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP13]], [[TMP12]]
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[I_09]]
-; CHECK-NEXT:    store i32 [[ADD]], i32* [[ARRAYIDX2]], align 4
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[I_09]]
+; CHECK-NEXT:    [[TMP9:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[C]], i32 [[I_09]]
+; CHECK-NEXT:    [[TMP10:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP10]], [[TMP9]]
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[I_09]]
+; CHECK-NEXT:    store i32 [[ADD]], ptr [[ARRAYIDX2]], align 4
 ; CHECK-NEXT:    [[ADD3]] = add nuw nsw i32 [[I_09]], 1
 ; CHECK-NEXT:    [[ADD_IV:%.*]] = trunc i32 [[ADD3]] to i16
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i16 [[ADD_IV]], 431
@@ -253,13 +243,13 @@ for.cond.cleanup:
 
 for.body:
   %i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
-  %arrayidx = getelementptr inbounds i32, i32* %B, i32 %i.09
-  %0 = load i32, i32* %arrayidx, align 4
-  %arrayidx1 = getelementptr inbounds i32, i32* %C, i32 %i.09
-  %1 = load i32, i32* %arrayidx1, align 4
+  %arrayidx = getelementptr inbounds i32, ptr %B, i32 %i.09
+  %0 = load i32, ptr %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds i32, ptr %C, i32 %i.09
+  %1 = load i32, ptr %arrayidx1, align 4
   %add = add nsw i32 %1, %0
-  %arrayidx2 = getelementptr inbounds i32, i32* %A, i32 %i.09
-  store i32 %add, i32* %arrayidx2, align 4
+  %arrayidx2 = getelementptr inbounds i32, ptr %A, i32 %i.09
+  store i32 %add, ptr %arrayidx2, align 4
   %add3 = add nuw nsw i32 %i.09, 1
 
   %add.iv = trunc i32 %add3 to i16
@@ -271,7 +261,7 @@ for.body:
 ; Test directions for array indices i and N-1. I.e. check strides 1 and -1, and
 ; force vectorisation with a loop hint.
 ;
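 ; That is, B is indexed forwards with i and C backwards with N-i:
 ;
 ;   %arrayidx = getelementptr inbounds i32, ptr %B, i32 %i.09
 ;   %sub = sub nsw i32 %N, %i.09
 ;   %arrayidx1 = getelementptr inbounds i32, ptr %C, i32 %sub
 ;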
-define void @strides_different_direction(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32 %N) #0 {
+define void @strides_different_direction(ptr noalias nocapture %A, ptr noalias nocapture readonly %B, ptr noalias nocapture readonly %C, i32 %N) #0 {
 ; CHECK-LABEL: @strides_different_direction(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
@@ -280,25 +270,22 @@ define void @strides_different_direction(i32* noalias nocapture %A, i32* noalias
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i32 [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = sub nsw i32 [[N:%.*]], [[TMP0]]
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i32 [[TMP4]]
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i32 0
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 -3
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i32* [[TMP7]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP8]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i32 [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = sub nsw i32 [[N:%.*]], [[TMP0]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[C:%.*]], i32 [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 -3
+; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i32>, ptr [[TMP6]], align 4
 ; CHECK-NEXT:    [[REVERSE:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD1]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:    [[TMP9:%.*]] = add nsw <4 x i32> [[REVERSE]], [[WIDE_LOAD]]
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[TMP0]]
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP10]], i32 0
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>*
-; CHECK-NEXT:    store <4 x i32> [[TMP9]], <4 x i32>* [[TMP12]], align 4
+; CHECK-NEXT:    [[TMP7:%.*]] = add nsw <4 x i32> [[REVERSE]], [[WIDE_LOAD]]
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[TMP0]]
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0
+; CHECK-NEXT:    store <4 x i32> [[TMP7]], ptr [[TMP9]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT]], 428
-; CHECK-NEXT:    br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], 428
+; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 431, 428
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
@@ -309,14 +296,14 @@ define void @strides_different_direction(i32* noalias nocapture %A, i32* noalias
 ; CHECK-NEXT:    ret void
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[I_09:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[ADD3:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 [[I_09]]
-; CHECK-NEXT:    [[TMP14:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[I_09]]
+; CHECK-NEXT:    [[TMP11:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
 ; CHECK-NEXT:    [[SUB:%.*]] = sub nsw i32 [[N]], [[I_09]]
-; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[C]], i32 [[SUB]]
-; CHECK-NEXT:    [[TMP15:%.*]] = load i32, i32* [[ARRAYIDX1]], align 4
-; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP15]], [[TMP14]]
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[I_09]]
-; CHECK-NEXT:    store i32 [[ADD]], i32* [[ARRAYIDX2]], align 4
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[C]], i32 [[SUB]]
+; CHECK-NEXT:    [[TMP12:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP12]], [[TMP11]]
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[I_09]]
+; CHECK-NEXT:    store i32 [[ADD]], ptr [[ARRAYIDX2]], align 4
 ; CHECK-NEXT:    [[ADD3]] = add nuw nsw i32 [[I_09]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[ADD3]], 431
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
@@ -329,20 +316,20 @@ for.cond.cleanup:
 
 for.body:
   %i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
-  %arrayidx = getelementptr inbounds i32, i32* %B, i32 %i.09
-  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx = getelementptr inbounds i32, ptr %B, i32 %i.09
+  %0 = load i32, ptr %arrayidx, align 4
   %sub = sub nsw i32 %N, %i.09
-  %arrayidx1 = getelementptr inbounds i32, i32* %C, i32 %sub
-  %1 = load i32, i32* %arrayidx1, align 4
+  %arrayidx1 = getelementptr inbounds i32, ptr %C, i32 %sub
+  %1 = load i32, ptr %arrayidx1, align 4
   %add = add nsw i32 %1, %0
-  %arrayidx2 = getelementptr inbounds i32, i32* %A, i32 %i.09
-  store i32 %add, i32* %arrayidx2, align 4
+  %arrayidx2 = getelementptr inbounds i32, ptr %A, i32 %i.09
+  store i32 %add, ptr %arrayidx2, align 4
   %add3 = add nuw nsw i32 %i.09, 1
   %exitcond = icmp eq i32 %add3, 431
   br i1 %exitcond, label %for.cond.cleanup, label %for.body, !llvm.loop !10
 }
 
-define void @too_many_loop_blocks(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) #0 {
+define void @too_many_loop_blocks(ptr noalias nocapture %A, ptr noalias nocapture readonly %B, ptr noalias nocapture readonly %C) #0 {
 ; CHECK-LABEL: @too_many_loop_blocks(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
@@ -351,22 +338,19 @@ define void @too_many_loop_blocks(i32* noalias nocapture %A, i32* noalias nocapt
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i32 [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i32 [[TMP0]]
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP6]], align 4
-; CHECK-NEXT:    [[TMP7:%.*]] = add nsw <4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]]
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[TMP0]]
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP8]], i32 0
-; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>*
-; CHECK-NEXT:    store <4 x i32> [[TMP7]], <4 x i32>* [[TMP10]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i32 [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[C:%.*]], i32 [[TMP0]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[TMP0]]
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0
+; CHECK-NEXT:    store <4 x i32> [[TMP5]], ptr [[TMP7]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i32 [[INDEX_NEXT]], 428
-; CHECK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], 428
+; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 431, 428
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
@@ -377,13 +361,13 @@ define void @too_many_loop_blocks(i32* noalias nocapture %A, i32* noalias nocapt
 ; CHECK-NEXT:    ret void
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[I_09:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[ADD3:%.*]], [[LOOPINCR:%.*]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 [[I_09]]
-; CHECK-NEXT:    [[TMP12:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[C]], i32 [[I_09]]
-; CHECK-NEXT:    [[TMP13:%.*]] = load i32, i32* [[ARRAYIDX1]], align 4
-; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP13]], [[TMP12]]
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[I_09]]
-; CHECK-NEXT:    store i32 [[ADD]], i32* [[ARRAYIDX2]], align 4
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[I_09]]
+; CHECK-NEXT:    [[TMP9:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[C]], i32 [[I_09]]
+; CHECK-NEXT:    [[TMP10:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP10]], [[TMP9]]
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[I_09]]
+; CHECK-NEXT:    store i32 [[ADD]], ptr [[ARRAYIDX2]], align 4
 ; CHECK-NEXT:    br label [[LOOPINCR]]
 ; CHECK:       loopincr:
 ; CHECK-NEXT:    [[ADD3]] = add nuw nsw i32 [[I_09]], 1
@@ -398,13 +382,13 @@ for.cond.cleanup:
 
 for.body:
   %i.09 = phi i32 [ 0, %entry ], [ %add3, %loopincr ]
-  %arrayidx = getelementptr inbounds i32, i32* %B, i32 %i.09
-  %0 = load i32, i32* %arrayidx, align 4
-  %arrayidx1 = getelementptr inbounds i32, i32* %C, i32 %i.09
-  %1 = load i32, i32* %arrayidx1, align 4
+  %arrayidx = getelementptr inbounds i32, ptr %B, i32 %i.09
+  %0 = load i32, ptr %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds i32, ptr %C, i32 %i.09
+  %1 = load i32, ptr %arrayidx1, align 4
   %add = add nsw i32 %1, %0
-  %arrayidx2 = getelementptr inbounds i32, i32* %A, i32 %i.09
-  store i32 %add, i32* %arrayidx2, align 4
+  %arrayidx2 = getelementptr inbounds i32, ptr %A, i32 %i.09
+  store i32 %add, ptr %arrayidx2, align 4
   br label %loopincr
 
 loopincr:
@@ -413,7 +397,7 @@ loopincr:
   br i1 %exitcond, label %for.cond.cleanup, label %for.body
 }
 
-define void @double(double* noalias nocapture %A, double* noalias nocapture readonly %B, double* noalias nocapture readonly %C) #0 {
+define void @double(ptr noalias nocapture %A, ptr noalias nocapture readonly %B, ptr noalias nocapture readonly %C) #0 {
 ; CHECK-LABEL: @double(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
@@ -421,13 +405,13 @@ define void @double(double* noalias nocapture %A, double* noalias nocapture read
 ; CHECK-NEXT:    ret void
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[I_09:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD3:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[B:%.*]], i32 [[I_09]]
-; CHECK-NEXT:    [[TMP0:%.*]] = load double, double* [[ARRAYIDX]], align 8
-; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds double, double* [[C:%.*]], i32 [[I_09]]
-; CHECK-NEXT:    [[TMP1:%.*]] = load double, double* [[ARRAYIDX1]], align 8
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[B:%.*]], i32 [[I_09]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load double, ptr [[ARRAYIDX]], align 8
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds double, ptr [[C:%.*]], i32 [[I_09]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load double, ptr [[ARRAYIDX1]], align 8
 ; CHECK-NEXT:    [[ADD:%.*]] = fadd fast double [[TMP1]], [[TMP0]]
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i32 [[I_09]]
-; CHECK-NEXT:    store double [[ADD]], double* [[ARRAYIDX2]], align 8
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds double, ptr [[A:%.*]], i32 [[I_09]]
+; CHECK-NEXT:    store double [[ADD]], ptr [[ARRAYIDX2]], align 8
 ; CHECK-NEXT:    [[ADD3]] = add nuw nsw i32 [[I_09]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[ADD3]], 431
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]]
@@ -440,19 +424,19 @@ for.cond.cleanup:
 
 for.body:
   %i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
-  %arrayidx = getelementptr inbounds double, double* %B, i32 %i.09
-  %0 = load double, double* %arrayidx, align 8
-  %arrayidx1 = getelementptr inbounds double, double* %C, i32 %i.09
-  %1 = load double, double* %arrayidx1, align 8
+  %arrayidx = getelementptr inbounds double, ptr %B, i32 %i.09
+  %0 = load double, ptr %arrayidx, align 8
+  %arrayidx1 = getelementptr inbounds double, ptr %C, i32 %i.09
+  %1 = load double, ptr %arrayidx1, align 8
   %add = fadd fast double %1, %0
-  %arrayidx2 = getelementptr inbounds double, double* %A, i32 %i.09
-  store double %add, double* %arrayidx2, align 8
+  %arrayidx2 = getelementptr inbounds double, ptr %A, i32 %i.09
+  store double %add, ptr %arrayidx2, align 8
   %add3 = add nuw nsw i32 %i.09, 1
   %exitcond = icmp eq i32 %add3, 431
   br i1 %exitcond, label %for.cond.cleanup, label %for.body
 }
 
-define void @fptrunc_not_allowed(float* noalias nocapture %A, float* noalias nocapture readonly %B, float* noalias nocapture readonly %C, half* noalias nocapture %D) #0 {
+define void @fptrunc_not_allowed(ptr noalias nocapture %A, ptr noalias nocapture readonly %B, ptr noalias nocapture readonly %C, ptr noalias nocapture %D) #0 {
 ; CHECK-LABEL: @fptrunc_not_allowed(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
@@ -461,28 +445,24 @@ define void @fptrunc_not_allowed(float* noalias nocapture %A, float* noalias noc
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i32 [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, float* [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float* [[TMP2]] to <4 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP3]], align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[C:%.*]], i32 [[TMP0]]
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast float* [[TMP5]] to <4 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x float>, <4 x float>* [[TMP6]], align 4
-; CHECK-NEXT:    [[TMP7:%.*]] = fadd fast <4 x float> [[WIDE_LOAD1]], [[WIDE_LOAD]]
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i32 [[TMP0]]
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds float, float* [[TMP8]], i32 0
-; CHECK-NEXT:    [[TMP10:%.*]] = bitcast float* [[TMP9]] to <4 x float>*
-; CHECK-NEXT:    store <4 x float> [[TMP7]], <4 x float>* [[TMP10]], align 4
-; CHECK-NEXT:    [[TMP11:%.*]] = fptrunc <4 x float> [[TMP7]] to <4 x half>
-; CHECK-NEXT:    [[TMP12:%.*]] = fmul fast <4 x half> [[TMP11]], <half 0xH4000, half 0xH4000, half 0xH4000, half 0xH4000>
-; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds half, half* [[D:%.*]], i32 [[TMP0]]
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds half, half* [[TMP13]], i32 0
-; CHECK-NEXT:    [[TMP15:%.*]] = bitcast half* [[TMP14]] to <4 x half>*
-; CHECK-NEXT:    store <4 x half> [[TMP12]], <4 x half>* [[TMP15]], align 2
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i32 [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP2]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, ptr [[C:%.*]], i32 [[TMP0]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP3]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x float>, ptr [[TMP4]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = fadd fast <4 x float> [[WIDE_LOAD1]], [[WIDE_LOAD]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i32 [[TMP0]]
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i32 0
+; CHECK-NEXT:    store <4 x float> [[TMP5]], ptr [[TMP7]], align 4
+; CHECK-NEXT:    [[TMP8:%.*]] = fptrunc <4 x float> [[TMP5]] to <4 x half>
+; CHECK-NEXT:    [[TMP9:%.*]] = fmul fast <4 x half> [[TMP8]], <half 0xH4000, half 0xH4000, half 0xH4000, half 0xH4000>
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds half, ptr [[D:%.*]], i32 [[TMP0]]
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds half, ptr [[TMP10]], i32 0
+; CHECK-NEXT:    store <4 x half> [[TMP9]], ptr [[TMP11]], align 2
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP16:%.*]] = icmp eq i32 [[INDEX_NEXT]], 428
-; CHECK-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], 428
+; CHECK-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 431, 428
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
@@ -493,17 +473,17 @@ define void @fptrunc_not_allowed(float* noalias nocapture %A, float* noalias noc
 ; CHECK-NEXT:    ret void
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[I_017:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[ADD6:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[B]], i32 [[I_017]]
-; CHECK-NEXT:    [[TMP17:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds float, float* [[C]], i32 [[I_017]]
-; CHECK-NEXT:    [[TMP18:%.*]] = load float, float* [[ARRAYIDX1]], align 4
-; CHECK-NEXT:    [[ADD:%.*]] = fadd fast float [[TMP18]], [[TMP17]]
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A]], i32 [[I_017]]
-; CHECK-NEXT:    store float [[ADD]], float* [[ARRAYIDX2]], align 4
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[B]], i32 [[I_017]]
+; CHECK-NEXT:    [[TMP13:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[C]], i32 [[I_017]]
+; CHECK-NEXT:    [[TMP14:%.*]] = load float, ptr [[ARRAYIDX1]], align 4
+; CHECK-NEXT:    [[ADD:%.*]] = fadd fast float [[TMP14]], [[TMP13]]
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[A]], i32 [[I_017]]
+; CHECK-NEXT:    store float [[ADD]], ptr [[ARRAYIDX2]], align 4
 ; CHECK-NEXT:    [[CONV:%.*]] = fptrunc float [[ADD]] to half
 ; CHECK-NEXT:    [[FACTOR:%.*]] = fmul fast half [[CONV]], 0xH4000
-; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds half, half* [[D]], i32 [[I_017]]
-; CHECK-NEXT:    store half [[FACTOR]], half* [[ARRAYIDX5]], align 2
+; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds half, ptr [[D]], i32 [[I_017]]
+; CHECK-NEXT:    store half [[FACTOR]], ptr [[ARRAYIDX5]], align 2
 ; CHECK-NEXT:    [[ADD6]] = add nuw nsw i32 [[I_017]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[ADD6]], 431
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
@@ -516,17 +496,17 @@ for.cond.cleanup:
 
 for.body:
   %i.017 = phi i32 [ 0, %entry ], [ %add6, %for.body ]
-  %arrayidx = getelementptr inbounds float, float* %B, i32 %i.017
-  %0 = load float, float* %arrayidx, align 4
-  %arrayidx1 = getelementptr inbounds float, float* %C, i32 %i.017
-  %1 = load float, float* %arrayidx1, align 4
+  %arrayidx = getelementptr inbounds float, ptr %B, i32 %i.017
+  %0 = load float, ptr %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds float, ptr %C, i32 %i.017
+  %1 = load float, ptr %arrayidx1, align 4
   %add = fadd fast float %1, %0
-  %arrayidx2 = getelementptr inbounds float, float* %A, i32 %i.017
-  store float %add, float* %arrayidx2, align 4
+  %arrayidx2 = getelementptr inbounds float, ptr %A, i32 %i.017
+  store float %add, ptr %arrayidx2, align 4
   %conv = fptrunc float %add to half
   %factor = fmul fast half %conv, 0xH4000
-  %arrayidx5 = getelementptr inbounds half, half* %D, i32 %i.017
-  store half %factor, half* %arrayidx5, align 2
+  %arrayidx5 = getelementptr inbounds half, ptr %D, i32 %i.017
+  store half %factor, ptr %arrayidx5, align 2
   %add6 = add nuw nsw i32 %i.017, 1
   %exitcond = icmp eq i32 %add6, 431
   br i1 %exitcond, label %for.cond.cleanup, label %for.body
@@ -537,7 +517,7 @@ for.body:
 ; which aren't supported by the lowoverhead loop pass, causing the tail-predication
 ; to be reverted which is expensive and what we would like to avoid.
 ;
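 ; The offending pattern is a select between two base pointers, which the
 ; vectoriser widens into a select of pointer vectors feeding a gather:
 ;
 ;   %C.B = select i1 %tobool.not, ptr %C, ptr %B
 ;   %cond.in = getelementptr inbounds i32, ptr %C.B, i32 %i.011
 ;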
-define dso_local void @select_not_allowed(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32 %N, i32* noalias nocapture readonly %Cond) {
+define dso_local void @select_not_allowed(ptr noalias nocapture %A, ptr noalias nocapture readonly %B, ptr noalias nocapture readonly %C, i32 %N, ptr noalias nocapture readonly %Cond) {
 ; CHECK-LABEL: @select_not_allowed(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[CMP10:%.*]] = icmp sgt i32 [[N:%.*]], 0
@@ -548,31 +528,29 @@ define dso_local void @select_not_allowed(i32* noalias nocapture %A, i32* noalia
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[N]], 4
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32*> poison, i32* [[C:%.*]], i32 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32*> [[BROADCAST_SPLATINSERT]], <4 x i32*> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32*> poison, i32* [[B:%.*]], i32 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32*> [[BROADCAST_SPLATINSERT1]], <4 x i32*> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x ptr> poison, ptr [[C:%.*]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x ptr> [[BROADCAST_SPLATINSERT]], <4 x ptr> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x ptr> poison, ptr [[B:%.*]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x ptr> [[BROADCAST_SPLATINSERT1]], <4 x ptr> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[COND:%.*]], i32 [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD]], zeroinitializer
-; CHECK-NEXT:    [[TMP5:%.*]] = select <4 x i1> [[TMP4]], <4 x i32*> [[BROADCAST_SPLAT]], <4 x i32*> [[BROADCAST_SPLAT2]]
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, <4 x i32*> [[TMP5]], <4 x i32> [[VEC_IND]]
-; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP6]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> poison)
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[TMP0]]
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP7]], i32 0
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>*
-; CHECK-NEXT:    store <4 x i32> [[WIDE_MASKED_GATHER]], <4 x i32>* [[TMP9]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[COND:%.*]], i32 [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD]], zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = select <4 x i1> [[TMP3]], <4 x ptr> [[BROADCAST_SPLAT]], <4 x ptr> [[BROADCAST_SPLAT2]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, <4 x ptr> [[TMP4]], <4 x i32> [[VEC_IND]]
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[TMP5]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> poison)
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[TMP0]]
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0
+; CHECK-NEXT:    store <4 x i32> [[WIDE_MASKED_GATHER]], ptr [[TMP7]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
@@ -585,14 +563,14 @@ define dso_local void @select_not_allowed(i32* noalias nocapture %A, i32* noalia
 ; CHECK-NEXT:    ret void
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[I_011:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[COND]], i32 [[I_011]]
-; CHECK-NEXT:    [[TMP11:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i32 [[TMP11]], 0
-; CHECK-NEXT:    [[C_B:%.*]] = select i1 [[TOBOOL_NOT]], i32* [[C]], i32* [[B]]
-; CHECK-NEXT:    [[COND_IN:%.*]] = getelementptr inbounds i32, i32* [[C_B]], i32 [[I_011]]
-; CHECK-NEXT:    [[COND:%.*]] = load i32, i32* [[COND_IN]], align 4
-; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[I_011]]
-; CHECK-NEXT:    store i32 [[COND]], i32* [[ARRAYIDX3]], align 4
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[COND]], i32 [[I_011]]
+; CHECK-NEXT:    [[TMP9:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i32 [[TMP9]], 0
+; CHECK-NEXT:    [[C_B:%.*]] = select i1 [[TOBOOL_NOT]], ptr [[C]], ptr [[B]]
+; CHECK-NEXT:    [[COND_IN:%.*]] = getelementptr inbounds i32, ptr [[C_B]], i32 [[I_011]]
+; CHECK-NEXT:    [[COND:%.*]] = load i32, ptr [[COND_IN]], align 4
+; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[I_011]]
+; CHECK-NEXT:    store i32 [[COND]], ptr [[ARRAYIDX3]], align 4
 ; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[I_011]], 1
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
 ; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
@@ -612,20 +590,20 @@ for.cond.cleanup:                                 ; preds = %for.cond.cleanup.lo
 
 for.body:                                         ; preds = %for.body.preheader, %for.body
   %i.011 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
-  %arrayidx = getelementptr inbounds i32, i32* %Cond, i32 %i.011
-  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx = getelementptr inbounds i32, ptr %Cond, i32 %i.011
+  %0 = load i32, ptr %arrayidx, align 4
   %tobool.not = icmp eq i32 %0, 0
-  %C.B = select i1 %tobool.not, i32* %C, i32* %B
-  %cond.in = getelementptr inbounds i32, i32* %C.B, i32 %i.011
-  %cond = load i32, i32* %cond.in, align 4
-  %arrayidx3 = getelementptr inbounds i32, i32* %A, i32 %i.011
-  store i32 %cond, i32* %arrayidx3, align 4
+  %C.B = select i1 %tobool.not, ptr %C, ptr %B
+  %cond.in = getelementptr inbounds i32, ptr %C.B, i32 %i.011
+  %cond = load i32, ptr %cond.in, align 4
+  %arrayidx3 = getelementptr inbounds i32, ptr %A, i32 %i.011
+  store i32 %cond, ptr %arrayidx3, align 4
   %inc = add nuw nsw i32 %i.011, 1
   %exitcond.not = icmp eq i32 %inc, %N
   br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
 }
 
-define i32 @i32_smin_reduction(i32* nocapture readonly %x, i32 %n) #0 {
+define i32 @i32_smin_reduction(ptr nocapture readonly %x, i32 %n) #0 {
 ; CHECK-LABEL: @i32_smin_reduction(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0
@@ -639,37 +617,36 @@ define i32 @i32_smin_reduction(i32* nocapture readonly %x, i32 %n) #0 {
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647>, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647>, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[X:%.*]], i32 [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = icmp slt <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]]
-; CHECK-NEXT:    [[TMP5]] = select <4 x i1> [[TMP4]], <4 x i32> [[VEC_PHI]], <4 x i32> [[WIDE_LOAD]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[X:%.*]], i32 [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp slt <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]]
+; CHECK-NEXT:    [[TMP4]] = select <4 x i1> [[TMP3]], <4 x i32> [[VEC_PHI]], <4 x i32> [[WIDE_LOAD]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP7:%.*]] = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> [[TMP5]])
+; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> [[TMP4]])
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 2147483647, [[FOR_BODY_PREHEADER]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 2147483647, [[FOR_BODY_PREHEADER]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; CHECK-NEXT:    [[R_07:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[X]], i32 [[I_08]]
-; CHECK-NEXT:    [[TMP8:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[C:%.*]] = icmp slt i32 [[R_07]], [[TMP8]]
-; CHECK-NEXT:    [[ADD]] = select i1 [[C]], i32 [[R_07]], i32 [[TMP8]]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[X]], i32 [[I_08]]
+; CHECK-NEXT:    [[TMP7:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[C:%.*]] = icmp slt i32 [[R_07]], [[TMP7]]
+; CHECK-NEXT:    [[ADD]] = select i1 [[C]], i32 [[R_07]], i32 [[TMP7]]
 ; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[I_08]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
 ; CHECK:       for.cond.cleanup.loopexit:
-; CHECK-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
 ; CHECK:       for.cond.cleanup:
 ; CHECK-NEXT:    [[R_0_LCSSA:%.*]] = phi i32 [ 2147483647, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
@@ -682,8 +659,8 @@ entry:
 for.body:                                         ; preds = %entry, %for.body
   %i.08 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
   %r.07 = phi i32 [ %add, %for.body ], [ 2147483647, %entry ]
-  %arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08
-  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx = getelementptr inbounds i32, ptr %x, i32 %i.08
+  %0 = load i32, ptr %arrayidx, align 4
   %c = icmp slt i32 %r.07, %0
   %add = select i1 %c, i32 %r.07, i32 %0
   %inc = add nuw nsw i32 %i.08, 1
@@ -695,7 +672,7 @@ for.cond.cleanup:                                 ; preds = %for.body, %entry
   ret i32 %r.0.lcssa
 }
 
-define i32 @i32_smax_reduction(i32* nocapture readonly %x, i32 %n) #0 {
+define i32 @i32_smax_reduction(ptr nocapture readonly %x, i32 %n) #0 {
 ; CHECK-LABEL: @i32_smax_reduction(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0
@@ -709,37 +686,36 @@ define i32 @i32_smax_reduction(i32* nocapture readonly %x, i32 %n) #0 {
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[X:%.*]], i32 [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = icmp sgt <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]]
-; CHECK-NEXT:    [[TMP5]] = select <4 x i1> [[TMP4]], <4 x i32> [[VEC_PHI]], <4 x i32> [[WIDE_LOAD]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[X:%.*]], i32 [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp sgt <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]]
+; CHECK-NEXT:    [[TMP4]] = select <4 x i1> [[TMP3]], <4 x i32> [[VEC_PHI]], <4 x i32> [[WIDE_LOAD]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP7:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP5]])
+; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP4]])
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ -2147483648, [[FOR_BODY_PREHEADER]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ -2147483648, [[FOR_BODY_PREHEADER]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; CHECK-NEXT:    [[R_07:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[X]], i32 [[I_08]]
-; CHECK-NEXT:    [[TMP8:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[C:%.*]] = icmp sgt i32 [[R_07]], [[TMP8]]
-; CHECK-NEXT:    [[ADD]] = select i1 [[C]], i32 [[R_07]], i32 [[TMP8]]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[X]], i32 [[I_08]]
+; CHECK-NEXT:    [[TMP7:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[C:%.*]] = icmp sgt i32 [[R_07]], [[TMP7]]
+; CHECK-NEXT:    [[ADD]] = select i1 [[C]], i32 [[R_07]], i32 [[TMP7]]
 ; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[I_08]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
 ; CHECK:       for.cond.cleanup.loopexit:
-; CHECK-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
 ; CHECK:       for.cond.cleanup:
 ; CHECK-NEXT:    [[R_0_LCSSA:%.*]] = phi i32 [ -2147483648, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
@@ -752,8 +728,8 @@ entry:
 for.body:                                         ; preds = %entry, %for.body
   %i.08 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
   %r.07 = phi i32 [ %add, %for.body ], [ -2147483648, %entry ]
-  %arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08
-  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx = getelementptr inbounds i32, ptr %x, i32 %i.08
+  %0 = load i32, ptr %arrayidx, align 4
   %c = icmp sgt i32 %r.07, %0
   %add = select i1 %c, i32 %r.07, i32 %0
   %inc = add nuw nsw i32 %i.08, 1
@@ -765,7 +741,7 @@ for.cond.cleanup:                                 ; preds = %for.body, %entry
   ret i32 %r.0.lcssa
 }
 
-define i32 @i32_umin_reduction(i32* nocapture readonly %x, i32 %n) #0 {
+define i32 @i32_umin_reduction(ptr nocapture readonly %x, i32 %n) #0 {
 ; CHECK-LABEL: @i32_umin_reduction(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0
@@ -779,37 +755,36 @@ define i32 @i32_umin_reduction(i32* nocapture readonly %x, i32 %n) #0 {
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 -1, i32 -1, i32 -1, i32 -1>, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 -1, i32 -1, i32 -1, i32 -1>, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[X:%.*]], i32 [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = icmp ult <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]]
-; CHECK-NEXT:    [[TMP5]] = select <4 x i1> [[TMP4]], <4 x i32> [[VEC_PHI]], <4 x i32> [[WIDE_LOAD]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[X:%.*]], i32 [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ult <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]]
+; CHECK-NEXT:    [[TMP4]] = select <4 x i1> [[TMP3]], <4 x i32> [[VEC_PHI]], <4 x i32> [[WIDE_LOAD]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP7:%.*]] = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> [[TMP5]])
+; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> [[TMP4]])
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; CHECK-NEXT:    [[R_07:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[X]], i32 [[I_08]]
-; CHECK-NEXT:    [[TMP8:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[C:%.*]] = icmp ult i32 [[R_07]], [[TMP8]]
-; CHECK-NEXT:    [[ADD]] = select i1 [[C]], i32 [[R_07]], i32 [[TMP8]]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[X]], i32 [[I_08]]
+; CHECK-NEXT:    [[TMP7:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[C:%.*]] = icmp ult i32 [[R_07]], [[TMP7]]
+; CHECK-NEXT:    [[ADD]] = select i1 [[C]], i32 [[R_07]], i32 [[TMP7]]
 ; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[I_08]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
 ; CHECK:       for.cond.cleanup.loopexit:
-; CHECK-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
 ; CHECK:       for.cond.cleanup:
 ; CHECK-NEXT:    [[R_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
@@ -822,8 +797,8 @@ entry:
 for.body:                                         ; preds = %entry, %for.body
   %i.08 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
   %r.07 = phi i32 [ %add, %for.body ], [ 4294967295, %entry ]
-  %arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08
-  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx = getelementptr inbounds i32, ptr %x, i32 %i.08
+  %0 = load i32, ptr %arrayidx, align 4
   %c = icmp ult i32 %r.07, %0
   %add = select i1 %c, i32 %r.07, i32 %0
   %inc = add nuw nsw i32 %i.08, 1
@@ -835,7 +810,7 @@ for.cond.cleanup:                                 ; preds = %for.body, %entry
   ret i32 %r.0.lcssa
 }
 
-define i32 @i32_umax_reduction(i32* nocapture readonly %x, i32 %n) #0 {
+define i32 @i32_umax_reduction(ptr nocapture readonly %x, i32 %n) #0 {
 ; CHECK-LABEL: @i32_umax_reduction(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0
@@ -849,37 +824,36 @@ define i32 @i32_umax_reduction(i32* nocapture readonly %x, i32 %n) #0 {
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[X:%.*]], i32 [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = icmp ugt <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]]
-; CHECK-NEXT:    [[TMP5]] = select <4 x i1> [[TMP4]], <4 x i32> [[VEC_PHI]], <4 x i32> [[WIDE_LOAD]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[X:%.*]], i32 [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ugt <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]]
+; CHECK-NEXT:    [[TMP4]] = select <4 x i1> [[TMP3]], <4 x i32> [[VEC_PHI]], <4 x i32> [[WIDE_LOAD]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP7:%.*]] = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> [[TMP5]])
+; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> [[TMP4]])
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; CHECK-NEXT:    [[R_07:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[X]], i32 [[I_08]]
-; CHECK-NEXT:    [[TMP8:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[C:%.*]] = icmp ugt i32 [[R_07]], [[TMP8]]
-; CHECK-NEXT:    [[ADD]] = select i1 [[C]], i32 [[R_07]], i32 [[TMP8]]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[X]], i32 [[I_08]]
+; CHECK-NEXT:    [[TMP7:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[C:%.*]] = icmp ugt i32 [[R_07]], [[TMP7]]
+; CHECK-NEXT:    [[ADD]] = select i1 [[C]], i32 [[R_07]], i32 [[TMP7]]
 ; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[I_08]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
 ; CHECK:       for.cond.cleanup.loopexit:
-; CHECK-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
 ; CHECK:       for.cond.cleanup:
 ; CHECK-NEXT:    [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
@@ -892,8 +866,8 @@ entry:
 for.body:                                         ; preds = %entry, %for.body
   %i.08 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
   %r.07 = phi i32 [ %add, %for.body ], [ 0, %entry ]
-  %arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08
-  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx = getelementptr inbounds i32, ptr %x, i32 %i.08
+  %0 = load i32, ptr %arrayidx, align 4
   %c = icmp ugt i32 %r.07, %0
   %add = select i1 %c, i32 %r.07, i32 %0
   %inc = add nuw nsw i32 %i.08, 1

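The hunk above and most of those that follow show the same mechanical rewrite: with typed pointers, every wide memory access first had to bitcast the element pointer to a vector pointer, while with opaque pointers the access takes the ptr directly. The bitcast therefore disappears, and rerunning update_test_checks.py renumbers every subsequent [[TMP...]] value down by one. A minimal before/after sketch of the pattern, using hypothetical value names rather than lines taken from any single test:

  ; Typed pointers: the wide load needs a vector-typed pointer.
  %gep.old  = getelementptr inbounds i32, i32* %x, i32 %i
  %vecp     = bitcast i32* %gep.old to <4 x i32>*
  %wide.old = load <4 x i32>, <4 x i32>* %vecp, align 4

  ; Opaque pointers: the load consumes the ptr directly.
  %gep  = getelementptr inbounds i32, ptr %x, i32 %i
  %wide = load <4 x i32>, ptr %gep, align 4
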
diff  --git a/llvm/test/Transforms/LoopVectorize/PowerPC/vectorize-bswap.ll b/llvm/test/Transforms/LoopVectorize/PowerPC/vectorize-bswap.ll
index 7bb804eb73e9..b3157b75bc26 100644
--- a/llvm/test/Transforms/LoopVectorize/PowerPC/vectorize-bswap.ll
+++ b/llvm/test/Transforms/LoopVectorize/PowerPC/vectorize-bswap.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -S -passes=loop-vectorize -mtriple=powerpc64le-unknown-unknown \
 ; RUN:   -force-target-max-vector-interleave=1 -mcpu=pwr9 < %s | FileCheck %s
-define dso_local void @test(i32* %Arr, i32 signext %Len) {
+define dso_local void @test(ptr %Arr, i32 signext %Len) {
 ; CHECK-LABEL: @test(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[CMP1:%.*]] = icmp slt i32 0, [[LEN:%.*]]
@@ -17,19 +17,17 @@ define dso_local void @test(i32* %Arr, i32 signext %Len) {
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = sext i32 [[TMP0]] to i64
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[ARR:%.*]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4
-; CHECK-NEXT:    [[TMP5:%.*]] = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> [[WIDE_LOAD]])
-; CHECK-NEXT:    [[TMP6:%.*]] = sext i32 [[TMP0]] to i64
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[ARR]], i64 [[TMP6]]
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP7]], i32 0
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>*
-; CHECK-NEXT:    store <4 x i32> [[TMP5]], <4 x i32>* [[TMP9]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[ARR:%.*]], i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP3]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> [[WIDE_LOAD]])
+; CHECK-NEXT:    [[TMP5:%.*]] = sext i32 [[TMP0]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0
+; CHECK-NEXT:    store <4 x i32> [[TMP4]], ptr [[TMP7]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[LEN]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_FOR_COND_CLEANUP_CRIT_EDGE:%.*]], label [[SCALAR_PH]]
@@ -43,12 +41,12 @@ define dso_local void @test(i32* %Arr, i32 signext %Len) {
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[I_02:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_INC:%.*]] ]
 ; CHECK-NEXT:    [[IDXPROM:%.*]] = sext i32 [[I_02]] to i64
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[ARR]], i64 [[IDXPROM]]
-; CHECK-NEXT:    [[TMP11:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[TMP12:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[IDXPROM]]
+; CHECK-NEXT:    [[TMP9:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[TMP10:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP9]])
 ; CHECK-NEXT:    [[IDXPROM1:%.*]] = sext i32 [[I_02]] to i64
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[ARR]], i64 [[IDXPROM1]]
-; CHECK-NEXT:    store i32 [[TMP12]], i32* [[ARRAYIDX2]], align 4
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[IDXPROM1]]
+; CHECK-NEXT:    store i32 [[TMP10]], ptr [[ARRAYIDX2]], align 4
 ; CHECK-NEXT:    br label [[FOR_INC]]
 ; CHECK:       for.inc:
 ; CHECK-NEXT:    [[INC]] = add nsw i32 [[I_02]], 1
@@ -73,12 +71,12 @@ for.cond.cleanup:                                 ; preds = %for.cond.for.cond.c
 for.body:                                         ; preds = %for.body.lr.ph, %for.inc
   %i.02 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.inc ]
   %idxprom = sext i32 %i.02 to i64
-  %arrayidx = getelementptr inbounds i32, i32* %Arr, i64 %idxprom
-  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx = getelementptr inbounds i32, ptr %Arr, i64 %idxprom
+  %0 = load i32, ptr %arrayidx, align 4
   %1 = call i32 @llvm.bswap.i32(i32 %0)
   %idxprom1 = sext i32 %i.02 to i64
-  %arrayidx2 = getelementptr inbounds i32, i32* %Arr, i64 %idxprom1
-  store i32 %1, i32* %arrayidx2, align 4
+  %arrayidx2 = getelementptr inbounds i32, ptr %Arr, i64 %idxprom1
+  store i32 %1, ptr %arrayidx2, align 4
   br label %for.inc
 
 for.inc:                                          ; preds = %for.body

diff  --git a/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll
index 841d7c7dbbdc..7fc51482acf4 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll
@@ -6,7 +6,7 @@
 target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n64-S128"
 target triple = "riscv64"
 
-define i32 @add_i16_i32(i16* nocapture readonly %x, i32 %n) {
+define i32 @add_i16_i32(ptr nocapture readonly %x, i32 %n) {
 ; OUTLOOP-LABEL: @add_i16_i32(
 ; OUTLOOP-NEXT:  entry:
 ; OUTLOOP-NEXT:    [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0
@@ -24,54 +24,52 @@ define i32 @add_i16_i32(i16* nocapture readonly %x, i32 %n) {
 ; OUTLOOP-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; OUTLOOP:       vector.body:
 ; OUTLOOP-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; OUTLOOP-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP20:%.*]], [[VECTOR_BODY]] ]
-; OUTLOOP-NEXT:    [[VEC_PHI1:%.*]] = phi <vscale x 2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ]
+; OUTLOOP-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ]
+; OUTLOOP-NEXT:    [[VEC_PHI1:%.*]] = phi <vscale x 2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
 ; OUTLOOP-NEXT:    [[TMP4:%.*]] = add i32 [[INDEX]], 0
 ; OUTLOOP-NEXT:    [[TMP5:%.*]] = call i32 @llvm.vscale.i32()
 ; OUTLOOP-NEXT:    [[TMP6:%.*]] = mul i32 [[TMP5]], 2
 ; OUTLOOP-NEXT:    [[TMP7:%.*]] = add i32 [[TMP6]], 0
 ; OUTLOOP-NEXT:    [[TMP8:%.*]] = mul i32 [[TMP7]], 1
 ; OUTLOOP-NEXT:    [[TMP9:%.*]] = add i32 [[INDEX]], [[TMP8]]
-; OUTLOOP-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i16, i16* [[X:%.*]], i32 [[TMP4]]
-; OUTLOOP-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i16, i16* [[X]], i32 [[TMP9]]
-; OUTLOOP-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i16, i16* [[TMP10]], i32 0
-; OUTLOOP-NEXT:    [[TMP13:%.*]] = bitcast i16* [[TMP12]] to <vscale x 2 x i16>*
-; OUTLOOP-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 2 x i16>, <vscale x 2 x i16>* [[TMP13]], align 2
-; OUTLOOP-NEXT:    [[TMP14:%.*]] = call i32 @llvm.vscale.i32()
-; OUTLOOP-NEXT:    [[TMP15:%.*]] = mul i32 [[TMP14]], 2
-; OUTLOOP-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i16, i16* [[TMP10]], i32 [[TMP15]]
-; OUTLOOP-NEXT:    [[TMP17:%.*]] = bitcast i16* [[TMP16]] to <vscale x 2 x i16>*
-; OUTLOOP-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 2 x i16>, <vscale x 2 x i16>* [[TMP17]], align 2
-; OUTLOOP-NEXT:    [[TMP18:%.*]] = sext <vscale x 2 x i16> [[WIDE_LOAD]] to <vscale x 2 x i32>
-; OUTLOOP-NEXT:    [[TMP19:%.*]] = sext <vscale x 2 x i16> [[WIDE_LOAD2]] to <vscale x 2 x i32>
-; OUTLOOP-NEXT:    [[TMP20]] = add <vscale x 2 x i32> [[VEC_PHI]], [[TMP18]]
-; OUTLOOP-NEXT:    [[TMP21]] = add <vscale x 2 x i32> [[VEC_PHI1]], [[TMP19]]
-; OUTLOOP-NEXT:    [[TMP22:%.*]] = call i32 @llvm.vscale.i32()
-; OUTLOOP-NEXT:    [[TMP23:%.*]] = mul i32 [[TMP22]], 4
-; OUTLOOP-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP23]]
-; OUTLOOP-NEXT:    [[TMP24:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; OUTLOOP-NEXT:    br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; OUTLOOP-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i32 [[TMP4]]
+; OUTLOOP-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i16, ptr [[X]], i32 [[TMP9]]
+; OUTLOOP-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i16, ptr [[TMP10]], i32 0
+; OUTLOOP-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 2 x i16>, ptr [[TMP12]], align 2
+; OUTLOOP-NEXT:    [[TMP13:%.*]] = call i32 @llvm.vscale.i32()
+; OUTLOOP-NEXT:    [[TMP14:%.*]] = mul i32 [[TMP13]], 2
+; OUTLOOP-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i16, ptr [[TMP10]], i32 [[TMP14]]
+; OUTLOOP-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 2 x i16>, ptr [[TMP15]], align 2
+; OUTLOOP-NEXT:    [[TMP16:%.*]] = sext <vscale x 2 x i16> [[WIDE_LOAD]] to <vscale x 2 x i32>
+; OUTLOOP-NEXT:    [[TMP17:%.*]] = sext <vscale x 2 x i16> [[WIDE_LOAD2]] to <vscale x 2 x i32>
+; OUTLOOP-NEXT:    [[TMP18]] = add <vscale x 2 x i32> [[VEC_PHI]], [[TMP16]]
+; OUTLOOP-NEXT:    [[TMP19]] = add <vscale x 2 x i32> [[VEC_PHI1]], [[TMP17]]
+; OUTLOOP-NEXT:    [[TMP20:%.*]] = call i32 @llvm.vscale.i32()
+; OUTLOOP-NEXT:    [[TMP21:%.*]] = mul i32 [[TMP20]], 4
+; OUTLOOP-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP21]]
+; OUTLOOP-NEXT:    [[TMP22:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; OUTLOOP-NEXT:    br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; OUTLOOP:       middle.block:
-; OUTLOOP-NEXT:    [[BIN_RDX:%.*]] = add <vscale x 2 x i32> [[TMP21]], [[TMP20]]
-; OUTLOOP-NEXT:    [[TMP25:%.*]] = call i32 @llvm.vector.reduce.add.nxv2i32(<vscale x 2 x i32> [[BIN_RDX]])
+; OUTLOOP-NEXT:    [[BIN_RDX:%.*]] = add <vscale x 2 x i32> [[TMP19]], [[TMP18]]
+; OUTLOOP-NEXT:    [[TMP23:%.*]] = call i32 @llvm.vector.reduce.add.nxv2i32(<vscale x 2 x i32> [[BIN_RDX]])
 ; OUTLOOP-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]]
 ; OUTLOOP-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; OUTLOOP:       scalar.ph:
 ; OUTLOOP-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; OUTLOOP-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[TMP25]], [[MIDDLE_BLOCK]] ]
+; OUTLOOP-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ]
 ; OUTLOOP-NEXT:    br label [[FOR_BODY:%.*]]
 ; OUTLOOP:       for.body:
 ; OUTLOOP-NEXT:    [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; OUTLOOP-NEXT:    [[R_07:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; OUTLOOP-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[X]], i32 [[I_08]]
-; OUTLOOP-NEXT:    [[TMP26:%.*]] = load i16, i16* [[ARRAYIDX]], align 2
-; OUTLOOP-NEXT:    [[CONV:%.*]] = sext i16 [[TMP26]] to i32
+; OUTLOOP-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[X]], i32 [[I_08]]
+; OUTLOOP-NEXT:    [[TMP24:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
+; OUTLOOP-NEXT:    [[CONV:%.*]] = sext i16 [[TMP24]] to i32
 ; OUTLOOP-NEXT:    [[ADD]] = add nsw i32 [[R_07]], [[CONV]]
 ; OUTLOOP-NEXT:    [[INC]] = add nuw nsw i32 [[I_08]], 1
 ; OUTLOOP-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
 ; OUTLOOP-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
 ; OUTLOOP:       for.cond.cleanup.loopexit:
-; OUTLOOP-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP25]], [[MIDDLE_BLOCK]] ]
+; OUTLOOP-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ]
 ; OUTLOOP-NEXT:    br label [[FOR_COND_CLEANUP]]
 ; OUTLOOP:       for.cond.cleanup:
 ; OUTLOOP-NEXT:    [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
@@ -94,37 +92,35 @@ define i32 @add_i16_i32(i16* nocapture readonly %x, i32 %n) {
 ; INLOOP-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; INLOOP:       vector.body:
 ; INLOOP-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; INLOOP-NEXT:    [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ]
-; INLOOP-NEXT:    [[VEC_PHI1:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[VECTOR_BODY]] ]
+; INLOOP-NEXT:    [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
+; INLOOP-NEXT:    [[VEC_PHI1:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ]
 ; INLOOP-NEXT:    [[TMP4:%.*]] = add i32 [[INDEX]], 0
 ; INLOOP-NEXT:    [[TMP5:%.*]] = call i32 @llvm.vscale.i32()
 ; INLOOP-NEXT:    [[TMP6:%.*]] = mul i32 [[TMP5]], 4
 ; INLOOP-NEXT:    [[TMP7:%.*]] = add i32 [[TMP6]], 0
 ; INLOOP-NEXT:    [[TMP8:%.*]] = mul i32 [[TMP7]], 1
 ; INLOOP-NEXT:    [[TMP9:%.*]] = add i32 [[INDEX]], [[TMP8]]
-; INLOOP-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i16, i16* [[X:%.*]], i32 [[TMP4]]
-; INLOOP-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i16, i16* [[X]], i32 [[TMP9]]
-; INLOOP-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i16, i16* [[TMP10]], i32 0
-; INLOOP-NEXT:    [[TMP13:%.*]] = bitcast i16* [[TMP12]] to <vscale x 4 x i16>*
-; INLOOP-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i16>, <vscale x 4 x i16>* [[TMP13]], align 2
-; INLOOP-NEXT:    [[TMP14:%.*]] = call i32 @llvm.vscale.i32()
-; INLOOP-NEXT:    [[TMP15:%.*]] = mul i32 [[TMP14]], 4
-; INLOOP-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i16, i16* [[TMP10]], i32 [[TMP15]]
-; INLOOP-NEXT:    [[TMP17:%.*]] = bitcast i16* [[TMP16]] to <vscale x 4 x i16>*
-; INLOOP-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 4 x i16>, <vscale x 4 x i16>* [[TMP17]], align 2
-; INLOOP-NEXT:    [[TMP18:%.*]] = sext <vscale x 4 x i16> [[WIDE_LOAD]] to <vscale x 4 x i32>
-; INLOOP-NEXT:    [[TMP19:%.*]] = sext <vscale x 4 x i16> [[WIDE_LOAD2]] to <vscale x 4 x i32>
-; INLOOP-NEXT:    [[TMP20:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP18]])
-; INLOOP-NEXT:    [[TMP21]] = add i32 [[TMP20]], [[VEC_PHI]]
-; INLOOP-NEXT:    [[TMP22:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP19]])
-; INLOOP-NEXT:    [[TMP23]] = add i32 [[TMP22]], [[VEC_PHI1]]
-; INLOOP-NEXT:    [[TMP24:%.*]] = call i32 @llvm.vscale.i32()
-; INLOOP-NEXT:    [[TMP25:%.*]] = mul i32 [[TMP24]], 8
-; INLOOP-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP25]]
-; INLOOP-NEXT:    [[TMP26:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; INLOOP-NEXT:    br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; INLOOP-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i32 [[TMP4]]
+; INLOOP-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i16, ptr [[X]], i32 [[TMP9]]
+; INLOOP-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i16, ptr [[TMP10]], i32 0
+; INLOOP-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i16>, ptr [[TMP12]], align 2
+; INLOOP-NEXT:    [[TMP13:%.*]] = call i32 @llvm.vscale.i32()
+; INLOOP-NEXT:    [[TMP14:%.*]] = mul i32 [[TMP13]], 4
+; INLOOP-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i16, ptr [[TMP10]], i32 [[TMP14]]
+; INLOOP-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 4 x i16>, ptr [[TMP15]], align 2
+; INLOOP-NEXT:    [[TMP16:%.*]] = sext <vscale x 4 x i16> [[WIDE_LOAD]] to <vscale x 4 x i32>
+; INLOOP-NEXT:    [[TMP17:%.*]] = sext <vscale x 4 x i16> [[WIDE_LOAD2]] to <vscale x 4 x i32>
+; INLOOP-NEXT:    [[TMP18:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP16]])
+; INLOOP-NEXT:    [[TMP19]] = add i32 [[TMP18]], [[VEC_PHI]]
+; INLOOP-NEXT:    [[TMP20:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP17]])
+; INLOOP-NEXT:    [[TMP21]] = add i32 [[TMP20]], [[VEC_PHI1]]
+; INLOOP-NEXT:    [[TMP22:%.*]] = call i32 @llvm.vscale.i32()
+; INLOOP-NEXT:    [[TMP23:%.*]] = mul i32 [[TMP22]], 8
+; INLOOP-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP23]]
+; INLOOP-NEXT:    [[TMP24:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; INLOOP-NEXT:    br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; INLOOP:       middle.block:
-; INLOOP-NEXT:    [[BIN_RDX:%.*]] = add i32 [[TMP23]], [[TMP21]]
+; INLOOP-NEXT:    [[BIN_RDX:%.*]] = add i32 [[TMP21]], [[TMP19]]
 ; INLOOP-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]]
 ; INLOOP-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; INLOOP:       scalar.ph:
@@ -134,9 +130,9 @@ define i32 @add_i16_i32(i16* nocapture readonly %x, i32 %n) {
 ; INLOOP:       for.body:
 ; INLOOP-NEXT:    [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; INLOOP-NEXT:    [[R_07:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; INLOOP-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[X]], i32 [[I_08]]
-; INLOOP-NEXT:    [[TMP27:%.*]] = load i16, i16* [[ARRAYIDX]], align 2
-; INLOOP-NEXT:    [[CONV:%.*]] = sext i16 [[TMP27]] to i32
+; INLOOP-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[X]], i32 [[I_08]]
+; INLOOP-NEXT:    [[TMP25:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
+; INLOOP-NEXT:    [[CONV:%.*]] = sext i16 [[TMP25]] to i32
 ; INLOOP-NEXT:    [[ADD]] = add nsw i32 [[R_07]], [[CONV]]
 ; INLOOP-NEXT:    [[INC]] = add nuw nsw i32 [[I_08]], 1
 ; INLOOP-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
@@ -155,8 +151,8 @@ entry:
 for.body:                                         ; preds = %entry, %for.body
   %i.08 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
   %r.07 = phi i32 [ %add, %for.body ], [ 0, %entry ]
-  %arrayidx = getelementptr inbounds i16, i16* %x, i32 %i.08
-  %0 = load i16, i16* %arrayidx, align 2
+  %arrayidx = getelementptr inbounds i16, ptr %x, i32 %i.08
+  %0 = load i16, ptr %arrayidx, align 2
   %conv = sext i16 %0 to i32
   %add = add nsw i32 %r.07, %conv
   %inc = add nuw nsw i32 %i.08, 1

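For context on the test above: the OUTLOOP and INLOOP prefixes check the two reduction strategies the vectorizer can choose. Out-of-loop accumulates into a vector phi and performs a single llvm.vector.reduce.add in middle.block; in-loop reduces each vector of sign-extended loads to a scalar inside vector.body. The pointer changes are identical in both; a rough sketch of the two shapes, again with hypothetical value names:

  ; OUTLOOP: accumulate in a vector, reduce once after the loop.
  %vacc.next = add <vscale x 2 x i32> %vacc, %vals
  ; ...then, in middle.block:
  %sum = call i32 @llvm.vector.reduce.add.nxv2i32(<vscale x 2 x i32> %vacc.next)

  ; INLOOP: reduce every iteration, accumulate in a scalar.
  %part = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> %vals4)
  %sacc.next = add i32 %part, %sacc
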
diff  --git a/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll b/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll
index 0c914d670f55..1046f4cda4cd 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll
@@ -3,7 +3,7 @@
 
 target triple = "riscv64"
 
-define void @trip5_i8(i8* noalias nocapture noundef %dst, i8* noalias nocapture noundef readonly %src) #0 {
+define void @trip5_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture noundef readonly %src) #0 {
 ; CHECK-LABEL: @trip5_i8(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
@@ -24,21 +24,18 @@ define void @trip5_i8(i8* noalias nocapture noundef %dst, i8* noalias nocapture
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP8:%.*]] = add i64 [[INDEX]], 0
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP8]], i64 5)
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, i8* [[SRC:%.*]], i64 [[TMP8]]
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, i8* [[TMP9]], i32 0
-; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i8* [[TMP10]] to <vscale x 8 x i8>*
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8.p0nxv8i8(<vscale x 8 x i8>* [[TMP11]], i32 1, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x i8> poison)
-; CHECK-NEXT:    [[TMP12:%.*]] = shl <vscale x 8 x i8> [[WIDE_MASKED_LOAD]], shufflevector (<vscale x 8 x i8> insertelement (<vscale x 8 x i8> poison, i8 1, i32 0), <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer)
-; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i8, i8* [[DST:%.*]], i64 [[TMP8]]
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, i8* [[TMP13]], i32 0
-; CHECK-NEXT:    [[TMP15:%.*]] = bitcast i8* [[TMP14]] to <vscale x 8 x i8>*
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8.p0nxv8i8(<vscale x 8 x i8>* [[TMP15]], i32 1, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x i8> poison)
-; CHECK-NEXT:    [[TMP16:%.*]] = add <vscale x 8 x i8> [[TMP12]], [[WIDE_MASKED_LOAD1]]
-; CHECK-NEXT:    [[TMP17:%.*]] = bitcast i8* [[TMP14]] to <vscale x 8 x i8>*
-; CHECK-NEXT:    call void @llvm.masked.store.nxv8i8.p0nxv8i8(<vscale x 8 x i8> [[TMP16]], <vscale x 8 x i8>* [[TMP17]], i32 1, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT:    [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP19:%.*]] = mul i64 [[TMP18]], 8
-; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP19]]
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i32 0
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8.p0(ptr [[TMP10]], i32 1, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x i8> poison)
+; CHECK-NEXT:    [[TMP11:%.*]] = shl <vscale x 8 x i8> [[WIDE_MASKED_LOAD]], shufflevector (<vscale x 8 x i8> insertelement (<vscale x 8 x i8> poison, i8 1, i32 0), <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[TMP8]]
+; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[TMP12]], i32 0
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8.p0(ptr [[TMP13]], i32 1, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x i8> poison)
+; CHECK-NEXT:    [[TMP14:%.*]] = add <vscale x 8 x i8> [[TMP11]], [[WIDE_MASKED_LOAD1]]
+; CHECK-NEXT:    call void @llvm.masked.store.nxv8i8.p0(<vscale x 8 x i8> [[TMP14]], ptr [[TMP13]], i32 1, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT:    [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP16:%.*]] = mul i64 [[TMP15]], 8
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP16]]
 ; CHECK-NEXT:    br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -47,13 +44,13 @@ define void @trip5_i8(i8* noalias nocapture noundef %dst, i8* noalias nocapture
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[I_08:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[SRC]], i64 [[I_08]]
-; CHECK-NEXT:    [[TMP20:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
-; CHECK-NEXT:    [[MUL:%.*]] = shl i8 [[TMP20]], 1
-; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, i8* [[DST]], i64 [[I_08]]
-; CHECK-NEXT:    [[TMP21:%.*]] = load i8, i8* [[ARRAYIDX1]], align 1
-; CHECK-NEXT:    [[ADD:%.*]] = add i8 [[MUL]], [[TMP21]]
-; CHECK-NEXT:    store i8 [[ADD]], i8* [[ARRAYIDX1]], align 1
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[I_08]]
+; CHECK-NEXT:    [[TMP17:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[MUL:%.*]] = shl i8 [[TMP17]], 1
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[I_08]]
+; CHECK-NEXT:    [[TMP18:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
+; CHECK-NEXT:    [[ADD:%.*]] = add i8 [[MUL]], [[TMP18]]
+; CHECK-NEXT:    store i8 [[ADD]], ptr [[ARRAYIDX1]], align 1
 ; CHECK-NEXT:    [[INC]] = add nuw nsw i64 [[I_08]], 1
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 5
 ; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
@@ -65,13 +62,13 @@ entry:
 
 for.body:                                         ; preds = %entry, %for.body
   %i.08 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
-  %arrayidx = getelementptr inbounds i8, i8* %src, i64 %i.08
-  %0 = load i8, i8* %arrayidx, align 1
+  %arrayidx = getelementptr inbounds i8, ptr %src, i64 %i.08
+  %0 = load i8, ptr %arrayidx, align 1
   %mul = shl i8 %0, 1
-  %arrayidx1 = getelementptr inbounds i8, i8* %dst, i64 %i.08
-  %1 = load i8, i8* %arrayidx1, align 1
+  %arrayidx1 = getelementptr inbounds i8, ptr %dst, i64 %i.08
+  %1 = load i8, ptr %arrayidx1, align 1
   %add = add i8 %mul, %1
-  store i8 %add, i8* %arrayidx1, align 1
+  store i8 %add, ptr %arrayidx1, align 1
   %inc = add nuw nsw i64 %i.08, 1
   %exitcond.not = icmp eq i64 %inc, 5
   br i1 %exitcond.not, label %for.end, label %for.body

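One wrinkle visible in the low-trip-count test above: the masked load/store intrinsic names are re-mangled. With typed pointers the overloaded pointer operand contributed its pointee type to the mangling suffix (.p0nxv8i8); an opaque ptr carries only an address space, so the suffix collapses to .p0. Reconstructed declarations for the two forms (the alignment, mask, and passthrough operands are unchanged):

  declare <vscale x 8 x i8> @llvm.masked.load.nxv8i8.p0nxv8i8(<vscale x 8 x i8>*, i32 immarg, <vscale x 8 x i1>, <vscale x 8 x i8>)
  declare <vscale x 8 x i8> @llvm.masked.load.nxv8i8.p0(ptr, i32 immarg, <vscale x 8 x i1>, <vscale x 8 x i8>)
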
diff  --git a/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll b/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll
index 24f75e0a9311..4df17b95d849 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll
@@ -4,7 +4,7 @@
 
 ; The source code:
 ;
-;void foo4(double *A, double *B, int *trigger) {
+;void foo4(ptr A, ptr B, int *trigger) {
 ;
 ;  for (int i=0; i<10000; i += 16) {
 ;    if (trigger[i] < 100) {
@@ -13,30 +13,24 @@
 ;  }
 ;}
 
-define void @foo4(double* nocapture %A, double* nocapture readonly %B, i32* nocapture readonly %trigger) local_unnamed_addr #0 {
+define void @foo4(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture readonly %trigger) local_unnamed_addr #0 {
 ; RV32-LABEL: @foo4(
 ; RV32-NEXT:  entry:
-; RV32-NEXT:    [[A1:%.*]] = bitcast double* [[A:%.*]] to i8*
-; RV32-NEXT:    [[TRIGGER3:%.*]] = bitcast i32* [[TRIGGER:%.*]] to i8*
-; RV32-NEXT:    [[B6:%.*]] = bitcast double* [[B:%.*]] to i8*
 ; RV32-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; RV32-NEXT:    [[TMP1:%.*]] = call i64 @llvm.umax.i64(i64 12, i64 [[TMP0]])
 ; RV32-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 625, [[TMP1]]
 ; RV32-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
 ; RV32:       vector.memcheck:
-; RV32-NEXT:    [[SCEVGEP:%.*]] = getelementptr double, double* [[A]], i64 9985
-; RV32-NEXT:    [[SCEVGEP2:%.*]] = bitcast double* [[SCEVGEP]] to i8*
-; RV32-NEXT:    [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[TRIGGER]], i64 9985
-; RV32-NEXT:    [[SCEVGEP45:%.*]] = bitcast i32* [[SCEVGEP4]] to i8*
-; RV32-NEXT:    [[SCEVGEP7:%.*]] = getelementptr double, double* [[B]], i64 19969
-; RV32-NEXT:    [[SCEVGEP78:%.*]] = bitcast double* [[SCEVGEP7]] to i8*
-; RV32-NEXT:    [[BOUND0:%.*]] = icmp ult i8* [[A1]], [[SCEVGEP45]]
-; RV32-NEXT:    [[BOUND1:%.*]] = icmp ult i8* [[TRIGGER3]], [[SCEVGEP2]]
+; RV32-NEXT:    [[UGLYGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 79880
+; RV32-NEXT:    [[UGLYGEP1:%.*]] = getelementptr i8, ptr [[TRIGGER:%.*]], i64 39940
+; RV32-NEXT:    [[UGLYGEP2:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 159752
+; RV32-NEXT:    [[BOUND0:%.*]] = icmp ult ptr [[A]], [[UGLYGEP1]]
+; RV32-NEXT:    [[BOUND1:%.*]] = icmp ult ptr [[TRIGGER]], [[UGLYGEP]]
 ; RV32-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
-; RV32-NEXT:    [[BOUND09:%.*]] = icmp ult i8* [[A1]], [[SCEVGEP78]]
-; RV32-NEXT:    [[BOUND110:%.*]] = icmp ult i8* [[B6]], [[SCEVGEP2]]
-; RV32-NEXT:    [[FOUND_CONFLICT11:%.*]] = and i1 [[BOUND09]], [[BOUND110]]
-; RV32-NEXT:    [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT11]]
+; RV32-NEXT:    [[BOUND03:%.*]] = icmp ult ptr [[A]], [[UGLYGEP2]]
+; RV32-NEXT:    [[BOUND14:%.*]] = icmp ult ptr [[B]], [[UGLYGEP]]
+; RV32-NEXT:    [[FOUND_CONFLICT5:%.*]] = and i1 [[BOUND03]], [[BOUND14]]
+; RV32-NEXT:    [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]]
 ; RV32-NEXT:    br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
 ; RV32:       vector.ph:
 ; RV32-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
@@ -55,16 +49,16 @@ define void @foo4(double* nocapture %A, double* nocapture readonly %B, i32* noca
 ; RV32:       vector.body:
 ; RV32-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; RV32-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 1 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; RV32-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <vscale x 1 x i64> [[VEC_IND]]
-; RV32-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 1 x i32> @llvm.masked.gather.nxv1i32.nxv1p0i32(<vscale x 1 x i32*> [[TMP8]], i32 4, <vscale x 1 x i1> shufflevector (<vscale x 1 x i1> insertelement (<vscale x 1 x i1> poison, i1 true, i32 0), <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i32> poison), !alias.scope !0
+; RV32-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], <vscale x 1 x i64> [[VEC_IND]]
+; RV32-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 1 x i32> @llvm.masked.gather.nxv1i32.nxv1p0(<vscale x 1 x ptr> [[TMP8]], i32 4, <vscale x 1 x i1> shufflevector (<vscale x 1 x i1> insertelement (<vscale x 1 x i1> poison, i1 true, i32 0), <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i32> poison), !alias.scope !0
 ; RV32-NEXT:    [[TMP9:%.*]] = icmp slt <vscale x 1 x i32> [[WIDE_MASKED_GATHER]], shufflevector (<vscale x 1 x i32> insertelement (<vscale x 1 x i32> poison, i32 100, i32 0), <vscale x 1 x i32> poison, <vscale x 1 x i32> zeroinitializer)
 ; RV32-NEXT:    [[TMP10:%.*]] = shl nuw nsw <vscale x 1 x i64> [[VEC_IND]], shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 1, i32 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer)
-; RV32-NEXT:    [[TMP11:%.*]] = getelementptr inbounds double, double* [[B]], <vscale x 1 x i64> [[TMP10]]
-; RV32-NEXT:    [[WIDE_MASKED_GATHER12:%.*]] = call <vscale x 1 x double> @llvm.masked.gather.nxv1f64.nxv1p0f64(<vscale x 1 x double*> [[TMP11]], i32 8, <vscale x 1 x i1> [[TMP9]], <vscale x 1 x double> poison), !alias.scope !3
+; RV32-NEXT:    [[TMP11:%.*]] = getelementptr inbounds double, ptr [[B]], <vscale x 1 x i64> [[TMP10]]
+; RV32-NEXT:    [[WIDE_MASKED_GATHER6:%.*]] = call <vscale x 1 x double> @llvm.masked.gather.nxv1f64.nxv1p0(<vscale x 1 x ptr> [[TMP11]], i32 8, <vscale x 1 x i1> [[TMP9]], <vscale x 1 x double> poison), !alias.scope !3
 ; RV32-NEXT:    [[TMP12:%.*]] = sitofp <vscale x 1 x i32> [[WIDE_MASKED_GATHER]] to <vscale x 1 x double>
-; RV32-NEXT:    [[TMP13:%.*]] = fadd <vscale x 1 x double> [[WIDE_MASKED_GATHER12]], [[TMP12]]
-; RV32-NEXT:    [[TMP14:%.*]] = getelementptr inbounds double, double* [[A]], <vscale x 1 x i64> [[VEC_IND]]
-; RV32-NEXT:    call void @llvm.masked.scatter.nxv1f64.nxv1p0f64(<vscale x 1 x double> [[TMP13]], <vscale x 1 x double*> [[TMP14]], i32 8, <vscale x 1 x i1> [[TMP9]]), !alias.scope !5, !noalias !7
+; RV32-NEXT:    [[TMP13:%.*]] = fadd <vscale x 1 x double> [[WIDE_MASKED_GATHER6]], [[TMP12]]
+; RV32-NEXT:    [[TMP14:%.*]] = getelementptr inbounds double, ptr [[A]], <vscale x 1 x i64> [[VEC_IND]]
+; RV32-NEXT:    call void @llvm.masked.scatter.nxv1f64.nxv1p0(<vscale x 1 x double> [[TMP13]], <vscale x 1 x ptr> [[TMP14]], i32 8, <vscale x 1 x i1> [[TMP9]]), !alias.scope !5, !noalias !7
 ; RV32-NEXT:    [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
 ; RV32-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP15]]
 ; RV32-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 1 x i64> [[VEC_IND]], [[DOTSPLAT]]
@@ -78,18 +72,18 @@ define void @foo4(double* nocapture %A, double* nocapture readonly %B, i32* noca
 ; RV32-NEXT:    br label [[FOR_BODY:%.*]]
 ; RV32:       for.body:
 ; RV32-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
-; RV32-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]]
-; RV32-NEXT:    [[TMP17:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; RV32-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[INDVARS_IV]]
+; RV32-NEXT:    [[TMP17:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
 ; RV32-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP17]], 100
 ; RV32-NEXT:    br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
 ; RV32:       if.then:
 ; RV32-NEXT:    [[TMP18:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 1
-; RV32-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[TMP18]]
-; RV32-NEXT:    [[TMP19:%.*]] = load double, double* [[ARRAYIDX3]], align 8
+; RV32-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[TMP18]]
+; RV32-NEXT:    [[TMP19:%.*]] = load double, ptr [[ARRAYIDX3]], align 8
 ; RV32-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP17]] to double
 ; RV32-NEXT:    [[ADD:%.*]] = fadd double [[TMP19]], [[CONV]]
-; RV32-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDVARS_IV]]
-; RV32-NEXT:    store double [[ADD]], double* [[ARRAYIDX7]], align 8
+; RV32-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[INDVARS_IV]]
+; RV32-NEXT:    store double [[ADD]], ptr [[ARRAYIDX7]], align 8
 ; RV32-NEXT:    br label [[FOR_INC]]
 ; RV32:       for.inc:
 ; RV32-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 16
@@ -100,27 +94,21 @@ define void @foo4(double* nocapture %A, double* nocapture readonly %B, i32* noca
 ;
 ; RV64-LABEL: @foo4(
 ; RV64-NEXT:  entry:
-; RV64-NEXT:    [[A1:%.*]] = bitcast double* [[A:%.*]] to i8*
-; RV64-NEXT:    [[TRIGGER3:%.*]] = bitcast i32* [[TRIGGER:%.*]] to i8*
-; RV64-NEXT:    [[B6:%.*]] = bitcast double* [[B:%.*]] to i8*
 ; RV64-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; RV64-NEXT:    [[TMP1:%.*]] = call i64 @llvm.umax.i64(i64 12, i64 [[TMP0]])
 ; RV64-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 625, [[TMP1]]
 ; RV64-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
 ; RV64:       vector.memcheck:
-; RV64-NEXT:    [[SCEVGEP:%.*]] = getelementptr double, double* [[A]], i64 9985
-; RV64-NEXT:    [[SCEVGEP2:%.*]] = bitcast double* [[SCEVGEP]] to i8*
-; RV64-NEXT:    [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[TRIGGER]], i64 9985
-; RV64-NEXT:    [[SCEVGEP45:%.*]] = bitcast i32* [[SCEVGEP4]] to i8*
-; RV64-NEXT:    [[SCEVGEP7:%.*]] = getelementptr double, double* [[B]], i64 19969
-; RV64-NEXT:    [[SCEVGEP78:%.*]] = bitcast double* [[SCEVGEP7]] to i8*
-; RV64-NEXT:    [[BOUND0:%.*]] = icmp ult i8* [[A1]], [[SCEVGEP45]]
-; RV64-NEXT:    [[BOUND1:%.*]] = icmp ult i8* [[TRIGGER3]], [[SCEVGEP2]]
+; RV64-NEXT:    [[UGLYGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 79880
+; RV64-NEXT:    [[UGLYGEP1:%.*]] = getelementptr i8, ptr [[TRIGGER:%.*]], i64 39940
+; RV64-NEXT:    [[UGLYGEP2:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 159752
+; RV64-NEXT:    [[BOUND0:%.*]] = icmp ult ptr [[A]], [[UGLYGEP1]]
+; RV64-NEXT:    [[BOUND1:%.*]] = icmp ult ptr [[TRIGGER]], [[UGLYGEP]]
 ; RV64-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
-; RV64-NEXT:    [[BOUND09:%.*]] = icmp ult i8* [[A1]], [[SCEVGEP78]]
-; RV64-NEXT:    [[BOUND110:%.*]] = icmp ult i8* [[B6]], [[SCEVGEP2]]
-; RV64-NEXT:    [[FOUND_CONFLICT11:%.*]] = and i1 [[BOUND09]], [[BOUND110]]
-; RV64-NEXT:    [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT11]]
+; RV64-NEXT:    [[BOUND03:%.*]] = icmp ult ptr [[A]], [[UGLYGEP2]]
+; RV64-NEXT:    [[BOUND14:%.*]] = icmp ult ptr [[B]], [[UGLYGEP]]
+; RV64-NEXT:    [[FOUND_CONFLICT5:%.*]] = and i1 [[BOUND03]], [[BOUND14]]
+; RV64-NEXT:    [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]]
 ; RV64-NEXT:    br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
 ; RV64:       vector.ph:
 ; RV64-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
@@ -139,16 +127,16 @@ define void @foo4(double* nocapture %A, double* nocapture readonly %B, i32* noca
 ; RV64:       vector.body:
 ; RV64-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; RV64-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 1 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; RV64-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <vscale x 1 x i64> [[VEC_IND]]
-; RV64-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 1 x i32> @llvm.masked.gather.nxv1i32.nxv1p0i32(<vscale x 1 x i32*> [[TMP8]], i32 4, <vscale x 1 x i1> shufflevector (<vscale x 1 x i1> insertelement (<vscale x 1 x i1> poison, i1 true, i32 0), <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i32> poison), !alias.scope !0
+; RV64-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], <vscale x 1 x i64> [[VEC_IND]]
+; RV64-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 1 x i32> @llvm.masked.gather.nxv1i32.nxv1p0(<vscale x 1 x ptr> [[TMP8]], i32 4, <vscale x 1 x i1> shufflevector (<vscale x 1 x i1> insertelement (<vscale x 1 x i1> poison, i1 true, i32 0), <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i32> poison), !alias.scope !0
 ; RV64-NEXT:    [[TMP9:%.*]] = icmp slt <vscale x 1 x i32> [[WIDE_MASKED_GATHER]], shufflevector (<vscale x 1 x i32> insertelement (<vscale x 1 x i32> poison, i32 100, i32 0), <vscale x 1 x i32> poison, <vscale x 1 x i32> zeroinitializer)
 ; RV64-NEXT:    [[TMP10:%.*]] = shl nuw nsw <vscale x 1 x i64> [[VEC_IND]], shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 1, i32 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer)
-; RV64-NEXT:    [[TMP11:%.*]] = getelementptr inbounds double, double* [[B]], <vscale x 1 x i64> [[TMP10]]
-; RV64-NEXT:    [[WIDE_MASKED_GATHER12:%.*]] = call <vscale x 1 x double> @llvm.masked.gather.nxv1f64.nxv1p0f64(<vscale x 1 x double*> [[TMP11]], i32 8, <vscale x 1 x i1> [[TMP9]], <vscale x 1 x double> poison), !alias.scope !3
+; RV64-NEXT:    [[TMP11:%.*]] = getelementptr inbounds double, ptr [[B]], <vscale x 1 x i64> [[TMP10]]
+; RV64-NEXT:    [[WIDE_MASKED_GATHER6:%.*]] = call <vscale x 1 x double> @llvm.masked.gather.nxv1f64.nxv1p0(<vscale x 1 x ptr> [[TMP11]], i32 8, <vscale x 1 x i1> [[TMP9]], <vscale x 1 x double> poison), !alias.scope !3
 ; RV64-NEXT:    [[TMP12:%.*]] = sitofp <vscale x 1 x i32> [[WIDE_MASKED_GATHER]] to <vscale x 1 x double>
-; RV64-NEXT:    [[TMP13:%.*]] = fadd <vscale x 1 x double> [[WIDE_MASKED_GATHER12]], [[TMP12]]
-; RV64-NEXT:    [[TMP14:%.*]] = getelementptr inbounds double, double* [[A]], <vscale x 1 x i64> [[VEC_IND]]
-; RV64-NEXT:    call void @llvm.masked.scatter.nxv1f64.nxv1p0f64(<vscale x 1 x double> [[TMP13]], <vscale x 1 x double*> [[TMP14]], i32 8, <vscale x 1 x i1> [[TMP9]]), !alias.scope !5, !noalias !7
+; RV64-NEXT:    [[TMP13:%.*]] = fadd <vscale x 1 x double> [[WIDE_MASKED_GATHER6]], [[TMP12]]
+; RV64-NEXT:    [[TMP14:%.*]] = getelementptr inbounds double, ptr [[A]], <vscale x 1 x i64> [[VEC_IND]]
+; RV64-NEXT:    call void @llvm.masked.scatter.nxv1f64.nxv1p0(<vscale x 1 x double> [[TMP13]], <vscale x 1 x ptr> [[TMP14]], i32 8, <vscale x 1 x i1> [[TMP9]]), !alias.scope !5, !noalias !7
 ; RV64-NEXT:    [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
 ; RV64-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP15]]
 ; RV64-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 1 x i64> [[VEC_IND]], [[DOTSPLAT]]
@@ -162,18 +150,18 @@ define void @foo4(double* nocapture %A, double* nocapture readonly %B, i32* noca
 ; RV64-NEXT:    br label [[FOR_BODY:%.*]]
 ; RV64:       for.body:
 ; RV64-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
-; RV64-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]]
-; RV64-NEXT:    [[TMP17:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; RV64-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[INDVARS_IV]]
+; RV64-NEXT:    [[TMP17:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
 ; RV64-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP17]], 100
 ; RV64-NEXT:    br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
 ; RV64:       if.then:
 ; RV64-NEXT:    [[TMP18:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 1
-; RV64-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[TMP18]]
-; RV64-NEXT:    [[TMP19:%.*]] = load double, double* [[ARRAYIDX3]], align 8
+; RV64-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[TMP18]]
+; RV64-NEXT:    [[TMP19:%.*]] = load double, ptr [[ARRAYIDX3]], align 8
 ; RV64-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP17]] to double
 ; RV64-NEXT:    [[ADD:%.*]] = fadd double [[TMP19]], [[CONV]]
-; RV64-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDVARS_IV]]
-; RV64-NEXT:    store double [[ADD]], double* [[ARRAYIDX7]], align 8
+; RV64-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[INDVARS_IV]]
+; RV64-NEXT:    store double [[ADD]], ptr [[ARRAYIDX7]], align 8
 ; RV64-NEXT:    br label [[FOR_INC]]
 ; RV64:       for.inc:
 ; RV64-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 16
@@ -187,19 +175,19 @@ entry:
 
 for.body:                                         ; preds = %entry, %for.inc
   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ]
-  %arrayidx = getelementptr inbounds i32, i32* %trigger, i64 %indvars.iv
-  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx = getelementptr inbounds i32, ptr %trigger, i64 %indvars.iv
+  %0 = load i32, ptr %arrayidx, align 4
   %cmp1 = icmp slt i32 %0, 100
   br i1 %cmp1, label %if.then, label %for.inc
 
 if.then:                                          ; preds = %for.body
   %1 = shl nuw nsw i64 %indvars.iv, 1
-  %arrayidx3 = getelementptr inbounds double, double* %B, i64 %1
-  %2 = load double, double* %arrayidx3, align 8
+  %arrayidx3 = getelementptr inbounds double, ptr %B, i64 %1
+  %2 = load double, ptr %arrayidx3, align 8
   %conv = sitofp i32 %0 to double
   %add = fadd double %2, %conv
-  %arrayidx7 = getelementptr inbounds double, double* %A, i64 %indvars.iv
-  store double %add, double* %arrayidx7, align 8
+  %arrayidx7 = getelementptr inbounds double, ptr %A, i64 %indvars.iv
+  store double %add, ptr %arrayidx7, align 8
   br label %for.inc
 
 for.inc:                                          ; preds = %for.body, %if.then

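The runtime alias checks in the gather/scatter test also change shape: instead of a typed getelementptr to the last accessed element followed by a bitcast to i8*, SCEVExpander now emits byte-wise i8 GEPs directly (hence the UGLYGEP names), folding the element size into the constant offset. The new byte offsets are simply the old element counts scaled by the element size:

  ; 9985 doubles  * 8 bytes -> getelementptr i8, ptr %A, i64 79880
  ; 9985 i32s     * 4 bytes -> getelementptr i8, ptr %trigger, i64 39940
  ; 19969 doubles * 8 bytes -> getelementptr i8, ptr %B, i64 159752
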
diff  --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-unroll.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-unroll.ll
index 993d429a1760..4e9ec86df7ca 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-unroll.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-unroll.ll
@@ -5,7 +5,7 @@
 ; RUN: opt < %s -passes=loop-vectorize -force-target-max-vector-interleave=1 -mtriple=riscv32 -mattr=+v -riscv-v-vector-bits-min=128 -scalable-vectorization=off -riscv-v-register-bit-width-lmul=2 -S | FileCheck %s --check-prefix=LMUL2
 
 ; Function Attrs: nounwind
-define i32* @array_add(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* %c, i32 %size) {
+define ptr @array_add(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, ptr %c, i32 %size) {
 ; LMUL1-LABEL: @array_add(
 ; LMUL1-NEXT:  entry:
 ; LMUL1-NEXT:    [[CMP10:%.*]] = icmp sgt i32 [[SIZE:%.*]], 0
@@ -23,22 +23,19 @@ define i32* @array_add(i32* noalias nocapture readonly %a, i32* noalias nocaptur
 ; LMUL1:       vector.body:
 ; LMUL1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; LMUL1-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 0
-; LMUL1-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP3]]
-; LMUL1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0
-; LMUL1-NEXT:    [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>*
-; LMUL1-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP6]], align 4
-; LMUL1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[TMP3]]
-; LMUL1-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP7]], i32 0
-; LMUL1-NEXT:    [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>*
-; LMUL1-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4
-; LMUL1-NEXT:    [[TMP10:%.*]] = add nsw <4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]]
-; LMUL1-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i64 [[TMP3]]
-; LMUL1-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP11]], i32 0
-; LMUL1-NEXT:    [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <4 x i32>*
-; LMUL1-NEXT:    store <4 x i32> [[TMP10]], <4 x i32>* [[TMP13]], align 4
+; LMUL1-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP3]]
+; LMUL1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0
+; LMUL1-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP5]], align 4
+; LMUL1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP3]]
+; LMUL1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0
+; LMUL1-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i32>, ptr [[TMP7]], align 4
+; LMUL1-NEXT:    [[TMP8:%.*]] = add nsw <4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]]
+; LMUL1-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[C:%.*]], i64 [[TMP3]]
+; LMUL1-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 0
+; LMUL1-NEXT:    store <4 x i32> [[TMP8]], ptr [[TMP10]], align 4
 ; LMUL1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; LMUL1-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; LMUL1-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; LMUL1-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; LMUL1-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; LMUL1:       middle.block:
 ; LMUL1-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
 ; LMUL1-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
@@ -47,13 +44,13 @@ define i32* @array_add(i32* noalias nocapture readonly %a, i32* noalias nocaptur
 ; LMUL1-NEXT:    br label [[FOR_BODY:%.*]]
 ; LMUL1:       for.body:
 ; LMUL1-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; LMUL1-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
-; LMUL1-NEXT:    [[TMP15:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; LMUL1-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]]
-; LMUL1-NEXT:    [[TMP16:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4
-; LMUL1-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP16]], [[TMP15]]
-; LMUL1-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 [[INDVARS_IV]]
-; LMUL1-NEXT:    store i32 [[ADD]], i32* [[ARRAYIDX4]], align 4
+; LMUL1-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]]
+; LMUL1-NEXT:    [[TMP12:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; LMUL1-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]]
+; LMUL1-NEXT:    [[TMP13:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; LMUL1-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP13]], [[TMP12]]
+; LMUL1-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[INDVARS_IV]]
+; LMUL1-NEXT:    store i32 [[ADD]], ptr [[ARRAYIDX4]], align 4
 ; LMUL1-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; LMUL1-NEXT:    [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
 ; LMUL1-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[SIZE]]
@@ -61,7 +58,7 @@ define i32* @array_add(i32* noalias nocapture readonly %a, i32* noalias nocaptur
 ; LMUL1:       for.end.loopexit:
 ; LMUL1-NEXT:    br label [[FOR_END]]
 ; LMUL1:       for.end:
-; LMUL1-NEXT:    ret i32* [[C]]
+; LMUL1-NEXT:    ret ptr [[C]]
 ;
 ; LMUL2-LABEL: @array_add(
 ; LMUL2-NEXT:  entry:
@@ -80,22 +77,19 @@ define i32* @array_add(i32* noalias nocapture readonly %a, i32* noalias nocaptur
 ; LMUL2:       vector.body:
 ; LMUL2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; LMUL2-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 0
-; LMUL2-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP3]]
-; LMUL2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0
-; LMUL2-NEXT:    [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <8 x i32>*
-; LMUL2-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP6]], align 4
-; LMUL2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[TMP3]]
-; LMUL2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP7]], i32 0
-; LMUL2-NEXT:    [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <8 x i32>*
-; LMUL2-NEXT:    [[WIDE_LOAD1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP9]], align 4
-; LMUL2-NEXT:    [[TMP10:%.*]] = add nsw <8 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]]
-; LMUL2-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i64 [[TMP3]]
-; LMUL2-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP11]], i32 0
-; LMUL2-NEXT:    [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <8 x i32>*
-; LMUL2-NEXT:    store <8 x i32> [[TMP10]], <8 x i32>* [[TMP13]], align 4
+; LMUL2-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP3]]
+; LMUL2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0
+; LMUL2-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[TMP5]], align 4
+; LMUL2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP3]]
+; LMUL2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0
+; LMUL2-NEXT:    [[WIDE_LOAD1:%.*]] = load <8 x i32>, ptr [[TMP7]], align 4
+; LMUL2-NEXT:    [[TMP8:%.*]] = add nsw <8 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]]
+; LMUL2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[C:%.*]], i64 [[TMP3]]
+; LMUL2-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 0
+; LMUL2-NEXT:    store <8 x i32> [[TMP8]], ptr [[TMP10]], align 4
 ; LMUL2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; LMUL2-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; LMUL2-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; LMUL2-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; LMUL2-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; LMUL2:       middle.block:
 ; LMUL2-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
 ; LMUL2-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
@@ -104,13 +98,13 @@ define i32* @array_add(i32* noalias nocapture readonly %a, i32* noalias nocaptur
 ; LMUL2-NEXT:    br label [[FOR_BODY:%.*]]
 ; LMUL2:       for.body:
 ; LMUL2-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; LMUL2-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
-; LMUL2-NEXT:    [[TMP15:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; LMUL2-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]]
-; LMUL2-NEXT:    [[TMP16:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4
-; LMUL2-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP16]], [[TMP15]]
-; LMUL2-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 [[INDVARS_IV]]
-; LMUL2-NEXT:    store i32 [[ADD]], i32* [[ARRAYIDX4]], align 4
+; LMUL2-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]]
+; LMUL2-NEXT:    [[TMP12:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; LMUL2-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]]
+; LMUL2-NEXT:    [[TMP13:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; LMUL2-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP13]], [[TMP12]]
+; LMUL2-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[INDVARS_IV]]
+; LMUL2-NEXT:    store i32 [[ADD]], ptr [[ARRAYIDX4]], align 4
 ; LMUL2-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; LMUL2-NEXT:    [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
 ; LMUL2-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[SIZE]]
@@ -118,7 +112,7 @@ define i32* @array_add(i32* noalias nocapture readonly %a, i32* noalias nocaptur
 ; LMUL2:       for.end.loopexit:
 ; LMUL2-NEXT:    br label [[FOR_END]]
 ; LMUL2:       for.end:
-; LMUL2-NEXT:    ret i32* [[C]]
+; LMUL2-NEXT:    ret ptr [[C]]
 ;
 entry:
   %cmp10 = icmp sgt i32 %size, 0
@@ -129,13 +123,13 @@ for.body.preheader:                               ; preds = %entry
 
 for.body:                                         ; preds = %for.body.preheader, %for.body
   %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
-  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
-  %0 = load i32, i32* %arrayidx, align 4
-  %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv
-  %1 = load i32, i32* %arrayidx2, align 4
+  %arrayidx = getelementptr inbounds i32, ptr %a, i64 %indvars.iv
+  %0 = load i32, ptr %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, ptr %b, i64 %indvars.iv
+  %1 = load i32, ptr %arrayidx2, align 4
   %add = add nsw i32 %1, %0
-  %arrayidx4 = getelementptr inbounds i32, i32* %c, i64 %indvars.iv
-  store i32 %add, i32* %arrayidx4, align 4
+  %arrayidx4 = getelementptr inbounds i32, ptr %c, i64 %indvars.iv
+  store i32 %add, ptr %arrayidx4, align 4
   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
   %lftr.wideiv = trunc i64 %indvars.iv.next to i32
   %exitcond = icmp eq i32 %lftr.wideiv, %size
@@ -145,5 +139,5 @@ for.end.loopexit:                                 ; preds = %for.body
   br label %for.end
 
 for.end:                                          ; preds = %for.end.loopexit, %entry
-  ret i32* %c
+  ret ptr %c
 }

diff  --git a/llvm/test/Transforms/LoopVectorize/RISCV/short-trip-count.ll b/llvm/test/Transforms/LoopVectorize/RISCV/short-trip-count.ll
index 24b692c12172..357f8b4fe751 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/short-trip-count.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/short-trip-count.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -S -mtriple=riscv64 -mattr=+v -passes=loop-vectorize < %s | FileCheck %s
 
-define void @small_trip_count_min_vlen_128(i32* nocapture %a) nounwind vscale_range(4,1024) {
+define void @small_trip_count_min_vlen_128(ptr nocapture %a) nounwind vscale_range(4,1024) {
 ; CHECK-LABEL: @small_trip_count_min_vlen_128(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
@@ -10,13 +10,11 @@ define void @small_trip_count_min_vlen_128(i32* nocapture %a) nounwind vscale_ra
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], <i32 1, i32 1, i32 1, i32 1>
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
-; CHECK-NEXT:    store <4 x i32> [[TMP4]], <4 x i32>* [[TMP5]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], <i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT:    store <4 x i32> [[TMP3]], ptr [[TMP2]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
 ; CHECK-NEXT:    br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       middle.block:
@@ -27,10 +25,10 @@ define void @small_trip_count_min_vlen_128(i32* nocapture %a) nounwind vscale_ra
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[IV]]
-; CHECK-NEXT:    [[V:%.*]] = load i32, i32* [[GEP]], align 4
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[IV]]
+; CHECK-NEXT:    [[V:%.*]] = load i32, ptr [[GEP]], align 4
 ; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[V]], 1
-; CHECK-NEXT:    store i32 [[ADD]], i32* [[GEP]], align 4
+; CHECK-NEXT:    store i32 [[ADD]], ptr [[GEP]], align 4
 ; CHECK-NEXT:    [[IV_NEXT]] = add i32 [[IV]], 1
 ; CHECK-NEXT:    [[COND:%.*]] = icmp eq i32 [[IV]], 3
 ; CHECK-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP2:![0-9]+]]
@@ -42,10 +40,10 @@ entry:
 
 loop:
   %iv = phi i32 [ %iv.next, %loop ], [ 0, %entry ]
-  %gep = getelementptr inbounds i32, i32* %a, i32 %iv
-  %v = load i32, i32* %gep, align 4
+  %gep = getelementptr inbounds i32, ptr %a, i32 %iv
+  %v = load i32, ptr %gep, align 4
   %add = add nsw i32 %v, 1
-  store i32 %add, i32* %gep, align 4
+  store i32 %add, ptr %gep, align 4
   %iv.next = add i32 %iv, 1
   %cond = icmp eq i32 %iv, 3
   br i1 %cond, label %exit, label %loop
@@ -56,7 +54,7 @@ exit:
 
 ; Note: This test uses a vscale_range starting at 1, which is technically incompatible
 ; with +v.  If we expose a target hook for minimum vlen, this example will need to be reworked.
-define void @small_trip_count_min_vlen_32(i32* nocapture %a) nounwind vscale_range(1,1024) {
+define void @small_trip_count_min_vlen_32(ptr nocapture %a) nounwind vscale_range(1,1024) {
 ; CHECK-LABEL: @small_trip_count_min_vlen_32(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
@@ -77,18 +75,16 @@ define void @small_trip_count_min_vlen_32(i32* nocapture %a) nounwind vscale_ran
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP8:%.*]] = add i32 [[INDEX]], 0
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i32(i32 [[TMP8]], i32 4)
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[TMP8]]
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP9]], i32 0
-; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <vscale x 2 x i32>*
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x i32> @llvm.masked.load.nxv2i32.p0nxv2i32(<vscale x 2 x i32>* [[TMP11]], i32 4, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i32> poison)
-; CHECK-NEXT:    [[TMP12:%.*]] = add nsw <vscale x 2 x i32> [[WIDE_MASKED_LOAD]], shufflevector (<vscale x 2 x i32> insertelement (<vscale x 2 x i32> poison, i32 1, i32 0), <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer)
-; CHECK-NEXT:    [[TMP13:%.*]] = bitcast i32* [[TMP10]] to <vscale x 2 x i32>*
-; CHECK-NEXT:    call void @llvm.masked.store.nxv2i32.p0nxv2i32(<vscale x 2 x i32> [[TMP12]], <vscale x 2 x i32>* [[TMP13]], i32 4, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT:    [[TMP14:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT:    [[TMP15:%.*]] = mul i32 [[TMP14]], 2
-; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP15]]
-; CHECK-NEXT:    [[TMP16:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 0
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x i32> @llvm.masked.load.nxv2i32.p0(ptr [[TMP10]], i32 4, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i32> poison)
+; CHECK-NEXT:    [[TMP11:%.*]] = add nsw <vscale x 2 x i32> [[WIDE_MASKED_LOAD]], shufflevector (<vscale x 2 x i32> insertelement (<vscale x 2 x i32> poison, i32 1, i32 0), <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer)
+; CHECK-NEXT:    call void @llvm.masked.store.nxv2i32.p0(<vscale x 2 x i32> [[TMP11]], ptr [[TMP10]], i32 4, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT:    [[TMP12:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    [[TMP13:%.*]] = mul i32 [[TMP12]], 2
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP13]]
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -96,10 +92,10 @@ define void @small_trip_count_min_vlen_32(i32* nocapture %a) nounwind vscale_ran
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[IV]]
-; CHECK-NEXT:    [[V:%.*]] = load i32, i32* [[GEP]], align 4
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[IV]]
+; CHECK-NEXT:    [[V:%.*]] = load i32, ptr [[GEP]], align 4
 ; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[V]], 1
-; CHECK-NEXT:    store i32 [[ADD]], i32* [[GEP]], align 4
+; CHECK-NEXT:    store i32 [[ADD]], ptr [[GEP]], align 4
 ; CHECK-NEXT:    [[IV_NEXT]] = add i32 [[IV]], 1
 ; CHECK-NEXT:    [[COND:%.*]] = icmp eq i32 [[IV]], 3
 ; CHECK-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
@@ -111,10 +107,10 @@ entry:
 
 loop:
   %iv = phi i32 [ %iv.next, %loop ], [ 0, %entry ]
-  %gep = getelementptr inbounds i32, i32* %a, i32 %iv
-  %v = load i32, i32* %gep, align 4
+  %gep = getelementptr inbounds i32, ptr %a, i32 %iv
+  %v = load i32, ptr %gep, align 4
   %add = add nsw i32 %v, 1
-  store i32 %add, i32* %gep, align 4
+  store i32 %add, ptr %gep, align 4
   %iv.next = add i32 %iv, 1
   %cond = icmp eq i32 %iv, 3
   br i1 %cond, label %exit, label %loop

diff  --git a/llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll b/llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll
index bf3d5d35f120..648ba0e45a6c 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll
@@ -7,7 +7,7 @@
 %rec8 = type { i16 }
 
 @a = global [1 x %rec8] zeroinitializer
-@b = global [2 x i16*] zeroinitializer
+@b = global [2 x ptr] zeroinitializer
 
 
 define void @f1() {
@@ -21,12 +21,11 @@ define void @f1() {
 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = trunc i32 [[INDEX]] to i16
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i16 [[OFFSET_IDX]], 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = sext i16 [[TMP0]] to i64
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr [2 x i16*], [2 x i16*]* @b, i16 0, i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i16*, i16** [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i16** [[TMP3]] to <2 x i16*>*
-; CHECK-NEXT:    store <2 x i16*> <i16* getelementptr inbounds ([1 x %rec8], [1 x %rec8]* @a, i32 0, i32 0, i32 0), i16* getelementptr inbounds ([1 x %rec8], [1 x %rec8]* @a, i32 0, i32 0, i32 0)>, <2 x i16*>* [[TMP4]], align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr [2 x ptr], ptr @b, i16 0, i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr ptr, ptr [[TMP2]], i32 0
+; CHECK-NEXT:    store <2 x ptr> <ptr @a, ptr @a>, ptr [[TMP3]], align 8
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
-; CHECK-NEXT:    br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]]
+; CHECK-NEXT:    br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 2, 2
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[BB3:%.*]], label [[SCALAR_PH]]
@@ -36,14 +35,13 @@ define void @f1() {
 ; CHECK:       bb2:
 ; CHECK-NEXT:    [[C_1_0:%.*]] = phi i16 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[_TMP9:%.*]], [[BB2]] ]
 ; CHECK-NEXT:    [[_TMP1:%.*]] = zext i16 0 to i64
-; CHECK-NEXT:    [[_TMP2:%.*]] = getelementptr [1 x %rec8], [1 x %rec8]* @a, i16 0, i64 [[_TMP1]]
-; CHECK-NEXT:    [[_TMP4:%.*]] = bitcast %rec8* [[_TMP2]] to i16*
+; CHECK-NEXT:    [[_TMP2:%.*]] = getelementptr [1 x %rec8], ptr @a, i16 0, i64 [[_TMP1]]
 ; CHECK-NEXT:    [[_TMP6:%.*]] = sext i16 [[C_1_0]] to i64
-; CHECK-NEXT:    [[_TMP7:%.*]] = getelementptr [2 x i16*], [2 x i16*]* @b, i16 0, i64 [[_TMP6]]
-; CHECK-NEXT:    store i16* [[_TMP4]], i16** [[_TMP7]], align 8
+; CHECK-NEXT:    [[_TMP7:%.*]] = getelementptr [2 x ptr], ptr @b, i16 0, i64 [[_TMP6]]
+; CHECK-NEXT:    store ptr [[_TMP2]], ptr [[_TMP7]], align 8
 ; CHECK-NEXT:    [[_TMP9]] = add nsw i16 [[C_1_0]], 1
 ; CHECK-NEXT:    [[_TMP11:%.*]] = icmp slt i16 [[_TMP9]], 2
-; CHECK-NEXT:    br i1 [[_TMP11]], label [[BB2]], label [[BB3]], [[LOOP2:!llvm.loop !.*]]
+; CHECK-NEXT:    br i1 [[_TMP11]], label [[BB2]], label [[BB3]], !llvm.loop [[LOOP2:![0-9]+]]
 ; CHECK:       bb3:
 ; CHECK-NEXT:    ret void
 ;
@@ -54,11 +52,10 @@ bb1:
 bb2:
   %c.1.0 = phi i16 [ 0, %bb1 ], [ %_tmp9, %bb2 ]
   %_tmp1 = zext i16 0 to i64
-  %_tmp2 = getelementptr [1 x %rec8], [1 x %rec8]* @a, i16 0, i64 %_tmp1
-  %_tmp4 = bitcast %rec8* %_tmp2 to i16*
+  %_tmp2 = getelementptr [1 x %rec8], ptr @a, i16 0, i64 %_tmp1
   %_tmp6 = sext i16 %c.1.0 to i64
-  %_tmp7 = getelementptr [2 x i16*], [2 x i16*]* @b, i16 0, i64 %_tmp6
-  store i16* %_tmp4, i16** %_tmp7
+  %_tmp7 = getelementptr [2 x ptr], ptr @b, i16 0, i64 %_tmp6
+  store ptr %_tmp2, ptr %_tmp7
   %_tmp9 = add nsw i16 %c.1.0, 1
   %_tmp11 = icmp slt i16 %_tmp9, 2
   br i1 %_tmp11, label %bb2, label %bb3

diff  --git a/llvm/test/Transforms/LoopVectorize/X86/conversion-cost.ll b/llvm/test/Transforms/LoopVectorize/X86/conversion-cost.ll
index 3bf4d6ff6632..1554ce97e250 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/conversion-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/conversion-cost.ll
@@ -4,7 +4,7 @@
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.8.0"
 
-define i32 @conversion_cost1(i32 %n, i8* nocapture %A, float* nocapture %B) nounwind uwtable ssp {
+define i32 @conversion_cost1(i32 %n, ptr nocapture %A, ptr nocapture %B) nounwind uwtable ssp {
 ; CHECK-LABEL: @conversion_cost1(
 ; CHECK-NEXT:    [[TMP1:%.*]] = icmp sgt i32 [[N:%.*]], 3
 ; CHECK-NEXT:    br i1 [[TMP1]], label [[DOTLR_PH_PREHEADER:%.*]], label [[DOT_CRIT_EDGE:%.*]]
@@ -21,17 +21,16 @@ define i32 @conversion_cost1(i32 %n, i8* nocapture %A, float* nocapture %B) noun
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND1:%.*]] = phi <32 x i8> [ <i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31, i8 32, i8 33, i8 34>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT2:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <32 x i8> [ <i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31, i8 32, i8 33, i8 34>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 0
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[A:%.*]], i64 [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[TMP6]], i32 0
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i8* [[TMP7]] to <32 x i8>*
-; CHECK-NEXT:    store <32 x i8> [[VEC_IND1]], <32 x i8>* [[TMP8]], align 1
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i32 0
+; CHECK-NEXT:    store <32 x i8> [[VEC_IND]], ptr [[TMP7]], align 1
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
-; CHECK-NEXT:    [[VEC_IND_NEXT2]] = add <32 x i8> [[VEC_IND1]], <i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32>
-; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <32 x i8> [[VEC_IND]], <i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32>
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP4]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[DOT_CRIT_EDGE_LOOPEXIT:%.*]], label [[SCALAR_PH]]
@@ -40,9 +39,9 @@ define i32 @conversion_cost1(i32 %n, i8* nocapture %A, float* nocapture %B) noun
 ; CHECK-NEXT:    br label [[DOTLR_PH:%.*]]
 ; CHECK:       .lr.ph:
 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[DOTLR_PH]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[TMP10:%.*]] = trunc i64 [[INDVARS_IV]] to i8
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, i8* [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    store i8 [[TMP10]], i8* [[TMP11]], align 1
+; CHECK-NEXT:    [[TMP9:%.*]] = trunc i64 [[INDVARS_IV]] to i8
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    store i8 [[TMP9]], ptr [[TMP10]], align 1
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT:    [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]]
@@ -58,8 +57,8 @@ define i32 @conversion_cost1(i32 %n, i8* nocapture %A, float* nocapture %B) noun
 .lr.ph:                                           ; preds = %0, %.lr.ph
   %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 3, %0 ]
   %2 = trunc i64 %indvars.iv to i8
-  %3 = getelementptr inbounds i8, i8* %A, i64 %indvars.iv
-  store i8 %2, i8* %3, align 1
+  %3 = getelementptr inbounds i8, ptr %A, i64 %indvars.iv
+  store i8 %2, ptr %3, align 1
   %indvars.iv.next = add i64 %indvars.iv, 1
   %lftr.wideiv = trunc i64 %indvars.iv.next to i32
   %exitcond = icmp eq i32 %lftr.wideiv, %n
@@ -69,7 +68,7 @@ define i32 @conversion_cost1(i32 %n, i8* nocapture %A, float* nocapture %B) noun
   ret i32 undef
 }
 
-define i32 @conversion_cost2(i32 %n, i8* nocapture %A, float* nocapture %B) nounwind uwtable ssp {
+define i32 @conversion_cost2(i32 %n, ptr nocapture %A, ptr nocapture %B) nounwind uwtable ssp {
 ; CHECK-LABEL: @conversion_cost2(
 ; CHECK-NEXT:    [[TMP1:%.*]] = icmp sgt i32 [[N:%.*]], 9
 ; CHECK-NEXT:    br i1 [[TMP1]], label [[DOTLR_PH_PREHEADER:%.*]], label [[DOT_CRIT_EDGE:%.*]]
@@ -92,37 +91,33 @@ define i32 @conversion_cost2(i32 %n, i8* nocapture %A, float* nocapture %B) noun
 ; CHECK-NEXT:    [[STEP_ADD2:%.*]] = add <2 x i64> [[STEP_ADD1]], <i64 2, i64 2>
 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = add i64 9, [[INDEX]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 0
-; CHECK-NEXT:    [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], 2
-; CHECK-NEXT:    [[TMP9:%.*]] = add i64 [[OFFSET_IDX]], 4
-; CHECK-NEXT:    [[TMP11:%.*]] = add i64 [[OFFSET_IDX]], 6
-; CHECK-NEXT:    [[TMP13:%.*]] = add nsw <2 x i64> [[VEC_IND]], <i64 3, i64 3>
-; CHECK-NEXT:    [[TMP14:%.*]] = add nsw <2 x i64> [[STEP_ADD]], <i64 3, i64 3>
-; CHECK-NEXT:    [[TMP15:%.*]] = add nsw <2 x i64> [[STEP_ADD1]], <i64 3, i64 3>
-; CHECK-NEXT:    [[TMP16:%.*]] = add nsw <2 x i64> [[STEP_ADD2]], <i64 3, i64 3>
-; CHECK-NEXT:    [[TMP17:%.*]] = sitofp <2 x i64> [[TMP13]] to <2 x float>
-; CHECK-NEXT:    [[TMP18:%.*]] = sitofp <2 x i64> [[TMP14]] to <2 x float>
-; CHECK-NEXT:    [[TMP19:%.*]] = sitofp <2 x i64> [[TMP15]] to <2 x float>
-; CHECK-NEXT:    [[TMP20:%.*]] = sitofp <2 x i64> [[TMP16]] to <2 x float>
-; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP5]]
-; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP7]]
-; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP9]]
-; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP11]]
-; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr inbounds float, float* [[TMP21]], i32 0
-; CHECK-NEXT:    [[TMP26:%.*]] = bitcast float* [[TMP25]] to <2 x float>*
-; CHECK-NEXT:    store <2 x float> [[TMP17]], <2 x float>* [[TMP26]], align 4
-; CHECK-NEXT:    [[TMP27:%.*]] = getelementptr inbounds float, float* [[TMP21]], i32 2
-; CHECK-NEXT:    [[TMP28:%.*]] = bitcast float* [[TMP27]] to <2 x float>*
-; CHECK-NEXT:    store <2 x float> [[TMP18]], <2 x float>* [[TMP28]], align 4
-; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr inbounds float, float* [[TMP21]], i32 4
-; CHECK-NEXT:    [[TMP30:%.*]] = bitcast float* [[TMP29]] to <2 x float>*
-; CHECK-NEXT:    store <2 x float> [[TMP19]], <2 x float>* [[TMP30]], align 4
-; CHECK-NEXT:    [[TMP31:%.*]] = getelementptr inbounds float, float* [[TMP21]], i32 6
-; CHECK-NEXT:    [[TMP32:%.*]] = bitcast float* [[TMP31]] to <2 x float>*
-; CHECK-NEXT:    store <2 x float> [[TMP20]], <2 x float>* [[TMP32]], align 4
+; CHECK-NEXT:    [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 2
+; CHECK-NEXT:    [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], 4
+; CHECK-NEXT:    [[TMP8:%.*]] = add i64 [[OFFSET_IDX]], 6
+; CHECK-NEXT:    [[TMP9:%.*]] = add nsw <2 x i64> [[VEC_IND]], <i64 3, i64 3>
+; CHECK-NEXT:    [[TMP10:%.*]] = add nsw <2 x i64> [[STEP_ADD]], <i64 3, i64 3>
+; CHECK-NEXT:    [[TMP11:%.*]] = add nsw <2 x i64> [[STEP_ADD1]], <i64 3, i64 3>
+; CHECK-NEXT:    [[TMP12:%.*]] = add nsw <2 x i64> [[STEP_ADD2]], <i64 3, i64 3>
+; CHECK-NEXT:    [[TMP13:%.*]] = sitofp <2 x i64> [[TMP9]] to <2 x float>
+; CHECK-NEXT:    [[TMP14:%.*]] = sitofp <2 x i64> [[TMP10]] to <2 x float>
+; CHECK-NEXT:    [[TMP15:%.*]] = sitofp <2 x i64> [[TMP11]] to <2 x float>
+; CHECK-NEXT:    [[TMP16:%.*]] = sitofp <2 x i64> [[TMP12]] to <2 x float>
+; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 [[TMP5]]
+; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP6]]
+; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP7]]
+; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP8]]
+; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i32 0
+; CHECK-NEXT:    store <2 x float> [[TMP13]], ptr [[TMP21]], align 4
+; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i32 2
+; CHECK-NEXT:    store <2 x float> [[TMP14]], ptr [[TMP22]], align 4
+; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i32 4
+; CHECK-NEXT:    store <2 x float> [[TMP15]], ptr [[TMP23]], align 4
+; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i32 6
+; CHECK-NEXT:    store <2 x float> [[TMP16]], ptr [[TMP24]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[STEP_ADD2]], <i64 2, i64 2>
-; CHECK-NEXT:    [[TMP33:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP33]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT:    [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP4]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[DOT_CRIT_EDGE_LOOPEXIT:%.*]], label [[SCALAR_PH]]
@@ -133,8 +128,8 @@ define i32 @conversion_cost2(i32 %n, i8* nocapture %A, float* nocapture %B) noun
 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[DOTLR_PH]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; CHECK-NEXT:    [[ADD:%.*]] = add nsw i64 [[INDVARS_IV]], 3
 ; CHECK-NEXT:    [[TOFP:%.*]] = sitofp i64 [[ADD]] to float
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    store float [[TOFP]], float* [[GEP]], align 4
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    store float [[TOFP]], ptr [[GEP]], align 4
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT:    [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]]
@@ -151,8 +146,8 @@ define i32 @conversion_cost2(i32 %n, i8* nocapture %A, float* nocapture %B) noun
   %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 9, %0 ]
   %add = add nsw i64 %indvars.iv, 3
   %tofp = sitofp i64 %add to float
-  %gep = getelementptr inbounds float, float* %B, i64 %indvars.iv
-  store float %tofp, float* %gep, align 4
+  %gep = getelementptr inbounds float, ptr %B, i64 %indvars.iv
+  store float %tofp, ptr %gep, align 4
   %indvars.iv.next = add i64 %indvars.iv, 1
   %lftr.wideiv = trunc i64 %indvars.iv.next to i32
   %exitcond = icmp eq i32 %lftr.wideiv, %n

diff  --git a/llvm/test/Transforms/LoopVectorize/X86/cost-model-assert.ll b/llvm/test/Transforms/LoopVectorize/X86/cost-model-assert.ll
index 57e50b90c3ff..8e3af54b770e 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/cost-model-assert.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/cost-model-assert.ll
@@ -11,79 +11,29 @@
 target datalayout = "e-m:w-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-w64-windows-gnu"
 
-define void @cff_index_load_offsets(i1 %cond, i8 %x, i8* %p) #0 {
+define void @cff_index_load_offsets(i1 %cond, i8 %x, ptr %p) #0 {
 ; CHECK-LABEL: @cff_index_load_offsets(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br i1 [[COND:%.*]], label [[IF_THEN:%.*]], label [[EXIT:%.*]]
 ; CHECK:       if.then:
-; CHECK-NEXT:    [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 undef, i64 4)
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[UMAX]], -1
-; CHECK-NEXT:    [[TMP1:%.*]] = lshr i64 [[TMP0]], 2
-; CHECK-NEXT:    [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 8
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 8
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[N_VEC]], 4
-; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr i8, i8* null, i64 [[TMP3]]
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i8> poison, i8 [[X:%.*]], i32 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLATINSERT]], <4 x i8> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i8> poison, i8 [[X]], i32 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLATINSERT1]], <4 x i8> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP4:%.*]] = zext <4 x i8> [[BROADCAST_SPLAT]] to <4 x i32>
-; CHECK-NEXT:    [[TMP5:%.*]] = zext <4 x i8> [[BROADCAST_SPLAT2]] to <4 x i32>
-; CHECK-NEXT:    [[TMP6:%.*]] = shl nuw <4 x i32> [[TMP4]], <i32 24, i32 24, i32 24, i32 24>
-; CHECK-NEXT:    [[TMP7:%.*]] = shl nuw <4 x i32> [[TMP5]], <i32 24, i32 24, i32 24, i32 24>
-; CHECK-NEXT:    [[TMP8:%.*]] = load i8, i8* [[P:%.*]], align 1, !tbaa [[TBAA1:![0-9]+]]
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <4 x i8> poison, i8 [[TMP8]], i32 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT6:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLATINSERT5]], <4 x i8> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP9:%.*]] = zext <4 x i8> [[BROADCAST_SPLAT6]] to <4 x i32>
-; CHECK-NEXT:    [[TMP10:%.*]] = zext <4 x i8> [[BROADCAST_SPLAT6]] to <4 x i32>
-; CHECK-NEXT:    [[TMP11:%.*]] = shl nuw nsw <4 x i32> [[TMP9]], <i32 16, i32 16, i32 16, i32 16>
-; CHECK-NEXT:    [[TMP12:%.*]] = shl nuw nsw <4 x i32> [[TMP10]], <i32 16, i32 16, i32 16, i32 16>
-; CHECK-NEXT:    [[TMP13:%.*]] = or <4 x i32> [[TMP11]], [[TMP6]]
-; CHECK-NEXT:    [[TMP14:%.*]] = or <4 x i32> [[TMP12]], [[TMP7]]
-; CHECK-NEXT:    [[TMP15:%.*]] = load i8, i8* undef, align 1, !tbaa [[TBAA1]]
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT9:%.*]] = insertelement <4 x i8> poison, i8 [[TMP15]], i32 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT10:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLATINSERT9]], <4 x i8> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP16:%.*]] = or <4 x i32> [[TMP13]], zeroinitializer
-; CHECK-NEXT:    [[TMP17:%.*]] = or <4 x i32> [[TMP14]], zeroinitializer
-; CHECK-NEXT:    [[TMP18:%.*]] = zext <4 x i8> [[BROADCAST_SPLAT10]] to <4 x i32>
-; CHECK-NEXT:    [[TMP19:%.*]] = zext <4 x i8> [[BROADCAST_SPLAT10]] to <4 x i32>
-; CHECK-NEXT:    [[TMP20:%.*]] = or <4 x i32> [[TMP16]], [[TMP18]]
-; CHECK-NEXT:    [[TMP21:%.*]] = or <4 x i32> [[TMP17]], [[TMP19]]
-; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <4 x i32> [[TMP21]], i32 3
-; CHECK-NEXT:    store i32 [[TMP22]], i32* undef, align 4, !tbaa [[TBAA4:![0-9]+]]
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT:    [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[SW_EPILOG:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i8* [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ null, [[IF_THEN]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY68:%.*]]
 ; CHECK:       for.body68:
-; CHECK-NEXT:    [[P_359:%.*]] = phi i8* [ [[ADD_PTR86:%.*]], [[FOR_BODY68]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[CONV70:%.*]] = zext i8 [[X]] to i32
+; CHECK-NEXT:    [[P_359:%.*]] = phi ptr [ [[ADD_PTR86:%.*]], [[FOR_BODY68]] ], [ null, [[IF_THEN]] ]
+; CHECK-NEXT:    [[CONV70:%.*]] = zext i8 [[X:%.*]] to i32
 ; CHECK-NEXT:    [[SHL71:%.*]] = shl nuw i32 [[CONV70]], 24
-; CHECK-NEXT:    [[TMP24:%.*]] = load i8, i8* [[P]], align 1, !tbaa [[TBAA1]]
-; CHECK-NEXT:    [[CONV73:%.*]] = zext i8 [[TMP24]] to i32
+; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[P:%.*]], align 1, !tbaa [[TBAA1:![0-9]+]]
+; CHECK-NEXT:    [[CONV73:%.*]] = zext i8 [[TMP0]] to i32
 ; CHECK-NEXT:    [[SHL74:%.*]] = shl nuw nsw i32 [[CONV73]], 16
 ; CHECK-NEXT:    [[OR75:%.*]] = or i32 [[SHL74]], [[SHL71]]
-; CHECK-NEXT:    [[TMP25:%.*]] = load i8, i8* undef, align 1, !tbaa [[TBAA1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load i8, ptr undef, align 1, !tbaa [[TBAA1]]
 ; CHECK-NEXT:    [[SHL78:%.*]] = shl nuw nsw i32 undef, 8
 ; CHECK-NEXT:    [[OR79:%.*]] = or i32 [[OR75]], [[SHL78]]
-; CHECK-NEXT:    [[CONV81:%.*]] = zext i8 [[TMP25]] to i32
+; CHECK-NEXT:    [[CONV81:%.*]] = zext i8 [[TMP1]] to i32
 ; CHECK-NEXT:    [[OR83:%.*]] = or i32 [[OR79]], [[CONV81]]
-; CHECK-NEXT:    store i32 [[OR83]], i32* undef, align 4, !tbaa [[TBAA4]]
-; CHECK-NEXT:    [[ADD_PTR86]] = getelementptr inbounds i8, i8* [[P_359]], i64 4
-; CHECK-NEXT:    [[CMP66:%.*]] = icmp ult i8* [[ADD_PTR86]], undef
-; CHECK-NEXT:    br i1 [[CMP66]], label [[FOR_BODY68]], label [[SW_EPILOG]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-NEXT:    store i32 [[OR83]], ptr undef, align 4, !tbaa [[TBAA4:![0-9]+]]
+; CHECK-NEXT:    [[ADD_PTR86]] = getelementptr inbounds i8, ptr [[P_359]], i64 4
+; CHECK-NEXT:    [[CMP66:%.*]] = icmp ult ptr [[ADD_PTR86]], undef
+; CHECK-NEXT:    br i1 [[CMP66]], label [[FOR_BODY68]], label [[SW_EPILOG:%.*]]
 ; CHECK:       sw.epilog:
 ; CHECK-NEXT:    unreachable
 ; CHECK:       Exit:
@@ -96,21 +46,21 @@ if.then:                                          ; preds = %entry
   br label %for.body68
 
 for.body68:                                       ; preds = %for.body68, %if.then
-  %p.359 = phi i8* [ %add.ptr86, %for.body68 ], [ null, %if.then ]
+  %p.359 = phi ptr [ %add.ptr86, %for.body68 ], [ null, %if.then ]
   %conv70 = zext i8 %x to i32
   %shl71 = shl nuw i32 %conv70, 24
-  %0 = load i8, i8* %p, align 1, !tbaa !1
+  %0 = load i8, ptr %p, align 1, !tbaa !1
   %conv73 = zext i8 %0 to i32
   %shl74 = shl nuw nsw i32 %conv73, 16
   %or75 = or i32 %shl74, %shl71
-  %1 = load i8, i8* undef, align 1, !tbaa !1
+  %1 = load i8, ptr undef, align 1, !tbaa !1
   %shl78 = shl nuw nsw i32 undef, 8
   %or79 = or i32 %or75, %shl78
   %conv81 = zext i8 %1 to i32
   %or83 = or i32 %or79, %conv81
-  store i32 %or83, i32* undef, align 4, !tbaa !4
-  %add.ptr86 = getelementptr inbounds i8, i8* %p.359, i64 4
-  %cmp66 = icmp ult i8* %add.ptr86, undef
+  store i32 %or83, ptr undef, align 4, !tbaa !4
+  %add.ptr86 = getelementptr inbounds i8, ptr %p.359, i64 4
+  %cmp66 = icmp ult ptr %add.ptr86, undef
   br i1 %cmp66, label %for.body68, label %sw.epilog
 
 sw.epilog:                                        ; preds = %for.body68

diff  --git a/llvm/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll b/llvm/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll
index 9f876d7bfa8e..259c52d65485 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll
@@ -19,7 +19,7 @@ target triple = "x86_64-unknown-linux-gnu"
 ; }
 
 ; Function Attrs: nounwind uwtable
-define void @foo(i32* nocapture %a, i32* nocapture %b, i32 %k, i32 %m) #0 {
+define void @foo(ptr nocapture %a, ptr nocapture %b, i32 %k, i32 %m) #0 {
 ; CHECK-LABEL: @foo(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[CMP27:%.*]] = icmp sgt i32 [[M:%.*]], 0
@@ -31,10 +31,10 @@ define void @foo(i32* nocapture %a, i32* nocapture %b, i32 %k, i32 %m) #0 {
 ; CHECK-NEXT:    [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1
 ; CHECK-NEXT:    br label [[FOR_BODY3_LR_PH_US:%.*]]
 ; CHECK:       for.end.us:
-; CHECK-NEXT:    [[ARRAYIDX9_US:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDVARS_IV33:%.*]]
-; CHECK-NEXT:    [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX9_US]], align 4, !llvm.mem.parallel_loop_access !0
+; CHECK-NEXT:    [[ARRAYIDX9_US:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[INDVARS_IV33:%.*]]
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX9_US]], align 4, !llvm.mem.parallel_loop_access !0
 ; CHECK-NEXT:    [[ADD10_US:%.*]] = add nsw i32 [[TMP4]], 3
-; CHECK-NEXT:    store i32 [[ADD10_US]], i32* [[ARRAYIDX9_US]], align 4, !llvm.mem.parallel_loop_access !0
+; CHECK-NEXT:    store i32 [[ADD10_US]], ptr [[ARRAYIDX9_US]], align 4, !llvm.mem.parallel_loop_access !0
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT34:%.*]] = add i64 [[INDVARS_IV33]], 1
 ; CHECK-NEXT:    [[LFTR_WIDEIV35:%.*]] = trunc i64 [[INDVARS_IV_NEXT34]] to i32
 ; CHECK-NEXT:    [[EXITCOND36:%.*]] = icmp eq i32 [[LFTR_WIDEIV35]], [[M]]
@@ -44,10 +44,10 @@ define void @foo(i32* nocapture %a, i32* nocapture %b, i32 %k, i32 %m) #0 {
 ; CHECK-NEXT:    [[TMP5:%.*]] = trunc i64 [[INDVARS_IV29]] to i32
 ; CHECK-NEXT:    [[ADD4_US:%.*]] = add i32 [[ADD_US:%.*]], [[TMP5]]
 ; CHECK-NEXT:    [[IDXPROM_US:%.*]] = sext i32 [[ADD4_US]] to i64
-; CHECK-NEXT:    [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[IDXPROM_US]]
-; CHECK-NEXT:    [[TMP6:%.*]] = load i32, i32* [[ARRAYIDX_US]], align 4, !llvm.mem.parallel_loop_access !0
+; CHECK-NEXT:    [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IDXPROM_US]]
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr [[ARRAYIDX_US]], align 4, !llvm.mem.parallel_loop_access !0
 ; CHECK-NEXT:    [[ADD5_US:%.*]] = add nsw i32 [[TMP6]], 1
-; CHECK-NEXT:    store i32 [[ADD5_US]], i32* [[ARRAYIDX7_US:%.*]], align 4, !llvm.mem.parallel_loop_access !0
+; CHECK-NEXT:    store i32 [[ADD5_US]], ptr [[ARRAYIDX7_US:%.*]], align 4, !llvm.mem.parallel_loop_access !0
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT30]] = add i64 [[INDVARS_IV29]], 1
 ; CHECK-NEXT:    [[LFTR_WIDEIV31:%.*]] = trunc i64 [[INDVARS_IV_NEXT30]] to i32
 ; CHECK-NEXT:    [[EXITCOND32:%.*]] = icmp eq i32 [[LFTR_WIDEIV31]], [[M]]
@@ -58,7 +58,7 @@ define void @foo(i32* nocapture %a, i32* nocapture %b, i32 %k, i32 %m) #0 {
 ; CHECK-NEXT:    [[TMP8:%.*]] = trunc i64 [[TMP7]] to i32
 ; CHECK-NEXT:    [[TMP9:%.*]] = trunc i64 [[INDVARS_IV33]] to i32
 ; CHECK-NEXT:    [[ADD_US]] = add i32 [[TMP9]], [[K]]
-; CHECK-NEXT:    [[ARRAYIDX7_US]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV33]]
+; CHECK-NEXT:    [[ARRAYIDX7_US]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV33]]
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP3]], 4
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH]], label [[VECTOR_SCEVCHECK:%.*]]
 ; CHECK:       vector.scevcheck:
@@ -70,21 +70,20 @@ define void @foo(i32* nocapture %a, i32* nocapture %b, i32 %k, i32 %m) #0 {
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF]]
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP12:%.*]] = trunc i64 [[INDEX]] to i32
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP12:%.*]] = trunc i64 [[OFFSET_IDX]] to i32
 ; CHECK-NEXT:    [[TMP13:%.*]] = add i32 [[TMP12]], 0
 ; CHECK-NEXT:    [[TMP14:%.*]] = add i32 [[ADD_US]], [[TMP13]]
 ; CHECK-NEXT:    [[TMP15:%.*]] = sext i32 [[TMP14]] to i64
-; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP15]]
-; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP16]], i32 0
-; CHECK-NEXT:    [[TMP18:%.*]] = bitcast i32* [[TMP17]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP18]], align 4
-; CHECK-NEXT:    [[TMP19:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], <i32 1, i32 1, i32 1, i32 1>
-; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <4 x i32> [[TMP19]], i32 3
-; CHECK-NEXT:    store i32 [[TMP20]], i32* [[ARRAYIDX7_US]], align 4, !llvm.mem.parallel_loop_access !0
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP15]]
+; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP17]], align 4
+; CHECK-NEXT:    [[TMP18:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], <i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT:    [[TMP19:%.*]] = extractelement <4 x i32> [[TMP18]], i32 3
+; CHECK-NEXT:    store i32 [[TMP19]], ptr [[ARRAYIDX7_US]], align 4, !llvm.mem.parallel_loop_access !0
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[OFFSET_IDX]], 4
+; CHECK-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END_US]], label [[SCALAR_PH]]
@@ -101,10 +100,10 @@ entry:
   br i1 %cmp27, label %for.body3.lr.ph.us, label %for.end15
 
 for.end.us:                                       ; preds = %for.body3.us
-  %arrayidx9.us = getelementptr inbounds i32, i32* %b, i64 %indvars.iv33
-  %0 = load i32, i32* %arrayidx9.us, align 4, !llvm.mem.parallel_loop_access !3
+  %arrayidx9.us = getelementptr inbounds i32, ptr %b, i64 %indvars.iv33
+  %0 = load i32, ptr %arrayidx9.us, align 4, !llvm.mem.parallel_loop_access !3
   %add10.us = add nsw i32 %0, 3
-  store i32 %add10.us, i32* %arrayidx9.us, align 4, !llvm.mem.parallel_loop_access !3
+  store i32 %add10.us, ptr %arrayidx9.us, align 4, !llvm.mem.parallel_loop_access !3
   %indvars.iv.next34 = add i64 %indvars.iv33, 1
   %lftr.wideiv35 = trunc i64 %indvars.iv.next34 to i32
   %exitcond36 = icmp eq i32 %lftr.wideiv35, %m
@@ -115,10 +114,10 @@ for.body3.us:                                     ; preds = %for.body3.us, %for.
   %1 = trunc i64 %indvars.iv29 to i32
   %add4.us = add i32 %add.us, %1
   %idxprom.us = sext i32 %add4.us to i64
-  %arrayidx.us = getelementptr inbounds i32, i32* %a, i64 %idxprom.us
-  %2 = load i32, i32* %arrayidx.us, align 4, !llvm.mem.parallel_loop_access !3
+  %arrayidx.us = getelementptr inbounds i32, ptr %a, i64 %idxprom.us
+  %2 = load i32, ptr %arrayidx.us, align 4, !llvm.mem.parallel_loop_access !3
   %add5.us = add nsw i32 %2, 1
-  store i32 %add5.us, i32* %arrayidx7.us, align 4, !llvm.mem.parallel_loop_access !3
+  store i32 %add5.us, ptr %arrayidx7.us, align 4, !llvm.mem.parallel_loop_access !3
   %indvars.iv.next30 = add i64 %indvars.iv29, 1
   %lftr.wideiv31 = trunc i64 %indvars.iv.next30 to i32
   %exitcond32 = icmp eq i32 %lftr.wideiv31, %m
@@ -128,7 +127,7 @@ for.body3.lr.ph.us:                               ; preds = %for.end.us, %entry
   %indvars.iv33 = phi i64 [ %indvars.iv.next34, %for.end.us ], [ 0, %entry ]
   %3 = trunc i64 %indvars.iv33 to i32
   %add.us = add i32 %3, %k
-  %arrayidx7.us = getelementptr inbounds i32, i32* %a, i64 %indvars.iv33
+  %arrayidx7.us = getelementptr inbounds i32, ptr %a, i64 %indvars.iv33
   br label %for.body3.us
 
 for.end15:                                        ; preds = %for.end.us, %entry
@@ -139,7 +138,7 @@ for.end15:                                        ; preds = %for.end.us, %entry
 
 ; Here we can see the vectorizer does the mem dep checks and decides it is
 ; unsafe to vectorize.
-define void @no-par-mem-metadata(i32* nocapture %a, i32* nocapture %b, i32 %k, i32 %m) #0 {
+define void @no-par-mem-metadata(ptr nocapture %a, ptr nocapture %b, i32 %k, i32 %m) #0 {
 ; CHECK-LABEL: @no-par-mem-metadata(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[CMP27:%.*]] = icmp sgt i32 [[M:%.*]], 0
@@ -147,10 +146,10 @@ define void @no-par-mem-metadata(i32* nocapture %a, i32* nocapture %b, i32 %k, i
 ; CHECK:       for.body3.lr.ph.us.preheader:
 ; CHECK-NEXT:    br label [[FOR_BODY3_LR_PH_US:%.*]]
 ; CHECK:       for.end.us:
-; CHECK-NEXT:    [[ARRAYIDX9_US:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDVARS_IV33:%.*]]
-; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX9_US]], align 4
+; CHECK-NEXT:    [[ARRAYIDX9_US:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[INDVARS_IV33:%.*]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX9_US]], align 4
 ; CHECK-NEXT:    [[ADD10_US:%.*]] = add nsw i32 [[TMP0]], 3
-; CHECK-NEXT:    store i32 [[ADD10_US]], i32* [[ARRAYIDX9_US]], align 4
+; CHECK-NEXT:    store i32 [[ADD10_US]], ptr [[ARRAYIDX9_US]], align 4
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT34:%.*]] = add i64 [[INDVARS_IV33]], 1
 ; CHECK-NEXT:    [[LFTR_WIDEIV35:%.*]] = trunc i64 [[INDVARS_IV_NEXT34]] to i32
 ; CHECK-NEXT:    [[EXITCOND36:%.*]] = icmp eq i32 [[LFTR_WIDEIV35]], [[M]]
@@ -160,10 +159,10 @@ define void @no-par-mem-metadata(i32* nocapture %a, i32* nocapture %b, i32 %k, i
 ; CHECK-NEXT:    [[TMP1:%.*]] = trunc i64 [[INDVARS_IV29]] to i32
 ; CHECK-NEXT:    [[ADD4_US:%.*]] = add i32 [[ADD_US:%.*]], [[TMP1]]
 ; CHECK-NEXT:    [[IDXPROM_US:%.*]] = sext i32 [[ADD4_US]] to i64
-; CHECK-NEXT:    [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[IDXPROM_US]]
-; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX_US]], align 4
+; CHECK-NEXT:    [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IDXPROM_US]]
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX_US]], align 4
 ; CHECK-NEXT:    [[ADD5_US:%.*]] = add nsw i32 [[TMP2]], 1
-; CHECK-NEXT:    store i32 [[ADD5_US]], i32* [[ARRAYIDX7_US:%.*]], align 4
+; CHECK-NEXT:    store i32 [[ADD5_US]], ptr [[ARRAYIDX7_US:%.*]], align 4
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT30]] = add i64 [[INDVARS_IV29]], 1
 ; CHECK-NEXT:    [[LFTR_WIDEIV31:%.*]] = trunc i64 [[INDVARS_IV_NEXT30]] to i32
 ; CHECK-NEXT:    [[EXITCOND32:%.*]] = icmp eq i32 [[LFTR_WIDEIV31]], [[M]]
@@ -172,7 +171,7 @@ define void @no-par-mem-metadata(i32* nocapture %a, i32* nocapture %b, i32 %k, i
 ; CHECK-NEXT:    [[INDVARS_IV33]] = phi i64 [ [[INDVARS_IV_NEXT34]], [[FOR_END_US]] ], [ 0, [[FOR_BODY3_LR_PH_US_PREHEADER]] ]
 ; CHECK-NEXT:    [[TMP3:%.*]] = trunc i64 [[INDVARS_IV33]] to i32
 ; CHECK-NEXT:    [[ADD_US]] = add i32 [[TMP3]], [[K:%.*]]
-; CHECK-NEXT:    [[ARRAYIDX7_US]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV33]]
+; CHECK-NEXT:    [[ARRAYIDX7_US]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV33]]
 ; CHECK-NEXT:    br label [[FOR_BODY3_US]]
 ; CHECK:       for.end15.loopexit:
 ; CHECK-NEXT:    br label [[FOR_END15]]
@@ -184,10 +183,10 @@ entry:
   br i1 %cmp27, label %for.body3.lr.ph.us, label %for.end15
 
 for.end.us:                                       ; preds = %for.body3.us
-  %arrayidx9.us = getelementptr inbounds i32, i32* %b, i64 %indvars.iv33
-  %0 = load i32, i32* %arrayidx9.us, align 4
+  %arrayidx9.us = getelementptr inbounds i32, ptr %b, i64 %indvars.iv33
+  %0 = load i32, ptr %arrayidx9.us, align 4
   %add10.us = add nsw i32 %0, 3
-  store i32 %add10.us, i32* %arrayidx9.us, align 4
+  store i32 %add10.us, ptr %arrayidx9.us, align 4
   %indvars.iv.next34 = add i64 %indvars.iv33, 1
   %lftr.wideiv35 = trunc i64 %indvars.iv.next34 to i32
   %exitcond36 = icmp eq i32 %lftr.wideiv35, %m
@@ -198,10 +197,10 @@ for.body3.us:                                     ; preds = %for.body3.us, %for.
   %1 = trunc i64 %indvars.iv29 to i32
   %add4.us = add i32 %add.us, %1
   %idxprom.us = sext i32 %add4.us to i64
-  %arrayidx.us = getelementptr inbounds i32, i32* %a, i64 %idxprom.us
-  %2 = load i32, i32* %arrayidx.us, align 4
+  %arrayidx.us = getelementptr inbounds i32, ptr %a, i64 %idxprom.us
+  %2 = load i32, ptr %arrayidx.us, align 4
   %add5.us = add nsw i32 %2, 1
-  store i32 %add5.us, i32* %arrayidx7.us, align 4
+  store i32 %add5.us, ptr %arrayidx7.us, align 4
   %indvars.iv.next30 = add i64 %indvars.iv29, 1
   %lftr.wideiv31 = trunc i64 %indvars.iv.next30 to i32
   %exitcond32 = icmp eq i32 %lftr.wideiv31, %m
@@ -211,7 +210,7 @@ for.body3.lr.ph.us:                               ; preds = %for.end.us, %entry
   %indvars.iv33 = phi i64 [ %indvars.iv.next34, %for.end.us ], [ 0, %entry ]
   %3 = trunc i64 %indvars.iv33 to i32
   %add.us = add i32 %3, %k
-  %arrayidx7.us = getelementptr inbounds i32, i32* %a, i64 %indvars.iv33
+  %arrayidx7.us = getelementptr inbounds i32, ptr %a, i64 %indvars.iv33
   br label %for.body3.us
 
 for.end15:                                        ; preds = %for.end.us, %entry

diff  --git a/llvm/test/Transforms/LoopVectorize/X86/imprecise-through-phis.ll b/llvm/test/Transforms/LoopVectorize/X86/imprecise-through-phis.ll
index 4efdc43f02b9..1c7f3a70d37c 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/imprecise-through-phis.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/imprecise-through-phis.ll
@@ -9,15 +9,15 @@
 ;   if (arr[i] != 42)
 ;     tot += arr[i];
 
-define double @sumIfScalar(double* nocapture readonly %arr) {
+define double @sumIfScalar(ptr nocapture readonly %arr) {
 ; CHECK-LABEL: @sumIfScalar(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[I_NEXT:%.*]], [[NEXT_ITER:%.*]] ]
 ; CHECK-NEXT:    [[TOT:%.*]] = phi double [ 0.000000e+00, [[ENTRY]] ], [ [[TOT_NEXT:%.*]], [[NEXT_ITER]] ]
-; CHECK-NEXT:    [[ADDR:%.*]] = getelementptr double, double* [[ARR:%.*]], i32 [[I]]
-; CHECK-NEXT:    [[NEXTVAL:%.*]] = load double, double* [[ADDR]], align 8
+; CHECK-NEXT:    [[ADDR:%.*]] = getelementptr double, ptr [[ARR:%.*]], i32 [[I]]
+; CHECK-NEXT:    [[NEXTVAL:%.*]] = load double, ptr [[ADDR]], align 8
 ; CHECK-NEXT:    [[TST:%.*]] = fcmp une double [[NEXTVAL]], 4.200000e+01
 ; CHECK-NEXT:    br i1 [[TST]], label [[DO_ADD:%.*]], label [[NO_ADD:%.*]]
 ; CHECK:       do.add:
@@ -41,8 +41,8 @@ loop:
   %i = phi i32 [0, %entry], [%i.next, %next.iter]
   %tot = phi double [0.0, %entry], [%tot.next, %next.iter]
 
-  %addr = getelementptr double, double* %arr, i32 %i
-  %nextval = load double, double* %addr
+  %addr = getelementptr double, ptr %arr, i32 %i
+  %nextval = load double, ptr %addr
 
   %tst = fcmp une double %nextval, 42.0
   br i1 %tst, label %do.add, label %no.add
@@ -64,7 +64,7 @@ done:
   ret double %tot.next
 }
 
-define double @sumIfVector(double* nocapture readonly %arr) {
+define double @sumIfVector(ptr nocapture readonly %arr) {
 ; SSE-LABEL: @sumIfVector(
 ; SSE-NEXT:  entry:
 ; SSE-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
@@ -74,30 +74,29 @@ define double @sumIfVector(double* nocapture readonly %arr) {
 ; SSE-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; SSE-NEXT:    [[VEC_PHI:%.*]] = phi <2 x double> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PREDPHI:%.*]], [[VECTOR_BODY]] ]
 ; SSE-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
-; SSE-NEXT:    [[TMP1:%.*]] = getelementptr double, double* [[ARR:%.*]], i32 [[TMP0]]
-; SSE-NEXT:    [[TMP2:%.*]] = getelementptr double, double* [[TMP1]], i32 0
-; SSE-NEXT:    [[TMP3:%.*]] = bitcast double* [[TMP2]] to <2 x double>*
-; SSE-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x double>, <2 x double>* [[TMP3]], align 8
-; SSE-NEXT:    [[TMP4:%.*]] = fcmp fast une <2 x double> [[WIDE_LOAD]], <double 4.200000e+01, double 4.200000e+01>
-; SSE-NEXT:    [[TMP5:%.*]] = fadd fast <2 x double> [[VEC_PHI]], [[WIDE_LOAD]]
-; SSE-NEXT:    [[TMP6:%.*]] = xor <2 x i1> [[TMP4]], <i1 true, i1 true>
-; SSE-NEXT:    [[PREDPHI]] = select <2 x i1> [[TMP4]], <2 x double> [[TMP5]], <2 x double> [[VEC_PHI]]
+; SSE-NEXT:    [[TMP1:%.*]] = getelementptr double, ptr [[ARR:%.*]], i32 [[TMP0]]
+; SSE-NEXT:    [[TMP2:%.*]] = getelementptr double, ptr [[TMP1]], i32 0
+; SSE-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP2]], align 8
+; SSE-NEXT:    [[TMP3:%.*]] = fcmp fast une <2 x double> [[WIDE_LOAD]], <double 4.200000e+01, double 4.200000e+01>
+; SSE-NEXT:    [[TMP4:%.*]] = fadd fast <2 x double> [[VEC_PHI]], [[WIDE_LOAD]]
+; SSE-NEXT:    [[TMP5:%.*]] = xor <2 x i1> [[TMP3]], <i1 true, i1 true>
+; SSE-NEXT:    [[PREDPHI]] = select <2 x i1> [[TMP3]], <2 x double> [[TMP4]], <2 x double> [[VEC_PHI]]
 ; SSE-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
-; SSE-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 32
-; SSE-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; SSE-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 32
+; SSE-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; SSE:       middle.block:
-; SSE-NEXT:    [[TMP8:%.*]] = call fast double @llvm.vector.reduce.fadd.v2f64(double -0.000000e+00, <2 x double> [[PREDPHI]])
+; SSE-NEXT:    [[TMP7:%.*]] = call fast double @llvm.vector.reduce.fadd.v2f64(double -0.000000e+00, <2 x double> [[PREDPHI]])
 ; SSE-NEXT:    [[CMP_N:%.*]] = icmp eq i32 32, 32
 ; SSE-NEXT:    br i1 [[CMP_N]], label [[DONE:%.*]], label [[SCALAR_PH]]
 ; SSE:       scalar.ph:
 ; SSE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 32, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; SSE-NEXT:    [[BC_MERGE_RDX:%.*]] = phi double [ 0.000000e+00, [[ENTRY]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ]
+; SSE-NEXT:    [[BC_MERGE_RDX:%.*]] = phi double [ 0.000000e+00, [[ENTRY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
 ; SSE-NEXT:    br label [[LOOP:%.*]]
 ; SSE:       loop:
 ; SSE-NEXT:    [[I:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[NEXT_ITER:%.*]] ]
 ; SSE-NEXT:    [[TOT:%.*]] = phi double [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[TOT_NEXT:%.*]], [[NEXT_ITER]] ]
-; SSE-NEXT:    [[ADDR:%.*]] = getelementptr double, double* [[ARR]], i32 [[I]]
-; SSE-NEXT:    [[NEXTVAL:%.*]] = load double, double* [[ADDR]], align 8
+; SSE-NEXT:    [[ADDR:%.*]] = getelementptr double, ptr [[ARR]], i32 [[I]]
+; SSE-NEXT:    [[NEXTVAL:%.*]] = load double, ptr [[ADDR]], align 8
 ; SSE-NEXT:    [[TST:%.*]] = fcmp fast une double [[NEXTVAL]], 4.200000e+01
 ; SSE-NEXT:    br i1 [[TST]], label [[DO_ADD:%.*]], label [[NO_ADD:%.*]]
 ; SSE:       do.add:
@@ -111,7 +110,7 @@ define double @sumIfVector(double* nocapture readonly %arr) {
 ; SSE-NEXT:    [[AGAIN:%.*]] = icmp ult i32 [[I_NEXT]], 32
 ; SSE-NEXT:    br i1 [[AGAIN]], label [[LOOP]], label [[DONE]], !llvm.loop [[LOOP2:![0-9]+]]
 ; SSE:       done:
-; SSE-NEXT:    [[TOT_NEXT_LCSSA:%.*]] = phi double [ [[TOT_NEXT]], [[NEXT_ITER]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ]
+; SSE-NEXT:    [[TOT_NEXT_LCSSA:%.*]] = phi double [ [[TOT_NEXT]], [[NEXT_ITER]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
 ; SSE-NEXT:    ret double [[TOT_NEXT_LCSSA]]
 ;
 ; AVX-LABEL: @sumIfVector(
@@ -123,30 +122,29 @@ define double @sumIfVector(double* nocapture readonly %arr) {
 ; AVX-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; AVX-NEXT:    [[VEC_PHI:%.*]] = phi <4 x double> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PREDPHI:%.*]], [[VECTOR_BODY]] ]
 ; AVX-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
-; AVX-NEXT:    [[TMP1:%.*]] = getelementptr double, double* [[ARR:%.*]], i32 [[TMP0]]
-; AVX-NEXT:    [[TMP2:%.*]] = getelementptr double, double* [[TMP1]], i32 0
-; AVX-NEXT:    [[TMP3:%.*]] = bitcast double* [[TMP2]] to <4 x double>*
-; AVX-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x double>, <4 x double>* [[TMP3]], align 8
-; AVX-NEXT:    [[TMP4:%.*]] = fcmp fast une <4 x double> [[WIDE_LOAD]], <double 4.200000e+01, double 4.200000e+01, double 4.200000e+01, double 4.200000e+01>
-; AVX-NEXT:    [[TMP5:%.*]] = fadd fast <4 x double> [[VEC_PHI]], [[WIDE_LOAD]]
-; AVX-NEXT:    [[TMP6:%.*]] = xor <4 x i1> [[TMP4]], <i1 true, i1 true, i1 true, i1 true>
-; AVX-NEXT:    [[PREDPHI]] = select <4 x i1> [[TMP4]], <4 x double> [[TMP5]], <4 x double> [[VEC_PHI]]
+; AVX-NEXT:    [[TMP1:%.*]] = getelementptr double, ptr [[ARR:%.*]], i32 [[TMP0]]
+; AVX-NEXT:    [[TMP2:%.*]] = getelementptr double, ptr [[TMP1]], i32 0
+; AVX-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x double>, ptr [[TMP2]], align 8
+; AVX-NEXT:    [[TMP3:%.*]] = fcmp fast une <4 x double> [[WIDE_LOAD]], <double 4.200000e+01, double 4.200000e+01, double 4.200000e+01, double 4.200000e+01>
+; AVX-NEXT:    [[TMP4:%.*]] = fadd fast <4 x double> [[VEC_PHI]], [[WIDE_LOAD]]
+; AVX-NEXT:    [[TMP5:%.*]] = xor <4 x i1> [[TMP3]], <i1 true, i1 true, i1 true, i1 true>
+; AVX-NEXT:    [[PREDPHI]] = select <4 x i1> [[TMP3]], <4 x double> [[TMP4]], <4 x double> [[VEC_PHI]]
 ; AVX-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; AVX-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 32
-; AVX-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; AVX-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 32
+; AVX-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; AVX:       middle.block:
-; AVX-NEXT:    [[TMP8:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double -0.000000e+00, <4 x double> [[PREDPHI]])
+; AVX-NEXT:    [[TMP7:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double -0.000000e+00, <4 x double> [[PREDPHI]])
 ; AVX-NEXT:    [[CMP_N:%.*]] = icmp eq i32 32, 32
 ; AVX-NEXT:    br i1 [[CMP_N]], label [[DONE:%.*]], label [[SCALAR_PH]]
 ; AVX:       scalar.ph:
 ; AVX-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 32, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; AVX-NEXT:    [[BC_MERGE_RDX:%.*]] = phi double [ 0.000000e+00, [[ENTRY]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ]
+; AVX-NEXT:    [[BC_MERGE_RDX:%.*]] = phi double [ 0.000000e+00, [[ENTRY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
 ; AVX-NEXT:    br label [[LOOP:%.*]]
 ; AVX:       loop:
 ; AVX-NEXT:    [[I:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[NEXT_ITER:%.*]] ]
 ; AVX-NEXT:    [[TOT:%.*]] = phi double [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[TOT_NEXT:%.*]], [[NEXT_ITER]] ]
-; AVX-NEXT:    [[ADDR:%.*]] = getelementptr double, double* [[ARR]], i32 [[I]]
-; AVX-NEXT:    [[NEXTVAL:%.*]] = load double, double* [[ADDR]], align 8
+; AVX-NEXT:    [[ADDR:%.*]] = getelementptr double, ptr [[ARR]], i32 [[I]]
+; AVX-NEXT:    [[NEXTVAL:%.*]] = load double, ptr [[ADDR]], align 8
 ; AVX-NEXT:    [[TST:%.*]] = fcmp fast une double [[NEXTVAL]], 4.200000e+01
 ; AVX-NEXT:    br i1 [[TST]], label [[DO_ADD:%.*]], label [[NO_ADD:%.*]]
 ; AVX:       do.add:
@@ -160,7 +158,7 @@ define double @sumIfVector(double* nocapture readonly %arr) {
 ; AVX-NEXT:    [[AGAIN:%.*]] = icmp ult i32 [[I_NEXT]], 32
 ; AVX-NEXT:    br i1 [[AGAIN]], label [[LOOP]], label [[DONE]], !llvm.loop [[LOOP2:![0-9]+]]
 ; AVX:       done:
-; AVX-NEXT:    [[TOT_NEXT_LCSSA:%.*]] = phi double [ [[TOT_NEXT]], [[NEXT_ITER]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ]
+; AVX-NEXT:    [[TOT_NEXT_LCSSA:%.*]] = phi double [ [[TOT_NEXT]], [[NEXT_ITER]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
 ; AVX-NEXT:    ret double [[TOT_NEXT_LCSSA]]
 ;
 entry:
@@ -170,8 +168,8 @@ loop:
   %i = phi i32 [0, %entry], [%i.next, %next.iter]
   %tot = phi double [0.0, %entry], [%tot.next, %next.iter]
 
-  %addr = getelementptr double, double* %arr, i32 %i
-  %nextval = load double, double* %addr
+  %addr = getelementptr double, ptr %arr, i32 %i
+  %nextval = load double, ptr %addr
 
   %tst = fcmp fast une double %nextval, 42.0
   br i1 %tst, label %do.add, label %no.add

diff  --git a/llvm/test/Transforms/LoopVectorize/X86/invariant-load-gather.ll b/llvm/test/Transforms/LoopVectorize/X86/invariant-load-gather.ll
index f44b4aa94062..fab69c602fce 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/invariant-load-gather.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/invariant-load-gather.ll
@@ -4,84 +4,83 @@
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
-define i32 @inv_load_conditional(i32* %a, i64 %n, i32* %b, i32 %k) {
+define i32 @inv_load_conditional(ptr %a, i64 %n, ptr %b, i32 %k) {
 ; CHECK-LABEL: @inv_load_conditional(
 ; CHECK-NEXT:  iter.check:
 ; CHECK-NEXT:    [[NTRUNC:%.*]] = trunc i64 [[N:%.*]] to i32
-; CHECK-NEXT:    [[SMAX6:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 1)
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX6]], 8
+; CHECK-NEXT:    [[SMAX2:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 1)
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX2]], 8
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
 ; CHECK:       vector.memcheck:
 ; CHECK-NEXT:    [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 1)
-; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i32, i32* [[B:%.*]], i64 [[SMAX]]
-; CHECK-NEXT:    [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 1
-; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ugt i32* [[SCEVGEP4]], [[B]]
-; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ugt i32* [[SCEVGEP]], [[A]]
+; CHECK-NEXT:    [[TMP0:%.*]] = shl i64 [[SMAX]], 2
+; CHECK-NEXT:    [[UGLYGEP:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[UGLYGEP1:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 4
+; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ugt ptr [[UGLYGEP1]], [[B]]
+; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ugt ptr [[UGLYGEP]], [[A]]
 ; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
 ; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label [[VEC_EPILOG_SCALAR_PH]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
 ; CHECK:       vector.main.loop.iter.check:
-; CHECK-NEXT:    [[MIN_ITERS_CHECK7:%.*]] = icmp ult i64 [[SMAX6]], 16
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK7]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT:    [[MIN_ITERS_CHECK3:%.*]] = icmp ult i64 [[SMAX2]], 16
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK3]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[SMAX6]], 9223372036854775792
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i32*> poison, i32* [[A]], i64 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i32*> [[BROADCAST_SPLATINSERT]], <16 x i32*> poison, <16 x i32> zeroinitializer
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT8:%.*]] = insertelement <16 x i32> poison, i32 [[NTRUNC]], i64 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT9:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT8]], <16 x i32> poison, <16 x i32> zeroinitializer
+; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[SMAX2]], 9223372036854775792
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x ptr> poison, ptr [[A]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x ptr> [[BROADCAST_SPLATINSERT]], <16 x ptr> poison, <16 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT4:%.*]] = insertelement <16 x i32> poison, i32 [[NTRUNC]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT5:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT4]], <16 x i32> poison, <16 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <16 x i32>*
-; CHECK-NEXT:    store <16 x i32> [[BROADCAST_SPLAT9]], <16 x i32>* [[TMP1]], align 4, !alias.scope !0, !noalias !3
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]]
+; CHECK-NEXT:    store <16 x i32> [[BROADCAST_SPLAT5]], ptr [[TMP1]], align 4, !alias.scope !0, !noalias !3
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
 ; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne <16 x i32*> [[BROADCAST_SPLAT]], zeroinitializer
-; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[BROADCAST_SPLAT]], i32 4, <16 x i1> [[TMP3]], <16 x i32> poison), !alias.scope !3
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne <16 x ptr> [[BROADCAST_SPLAT]], zeroinitializer
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> [[BROADCAST_SPLAT]], i32 4, <16 x i1> [[TMP3]], <16 x i32> poison), !alias.scope !3
 ; CHECK-NEXT:    [[PREDPHI:%.*]] = select <16 x i1> [[TMP3]], <16 x i32> [[WIDE_MASKED_GATHER]], <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 1>
 ; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <16 x i32> [[PREDPHI]], i64 15
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[SMAX6]], [[N_VEC]]
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[SMAX2]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
 ; CHECK:       vec.epilog.iter.check:
-; CHECK-NEXT:    [[N_VEC_REMAINING:%.*]] = and i64 [[SMAX6]], 8
+; CHECK-NEXT:    [[N_VEC_REMAINING:%.*]] = and i64 [[SMAX2]], 8
 ; CHECK-NEXT:    [[MIN_EPILOG_ITERS_CHECK_NOT_NOT:%.*]] = icmp eq i64 [[N_VEC_REMAINING]], 0
 ; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK_NOT_NOT]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
 ; CHECK:       vec.epilog.ph:
 ; CHECK-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
-; CHECK-NEXT:    [[N_VEC11:%.*]] = and i64 [[SMAX6]], 9223372036854775800
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT15:%.*]] = insertelement <8 x i32*> poison, i32* [[A]], i64 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT16:%.*]] = shufflevector <8 x i32*> [[BROADCAST_SPLATINSERT15]], <8 x i32*> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT17:%.*]] = insertelement <8 x i32> poison, i32 [[NTRUNC]], i64 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT18:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT17]], <8 x i32> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[N_VEC7:%.*]] = and i64 [[SMAX2]], 9223372036854775800
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT11:%.*]] = insertelement <8 x ptr> poison, ptr [[A]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT12:%.*]] = shufflevector <8 x ptr> [[BROADCAST_SPLATINSERT11]], <8 x ptr> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT13:%.*]] = insertelement <8 x i32> poison, i32 [[NTRUNC]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT14:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT13]], <8 x i32> poison, <8 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
 ; CHECK:       vec.epilog.vector.body:
-; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT21:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <8 x i32>*
-; CHECK-NEXT:    store <8 x i32> [[BROADCAST_SPLAT18]], <8 x i32>* [[TMP6]], align 4, !alias.scope !7, !noalias !10
-; CHECK-NEXT:    [[INDEX_NEXT21]] = add nuw i64 [[OFFSET_IDX]], 8
-; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT21]], [[N_VEC11]]
-; CHECK-NEXT:    br i1 [[TMP7]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK-NEXT:    [[INDEX9:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT17:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX9]]
+; CHECK-NEXT:    store <8 x i32> [[BROADCAST_SPLAT14]], ptr [[TMP5]], align 4, !alias.scope !7, !noalias !10
+; CHECK-NEXT:    [[INDEX_NEXT17]] = add nuw i64 [[INDEX9]], 8
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT17]], [[N_VEC7]]
+; CHECK-NEXT:    br i1 [[TMP6]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
 ; CHECK:       vec.epilog.middle.block:
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp ne <8 x i32*> [[BROADCAST_SPLAT16]], zeroinitializer
-; CHECK-NEXT:    [[WIDE_MASKED_GATHER19:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> [[BROADCAST_SPLAT16]], i32 4, <8 x i1> [[TMP8]], <8 x i32> poison), !alias.scope !10
-; CHECK-NEXT:    [[PREDPHI20:%.*]] = select <8 x i1> [[TMP8]], <8 x i32> [[WIDE_MASKED_GATHER19]], <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 1>
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <8 x i32> [[PREDPHI20]], i64 7
-; CHECK-NEXT:    [[CMP_N12:%.*]] = icmp eq i64 [[SMAX6]], [[N_VEC11]]
-; CHECK-NEXT:    br i1 [[CMP_N12]], label [[FOR_END]], label [[VEC_EPILOG_SCALAR_PH]]
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp ne <8 x ptr> [[BROADCAST_SPLAT12]], zeroinitializer
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER15:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[BROADCAST_SPLAT12]], i32 4, <8 x i1> [[TMP7]], <8 x i32> poison), !alias.scope !10
+; CHECK-NEXT:    [[PREDPHI16:%.*]] = select <8 x i1> [[TMP7]], <8 x i32> [[WIDE_MASKED_GATHER15]], <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 1>
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <8 x i32> [[PREDPHI16]], i64 7
+; CHECK-NEXT:    [[CMP_N8:%.*]] = icmp eq i64 [[SMAX2]], [[N_VEC7]]
+; CHECK-NEXT:    br i1 [[CMP_N8]], label [[FOR_END]], label [[VEC_EPILOG_SCALAR_PH]]
 ; CHECK:       vec.epilog.scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC11]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC7]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[LATCH:%.*]] ], [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ]
-; CHECK-NEXT:    [[I1:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[I]]
-; CHECK-NEXT:    [[CMP_NOT:%.*]] = icmp eq i32* [[A]], null
-; CHECK-NEXT:    store i32 [[NTRUNC]], i32* [[I1]], align 4
+; CHECK-NEXT:    [[I1:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[I]]
+; CHECK-NEXT:    [[CMP_NOT:%.*]] = icmp eq ptr [[A]], null
+; CHECK-NEXT:    store i32 [[NTRUNC]], ptr [[I1]], align 4
 ; CHECK-NEXT:    br i1 [[CMP_NOT]], label [[LATCH]], label [[COND_LOAD:%.*]]
 ; CHECK:       cond_load:
-; CHECK-NEXT:    [[ALOAD:%.*]] = load i32, i32* [[A]], align 4
+; CHECK-NEXT:    [[ALOAD:%.*]] = load i32, ptr [[A]], align 4
 ; CHECK-NEXT:    br label [[LATCH]]
 ; CHECK:       latch:
 ; CHECK-NEXT:    [[A_LCSSA:%.*]] = phi i32 [ [[ALOAD]], [[COND_LOAD]] ], [ 1, [[FOR_BODY]] ]
@@ -89,7 +88,7 @@ define i32 @inv_load_conditional(i32* %a, i64 %n, i32* %b, i32 %k) {
 ; CHECK-NEXT:    [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
 ; CHECK-NEXT:    br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP14:![0-9]+]]
 ; CHECK:       for.end:
-; CHECK-NEXT:    [[A_LCSSA_LCSSA:%.*]] = phi i32 [ [[A_LCSSA]], [[LATCH]] ], [ [[TMP4]], [[MIDDLE_BLOCK]] ], [ [[TMP9]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[A_LCSSA_LCSSA:%.*]] = phi i32 [ [[A_LCSSA]], [[LATCH]] ], [ [[TMP4]], [[MIDDLE_BLOCK]] ], [ [[TMP8]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    ret i32 [[A_LCSSA_LCSSA]]
 ;
 entry:
@@ -98,14 +97,14 @@ entry:
 
 for.body:                                         ; preds = %for.body, %entry
   %i = phi i64 [ %i.next, %latch ], [ 0, %entry ]
-  %i1 = getelementptr inbounds i32, i32* %b, i64 %i
-  %i2 = load i32, i32* %i1, align 8
-  %cmp = icmp ne i32* %a, null
-  store i32 %ntrunc, i32* %i1
+  %i1 = getelementptr inbounds i32, ptr %b, i64 %i
+  %i2 = load i32, ptr %i1, align 8
+  %cmp = icmp ne ptr %a, null
+  store i32 %ntrunc, ptr %i1
   br i1 %cmp, label %cond_load, label %latch
 
 cond_load:
-  %aload = load i32, i32* %a, align 4
+  %aload = load i32, ptr %a, align 4
   br label %latch
 
 latch:
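
A second effect, visible in the gather checks in this file, is that overloaded masked intrinsics lose the pointee type from their mangled suffix: a vector-of-pointers operand now contributes only its address space. A sketch of the renaming (declarations only, with the operand types assumed from the checks above):

  ; before: <16 x i32*> is mangled as v16p0i32
  declare <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*>, i32, <16 x i1>, <16 x i32>)

  ; after: <16 x ptr> is mangled as just v16p0
  declare <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr>, i32, <16 x i1>, <16 x i32>)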

diff --git a/llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll b/llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll
index b45103ce4853..1f5336d1b6f0 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll
@@ -7,101 +7,97 @@ target triple = "x86_64-unknown-linux-gnu"
 ; first test checks that loop with a reduction and a uniform store gets
 ; vectorized.
 
-define i32 @inv_val_store_to_inv_address_with_reduction(i32* %a, i64 %n, i32* %b) {
+define i32 @inv_val_store_to_inv_address_with_reduction(ptr %a, i64 %n, ptr %b) {
 ; CHECK-LABEL: @inv_val_store_to_inv_address_with_reduction(
 ; CHECK-NEXT:  iter.check:
 ; CHECK-NEXT:    [[NTRUNC:%.*]] = trunc i64 [[N:%.*]] to i32
-; CHECK-NEXT:    [[SMAX6:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 1)
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX6]], 8
+; CHECK-NEXT:    [[SMAX2:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 1)
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX2]], 8
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
 ; CHECK:       vector.memcheck:
-; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 1
+; CHECK-NEXT:    [[UGLYGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 4
 ; CHECK-NEXT:    [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 1)
-; CHECK-NEXT:    [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[B:%.*]], i64 [[SMAX]]
-; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ugt i32* [[SCEVGEP4]], [[A]]
-; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ugt i32* [[SCEVGEP]], [[B]]
+; CHECK-NEXT:    [[TMP0:%.*]] = shl i64 [[SMAX]], 2
+; CHECK-NEXT:    [[UGLYGEP1:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ugt ptr [[UGLYGEP1]], [[A]]
+; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ugt ptr [[UGLYGEP]], [[B]]
 ; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
 ; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label [[VEC_EPILOG_SCALAR_PH]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
 ; CHECK:       vector.main.loop.iter.check:
-; CHECK-NEXT:    [[MIN_ITERS_CHECK7:%.*]] = icmp ult i64 [[SMAX6]], 64
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK7]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT:    [[MIN_ITERS_CHECK3:%.*]] = icmp ult i64 [[SMAX2]], 64
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK3]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[SMAX6]], 9223372036854775744
+; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[SMAX2]], 9223372036854775744
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI8:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI9:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI10:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <16 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i32>, <16 x i32>* [[TMP1]], align 8, !alias.scope !0
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 16
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <16 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD11:%.*]] = load <16 x i32>, <16 x i32>* [[TMP3]], align 8, !alias.scope !0
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 32
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <16 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD12:%.*]] = load <16 x i32>, <16 x i32>* [[TMP5]], align 8, !alias.scope !0
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 48
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <16 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD13:%.*]] = load <16 x i32>, <16 x i32>* [[TMP7]], align 8, !alias.scope !0
-; CHECK-NEXT:    [[TMP8]] = add <16 x i32> [[VEC_PHI]], [[WIDE_LOAD]]
-; CHECK-NEXT:    [[TMP9]] = add <16 x i32> [[VEC_PHI8]], [[WIDE_LOAD11]]
-; CHECK-NEXT:    [[TMP10]] = add <16 x i32> [[VEC_PHI9]], [[WIDE_LOAD12]]
-; CHECK-NEXT:    [[TMP11]] = add <16 x i32> [[VEC_PHI10]], [[WIDE_LOAD13]]
-; CHECK-NEXT:    store i32 [[NTRUNC]], i32* [[A]], align 4, !alias.scope !3, !noalias !0
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI4:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI5:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI6:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i32>, ptr [[TMP1]], align 8, !alias.scope !0
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 16
+; CHECK-NEXT:    [[WIDE_LOAD7:%.*]] = load <16 x i32>, ptr [[TMP2]], align 8, !alias.scope !0
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 32
+; CHECK-NEXT:    [[WIDE_LOAD8:%.*]] = load <16 x i32>, ptr [[TMP3]], align 8, !alias.scope !0
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 48
+; CHECK-NEXT:    [[WIDE_LOAD9:%.*]] = load <16 x i32>, ptr [[TMP4]], align 8, !alias.scope !0
+; CHECK-NEXT:    [[TMP5]] = add <16 x i32> [[VEC_PHI]], [[WIDE_LOAD]]
+; CHECK-NEXT:    [[TMP6]] = add <16 x i32> [[VEC_PHI4]], [[WIDE_LOAD7]]
+; CHECK-NEXT:    [[TMP7]] = add <16 x i32> [[VEC_PHI5]], [[WIDE_LOAD8]]
+; CHECK-NEXT:    [[TMP8]] = add <16 x i32> [[VEC_PHI6]], [[WIDE_LOAD9]]
+; CHECK-NEXT:    store i32 [[NTRUNC]], ptr [[A]], align 4, !alias.scope !3, !noalias !0
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 64
-; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[BIN_RDX:%.*]] = add <16 x i32> [[TMP9]], [[TMP8]]
-; CHECK-NEXT:    [[BIN_RDX14:%.*]] = add <16 x i32> [[TMP10]], [[BIN_RDX]]
-; CHECK-NEXT:    [[BIN_RDX15:%.*]] = add <16 x i32> [[TMP11]], [[BIN_RDX14]]
-; CHECK-NEXT:    [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[BIN_RDX15]])
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[SMAX6]], [[N_VEC]]
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = add <16 x i32> [[TMP6]], [[TMP5]]
+; CHECK-NEXT:    [[BIN_RDX10:%.*]] = add <16 x i32> [[TMP7]], [[BIN_RDX]]
+; CHECK-NEXT:    [[BIN_RDX11:%.*]] = add <16 x i32> [[TMP8]], [[BIN_RDX10]]
+; CHECK-NEXT:    [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[BIN_RDX11]])
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[SMAX2]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
 ; CHECK:       vec.epilog.iter.check:
-; CHECK-NEXT:    [[N_VEC_REMAINING:%.*]] = and i64 [[SMAX6]], 56
+; CHECK-NEXT:    [[N_VEC_REMAINING:%.*]] = and i64 [[SMAX2]], 56
 ; CHECK-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp eq i64 [[N_VEC_REMAINING]], 0
 ; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
 ; CHECK:       vec.epilog.ph:
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ], [ [[TMP13]], [[VEC_EPILOG_ITER_CHECK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ], [ [[TMP10]], [[VEC_EPILOG_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ]
-; CHECK-NEXT:    [[N_VEC17:%.*]] = and i64 [[SMAX6]], 9223372036854775800
-; CHECK-NEXT:    [[TMP14:%.*]] = insertelement <8 x i32> <i32 poison, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>, i32 [[BC_MERGE_RDX]], i64 0
+; CHECK-NEXT:    [[N_VEC13:%.*]] = and i64 [[SMAX2]], 9223372036854775800
+; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <8 x i32> <i32 poison, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>, i32 [[BC_MERGE_RDX]], i64 0
 ; CHECK-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
 ; CHECK:       vec.epilog.vector.body:
-; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT22:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI20:%.*]] = phi <8 x i32> [ [[TMP14]], [[VEC_EPILOG_PH]] ], [ [[TMP17:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT:    [[TMP16:%.*]] = bitcast i32* [[TMP15]] to <8 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD21:%.*]] = load <8 x i32>, <8 x i32>* [[TMP16]], align 8, !alias.scope !7
-; CHECK-NEXT:    [[TMP17]] = add <8 x i32> [[VEC_PHI20]], [[WIDE_LOAD21]]
-; CHECK-NEXT:    store i32 [[NTRUNC]], i32* [[A]], align 4, !alias.scope !10, !noalias !7
-; CHECK-NEXT:    [[INDEX_NEXT22]] = add nuw i64 [[OFFSET_IDX]], 8
-; CHECK-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT22]], [[N_VEC17]]
-; CHECK-NEXT:    br i1 [[TMP18]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK-NEXT:    [[INDEX15:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT18:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI16:%.*]] = phi <8 x i32> [ [[TMP11]], [[VEC_EPILOG_PH]] ], [ [[TMP13:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX15]]
+; CHECK-NEXT:    [[WIDE_LOAD17:%.*]] = load <8 x i32>, ptr [[TMP12]], align 8, !alias.scope !7
+; CHECK-NEXT:    [[TMP13]] = add <8 x i32> [[VEC_PHI16]], [[WIDE_LOAD17]]
+; CHECK-NEXT:    store i32 [[NTRUNC]], ptr [[A]], align 4, !alias.scope !10, !noalias !7
+; CHECK-NEXT:    [[INDEX_NEXT18]] = add nuw i64 [[INDEX15]], 8
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT18]], [[N_VEC13]]
+; CHECK-NEXT:    br i1 [[TMP14]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
 ; CHECK:       vec.epilog.middle.block:
-; CHECK-NEXT:    [[TMP19:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP17]])
-; CHECK-NEXT:    [[CMP_N18:%.*]] = icmp eq i64 [[SMAX6]], [[N_VEC17]]
-; CHECK-NEXT:    br i1 [[CMP_N18]], label [[FOR_END]], label [[VEC_EPILOG_SCALAR_PH]]
+; CHECK-NEXT:    [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP13]])
+; CHECK-NEXT:    [[CMP_N14:%.*]] = icmp eq i64 [[SMAX2]], [[N_VEC13]]
+; CHECK-NEXT:    br i1 [[CMP_N14]], label [[FOR_END]], label [[VEC_EPILOG_SCALAR_PH]]
 ; CHECK:       vec.epilog.scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC17]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX23:%.*]] = phi i32 [ [[TMP19]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[TMP13]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC13]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX19:%.*]] = phi i32 [ [[TMP15]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[TMP10]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ]
-; CHECK-NEXT:    [[T0:%.*]] = phi i32 [ [[T3:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX23]], [[VEC_EPILOG_SCALAR_PH]] ]
-; CHECK-NEXT:    [[T1:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[I]]
-; CHECK-NEXT:    [[T2:%.*]] = load i32, i32* [[T1]], align 8
+; CHECK-NEXT:    [[T0:%.*]] = phi i32 [ [[T3:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX19]], [[VEC_EPILOG_SCALAR_PH]] ]
+; CHECK-NEXT:    [[T1:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[I]]
+; CHECK-NEXT:    [[T2:%.*]] = load i32, ptr [[T1]], align 8
 ; CHECK-NEXT:    [[T3]] = add i32 [[T0]], [[T2]]
-; CHECK-NEXT:    store i32 [[NTRUNC]], i32* [[A]], align 4
+; CHECK-NEXT:    store i32 [[NTRUNC]], ptr [[A]], align 4
 ; CHECK-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 1
 ; CHECK-NEXT:    [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
 ; CHECK-NEXT:    br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP14:![0-9]+]]
 ; CHECK:       for.end:
-; CHECK-NEXT:    [[T4:%.*]] = phi i32 [ [[T3]], [[FOR_BODY]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ], [ [[TMP19]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[T4:%.*]] = phi i32 [ [[T3]], [[FOR_BODY]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ], [ [[TMP15]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    ret i32 [[T4]]
 ;
 entry:
@@ -111,10 +107,10 @@ entry:
 for.body:                                         ; preds = %for.body, %entry
   %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
   %t0 = phi i32 [ %t3, %for.body ], [ 0, %entry ]
-  %t1 = getelementptr inbounds i32, i32* %b, i64 %i
-  %t2 = load i32, i32* %t1, align 8
+  %t1 = getelementptr inbounds i32, ptr %b, i64 %i
+  %t2 = load i32, ptr %t1, align 8
   %t3 = add i32 %t0, %t2
-  store i32 %ntrunc, i32* %a
+  store i32 %ntrunc, ptr %a
   %i.next = add nuw nsw i64 %i, 1
   %cond = icmp slt i64 %i.next, %n
   br i1 %cond, label %for.body, label %for.end
@@ -126,89 +122,86 @@ for.end:                                          ; preds = %for.body
 
 ; Conditional store
 ; if (b[i] == k) a = ntrunc
-define void @inv_val_store_to_inv_address_conditional(i32* %a, i64 %n, i32* %b, i32 %k) {
+define void @inv_val_store_to_inv_address_conditional(ptr %a, i64 %n, ptr %b, i32 %k) {
 ; CHECK-LABEL: @inv_val_store_to_inv_address_conditional(
 ; CHECK-NEXT:  iter.check:
 ; CHECK-NEXT:    [[NTRUNC:%.*]] = trunc i64 [[N:%.*]] to i32
-; CHECK-NEXT:    [[SMAX6:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 1)
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX6]], 8
+; CHECK-NEXT:    [[SMAX2:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 1)
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX2]], 8
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
 ; CHECK:       vector.memcheck:
 ; CHECK-NEXT:    [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 1)
-; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i32, i32* [[B:%.*]], i64 [[SMAX]]
-; CHECK-NEXT:    [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 1
-; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ugt i32* [[SCEVGEP4]], [[B]]
-; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ugt i32* [[SCEVGEP]], [[A]]
+; CHECK-NEXT:    [[TMP0:%.*]] = shl i64 [[SMAX]], 2
+; CHECK-NEXT:    [[UGLYGEP:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[UGLYGEP1:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 4
+; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ugt ptr [[UGLYGEP1]], [[B]]
+; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ugt ptr [[UGLYGEP]], [[A]]
 ; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
 ; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label [[VEC_EPILOG_SCALAR_PH]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
 ; CHECK:       vector.main.loop.iter.check:
-; CHECK-NEXT:    [[MIN_ITERS_CHECK7:%.*]] = icmp ult i64 [[SMAX6]], 16
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK7]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT:    [[MIN_ITERS_CHECK3:%.*]] = icmp ult i64 [[SMAX2]], 16
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK3]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[SMAX6]], 9223372036854775792
+; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[SMAX2]], 9223372036854775792
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i32> poison, i32 [[K:%.*]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT]], <16 x i32> poison, <16 x i32> zeroinitializer
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT8:%.*]] = insertelement <16 x i32> poison, i32 [[NTRUNC]], i64 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT9:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT8]], <16 x i32> poison, <16 x i32> zeroinitializer
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT10:%.*]] = insertelement <16 x i32*> poison, i32* [[A]], i64 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT11:%.*]] = shufflevector <16 x i32*> [[BROADCAST_SPLATINSERT10]], <16 x i32*> poison, <16 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT4:%.*]] = insertelement <16 x i32> poison, i32 [[NTRUNC]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT5:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT4]], <16 x i32> poison, <16 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT6:%.*]] = insertelement <16 x ptr> poison, ptr [[A]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT7:%.*]] = shufflevector <16 x ptr> [[BROADCAST_SPLATINSERT6]], <16 x ptr> poison, <16 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <16 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i32>, <16 x i32>* [[TMP1]], align 8, !alias.scope !15, !noalias !18
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i32>, ptr [[TMP1]], align 8, !alias.scope !15, !noalias !18
 ; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq <16 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP0]] to <16 x i32>*
-; CHECK-NEXT:    store <16 x i32> [[BROADCAST_SPLAT9]], <16 x i32>* [[TMP3]], align 4, !alias.scope !15, !noalias !18
-; CHECK-NEXT:    call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> [[BROADCAST_SPLAT9]], <16 x i32*> [[BROADCAST_SPLAT11]], i32 4, <16 x i1> [[TMP2]]), !alias.scope !18
+; CHECK-NEXT:    store <16 x i32> [[BROADCAST_SPLAT5]], ptr [[TMP1]], align 4, !alias.scope !15, !noalias !18
+; CHECK-NEXT:    call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> [[BROADCAST_SPLAT5]], <16 x ptr> [[BROADCAST_SPLAT7]], i32 4, <16 x i1> [[TMP2]]), !alias.scope !18
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[SMAX6]], [[N_VEC]]
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[SMAX2]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
 ; CHECK:       vec.epilog.iter.check:
-; CHECK-NEXT:    [[N_VEC_REMAINING:%.*]] = and i64 [[SMAX6]], 8
+; CHECK-NEXT:    [[N_VEC_REMAINING:%.*]] = and i64 [[SMAX2]], 8
 ; CHECK-NEXT:    [[MIN_EPILOG_ITERS_CHECK_NOT_NOT:%.*]] = icmp eq i64 [[N_VEC_REMAINING]], 0
 ; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK_NOT_NOT]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
 ; CHECK:       vec.epilog.ph:
 ; CHECK-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
-; CHECK-NEXT:    [[N_VEC13:%.*]] = and i64 [[SMAX6]], 9223372036854775800
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT17:%.*]] = insertelement <8 x i32> poison, i32 [[K]], i64 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT18:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT17]], <8 x i32> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT19:%.*]] = insertelement <8 x i32> poison, i32 [[NTRUNC]], i64 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT20:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT19]], <8 x i32> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT21:%.*]] = insertelement <8 x i32*> poison, i32* [[A]], i64 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT22:%.*]] = shufflevector <8 x i32*> [[BROADCAST_SPLATINSERT21]], <8 x i32*> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[N_VEC9:%.*]] = and i64 [[SMAX2]], 9223372036854775800
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT13:%.*]] = insertelement <8 x i32> poison, i32 [[K]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT14:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT13]], <8 x i32> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT15:%.*]] = insertelement <8 x i32> poison, i32 [[NTRUNC]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT16:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT15]], <8 x i32> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT17:%.*]] = insertelement <8 x ptr> poison, ptr [[A]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT18:%.*]] = shufflevector <8 x ptr> [[BROADCAST_SPLATINSERT17]], <8 x ptr> poison, <8 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
 ; CHECK:       vec.epilog.vector.body:
-; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT23:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <8 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD16:%.*]] = load <8 x i32>, <8 x i32>* [[TMP6]], align 8, !alias.scope !21, !noalias !24
-; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq <8 x i32> [[WIDE_LOAD16]], [[BROADCAST_SPLAT18]]
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i32* [[TMP5]] to <8 x i32>*
-; CHECK-NEXT:    store <8 x i32> [[BROADCAST_SPLAT20]], <8 x i32>* [[TMP8]], align 4, !alias.scope !21, !noalias !24
-; CHECK-NEXT:    call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> [[BROADCAST_SPLAT20]], <8 x i32*> [[BROADCAST_SPLAT22]], i32 4, <8 x i1> [[TMP7]]), !alias.scope !24
-; CHECK-NEXT:    [[INDEX_NEXT23]] = add nuw i64 [[OFFSET_IDX]], 8
-; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT23]], [[N_VEC13]]
-; CHECK-NEXT:    br i1 [[TMP9]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
+; CHECK-NEXT:    [[INDEX11:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT19:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX11]]
+; CHECK-NEXT:    [[WIDE_LOAD12:%.*]] = load <8 x i32>, ptr [[TMP4]], align 8, !alias.scope !21, !noalias !24
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq <8 x i32> [[WIDE_LOAD12]], [[BROADCAST_SPLAT14]]
+; CHECK-NEXT:    store <8 x i32> [[BROADCAST_SPLAT16]], ptr [[TMP4]], align 4, !alias.scope !21, !noalias !24
+; CHECK-NEXT:    call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> [[BROADCAST_SPLAT16]], <8 x ptr> [[BROADCAST_SPLAT18]], i32 4, <8 x i1> [[TMP5]]), !alias.scope !24
+; CHECK-NEXT:    [[INDEX_NEXT19]] = add nuw i64 [[INDEX11]], 8
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT19]], [[N_VEC9]]
+; CHECK-NEXT:    br i1 [[TMP6]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
 ; CHECK:       vec.epilog.middle.block:
-; CHECK-NEXT:    [[CMP_N14:%.*]] = icmp eq i64 [[SMAX6]], [[N_VEC13]]
-; CHECK-NEXT:    br i1 [[CMP_N14]], label [[FOR_END]], label [[VEC_EPILOG_SCALAR_PH]]
+; CHECK-NEXT:    [[CMP_N10:%.*]] = icmp eq i64 [[SMAX2]], [[N_VEC9]]
+; CHECK-NEXT:    br i1 [[CMP_N10]], label [[FOR_END]], label [[VEC_EPILOG_SCALAR_PH]]
 ; CHECK:       vec.epilog.scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC13]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC9]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[LATCH:%.*]] ], [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ]
-; CHECK-NEXT:    [[T1:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[I]]
-; CHECK-NEXT:    [[T2:%.*]] = load i32, i32* [[T1]], align 8
+; CHECK-NEXT:    [[T1:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[I]]
+; CHECK-NEXT:    [[T2:%.*]] = load i32, ptr [[T1]], align 8
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[T2]], [[K]]
-; CHECK-NEXT:    store i32 [[NTRUNC]], i32* [[T1]], align 4
+; CHECK-NEXT:    store i32 [[NTRUNC]], ptr [[T1]], align 4
 ; CHECK-NEXT:    br i1 [[CMP]], label [[COND_STORE:%.*]], label [[LATCH]]
 ; CHECK:       cond_store:
-; CHECK-NEXT:    store i32 [[NTRUNC]], i32* [[A]], align 4
+; CHECK-NEXT:    store i32 [[NTRUNC]], ptr [[A]], align 4
 ; CHECK-NEXT:    br label [[LATCH]]
 ; CHECK:       latch:
 ; CHECK-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 1
@@ -223,14 +216,14 @@ entry:
 
 for.body:                                         ; preds = %for.body, %entry
   %i = phi i64 [ %i.next, %latch ], [ 0, %entry ]
-  %t1 = getelementptr inbounds i32, i32* %b, i64 %i
-  %t2 = load i32, i32* %t1, align 8
+  %t1 = getelementptr inbounds i32, ptr %b, i64 %i
+  %t2 = load i32, ptr %t1, align 8
   %cmp = icmp eq i32 %t2, %k
-  store i32 %ntrunc, i32* %t1
+  store i32 %ntrunc, ptr %t1
   br i1 %cmp, label %cond_store, label %latch
 
 cond_store:
-  store i32 %ntrunc, i32* %a
+  store i32 %ntrunc, ptr %a
   br label %latch
 
 latch:
@@ -242,106 +235,101 @@ for.end:                                          ; preds = %for.body
   ret void
 }
 
-define void @variant_val_store_to_inv_address_conditional(i32* %a, i64 %n, i32* %b, i32* %c, i32 %k) {
+define void @variant_val_store_to_inv_address_conditional(ptr %a, i64 %n, ptr %b, ptr %c, i32 %k) {
 ; CHECK-LABEL: @variant_val_store_to_inv_address_conditional(
 ; CHECK-NEXT:  iter.check:
 ; CHECK-NEXT:    [[NTRUNC:%.*]] = trunc i64 [[N:%.*]] to i32
-; CHECK-NEXT:    [[SMAX16:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 1)
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX16]], 8
+; CHECK-NEXT:    [[SMAX10:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 1)
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX10]], 8
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
 ; CHECK:       vector.memcheck:
 ; CHECK-NEXT:    [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 1)
-; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i32, i32* [[B:%.*]], i64 [[SMAX]]
-; CHECK-NEXT:    [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 1
-; CHECK-NEXT:    [[SCEVGEP7:%.*]] = getelementptr i32, i32* [[C:%.*]], i64 [[SMAX]]
-; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ugt i32* [[SCEVGEP4]], [[B]]
-; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ugt i32* [[SCEVGEP]], [[A]]
+; CHECK-NEXT:    [[TMP0:%.*]] = shl i64 [[SMAX]], 2
+; CHECK-NEXT:    [[UGLYGEP:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[UGLYGEP1:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 4
+; CHECK-NEXT:    [[UGLYGEP2:%.*]] = getelementptr i8, ptr [[C:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ugt ptr [[UGLYGEP1]], [[B]]
+; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ugt ptr [[UGLYGEP]], [[A]]
 ; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
-; CHECK-NEXT:    [[BOUND09:%.*]] = icmp ugt i32* [[SCEVGEP7]], [[B]]
-; CHECK-NEXT:    [[BOUND110:%.*]] = icmp ugt i32* [[SCEVGEP]], [[C]]
-; CHECK-NEXT:    [[FOUND_CONFLICT11:%.*]] = and i1 [[BOUND09]], [[BOUND110]]
-; CHECK-NEXT:    [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT11]]
-; CHECK-NEXT:    [[BOUND012:%.*]] = icmp ugt i32* [[SCEVGEP7]], [[A]]
-; CHECK-NEXT:    [[BOUND113:%.*]] = icmp ugt i32* [[SCEVGEP4]], [[C]]
-; CHECK-NEXT:    [[FOUND_CONFLICT14:%.*]] = and i1 [[BOUND012]], [[BOUND113]]
-; CHECK-NEXT:    [[CONFLICT_RDX15:%.*]] = or i1 [[CONFLICT_RDX]], [[FOUND_CONFLICT14]]
-; CHECK-NEXT:    br i1 [[CONFLICT_RDX15]], label [[VEC_EPILOG_SCALAR_PH]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
+; CHECK-NEXT:    [[BOUND03:%.*]] = icmp ugt ptr [[UGLYGEP2]], [[B]]
+; CHECK-NEXT:    [[BOUND14:%.*]] = icmp ugt ptr [[UGLYGEP]], [[C]]
+; CHECK-NEXT:    [[FOUND_CONFLICT5:%.*]] = and i1 [[BOUND03]], [[BOUND14]]
+; CHECK-NEXT:    [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]]
+; CHECK-NEXT:    [[BOUND06:%.*]] = icmp ugt ptr [[UGLYGEP2]], [[A]]
+; CHECK-NEXT:    [[BOUND17:%.*]] = icmp ugt ptr [[UGLYGEP1]], [[C]]
+; CHECK-NEXT:    [[FOUND_CONFLICT8:%.*]] = and i1 [[BOUND06]], [[BOUND17]]
+; CHECK-NEXT:    [[CONFLICT_RDX9:%.*]] = or i1 [[CONFLICT_RDX]], [[FOUND_CONFLICT8]]
+; CHECK-NEXT:    br i1 [[CONFLICT_RDX9]], label [[VEC_EPILOG_SCALAR_PH]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
 ; CHECK:       vector.main.loop.iter.check:
-; CHECK-NEXT:    [[MIN_ITERS_CHECK17:%.*]] = icmp ult i64 [[SMAX16]], 16
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK17]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT:    [[MIN_ITERS_CHECK11:%.*]] = icmp ult i64 [[SMAX10]], 16
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK11]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[SMAX16]], 9223372036854775792
+; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[SMAX10]], 9223372036854775792
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i32> poison, i32 [[K:%.*]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT]], <16 x i32> poison, <16 x i32> zeroinitializer
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT18:%.*]] = insertelement <16 x i32> poison, i32 [[NTRUNC]], i64 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT19:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT18]], <16 x i32> poison, <16 x i32> zeroinitializer
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT20:%.*]] = insertelement <16 x i32*> poison, i32* [[A]], i64 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT21:%.*]] = shufflevector <16 x i32*> [[BROADCAST_SPLATINSERT20]], <16 x i32*> poison, <16 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT12:%.*]] = insertelement <16 x i32> poison, i32 [[NTRUNC]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT13:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT12]], <16 x i32> poison, <16 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT14:%.*]] = insertelement <16 x ptr> poison, ptr [[A]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT15:%.*]] = shufflevector <16 x ptr> [[BROADCAST_SPLATINSERT14]], <16 x ptr> poison, <16 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <16 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i32>, <16 x i32>* [[TMP1]], align 8, !alias.scope !28, !noalias !31
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i32>, ptr [[TMP1]], align 8, !alias.scope !28, !noalias !31
 ; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq <16 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP0]] to <16 x i32>*
-; CHECK-NEXT:    store <16 x i32> [[BROADCAST_SPLAT19]], <16 x i32>* [[TMP3]], align 4, !alias.scope !28, !noalias !31
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i32, i32* [[C]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <16 x i32>*
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP5]], i32 8, <16 x i1> [[TMP2]], <16 x i32> poison), !alias.scope !34
-; CHECK-NEXT:    call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> [[WIDE_MASKED_LOAD]], <16 x i32*> [[BROADCAST_SPLAT21]], i32 4, <16 x i1> [[TMP2]]), !alias.scope !35, !noalias !34
+; CHECK-NEXT:    store <16 x i32> [[BROADCAST_SPLAT13]], ptr [[TMP1]], align 4, !alias.scope !28, !noalias !31
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i32, ptr [[C]], i64 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP3]], i32 8, <16 x i1> [[TMP2]], <16 x i32> poison), !alias.scope !34
+; CHECK-NEXT:    call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> [[WIDE_MASKED_LOAD]], <16 x ptr> [[BROADCAST_SPLAT15]], i32 4, <16 x i1> [[TMP2]]), !alias.scope !35, !noalias !34
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]]
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[SMAX16]], [[N_VEC]]
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[SMAX10]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
 ; CHECK:       vec.epilog.iter.check:
-; CHECK-NEXT:    [[N_VEC_REMAINING:%.*]] = and i64 [[SMAX16]], 8
+; CHECK-NEXT:    [[N_VEC_REMAINING:%.*]] = and i64 [[SMAX10]], 8
 ; CHECK-NEXT:    [[MIN_EPILOG_ITERS_CHECK_NOT_NOT:%.*]] = icmp eq i64 [[N_VEC_REMAINING]], 0
 ; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK_NOT_NOT]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
 ; CHECK:       vec.epilog.ph:
 ; CHECK-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
-; CHECK-NEXT:    [[N_VEC23:%.*]] = and i64 [[SMAX16]], 9223372036854775800
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT27:%.*]] = insertelement <8 x i32> poison, i32 [[K]], i64 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT28:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT27]], <8 x i32> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT29:%.*]] = insertelement <8 x i32> poison, i32 [[NTRUNC]], i64 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT30:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT29]], <8 x i32> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT32:%.*]] = insertelement <8 x i32*> poison, i32* [[A]], i64 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT33:%.*]] = shufflevector <8 x i32*> [[BROADCAST_SPLATINSERT32]], <8 x i32*> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[N_VEC17:%.*]] = and i64 [[SMAX10]], 9223372036854775800
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT21:%.*]] = insertelement <8 x i32> poison, i32 [[K]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT22:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT21]], <8 x i32> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT23:%.*]] = insertelement <8 x i32> poison, i32 [[NTRUNC]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT24:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT23]], <8 x i32> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT26:%.*]] = insertelement <8 x ptr> poison, ptr [[A]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT27:%.*]] = shufflevector <8 x ptr> [[BROADCAST_SPLATINSERT26]], <8 x ptr> poison, <8 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
 ; CHECK:       vec.epilog.vector.body:
-; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT34:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i32* [[TMP7]] to <8 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD26:%.*]] = load <8 x i32>, <8 x i32>* [[TMP8]], align 8, !alias.scope !37, !noalias !40
-; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq <8 x i32> [[WIDE_LOAD26]], [[BROADCAST_SPLAT28]]
-; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i32* [[TMP7]] to <8 x i32>*
-; CHECK-NEXT:    store <8 x i32> [[BROADCAST_SPLAT30]], <8 x i32>* [[TMP10]], align 4, !alias.scope !37, !noalias !40
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr i32, i32* [[C]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <8 x i32>*
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD31:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP12]], i32 8, <8 x i1> [[TMP9]], <8 x i32> poison), !alias.scope !43
-; CHECK-NEXT:    call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> [[WIDE_MASKED_LOAD31]], <8 x i32*> [[BROADCAST_SPLAT33]], i32 4, <8 x i1> [[TMP9]]), !alias.scope !44, !noalias !43
-; CHECK-NEXT:    [[INDEX_NEXT34]] = add nuw i64 [[OFFSET_IDX]], 8
-; CHECK-NEXT:    [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT34]], [[N_VEC23]]
-; CHECK-NEXT:    br i1 [[TMP13]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP45:![0-9]+]]
+; CHECK-NEXT:    [[INDEX19:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT28:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX19]]
+; CHECK-NEXT:    [[WIDE_LOAD20:%.*]] = load <8 x i32>, ptr [[TMP5]], align 8, !alias.scope !37, !noalias !40
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq <8 x i32> [[WIDE_LOAD20]], [[BROADCAST_SPLAT22]]
+; CHECK-NEXT:    store <8 x i32> [[BROADCAST_SPLAT24]], ptr [[TMP5]], align 4, !alias.scope !37, !noalias !40
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i32, ptr [[C]], i64 [[INDEX19]]
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD25:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[TMP7]], i32 8, <8 x i1> [[TMP6]], <8 x i32> poison), !alias.scope !43
+; CHECK-NEXT:    call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> [[WIDE_MASKED_LOAD25]], <8 x ptr> [[BROADCAST_SPLAT27]], i32 4, <8 x i1> [[TMP6]]), !alias.scope !44, !noalias !43
+; CHECK-NEXT:    [[INDEX_NEXT28]] = add nuw i64 [[INDEX19]], 8
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT28]], [[N_VEC17]]
+; CHECK-NEXT:    br i1 [[TMP8]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP45:![0-9]+]]
 ; CHECK:       vec.epilog.middle.block:
-; CHECK-NEXT:    [[CMP_N24:%.*]] = icmp eq i64 [[SMAX16]], [[N_VEC23]]
-; CHECK-NEXT:    br i1 [[CMP_N24]], label [[FOR_END]], label [[VEC_EPILOG_SCALAR_PH]]
+; CHECK-NEXT:    [[CMP_N18:%.*]] = icmp eq i64 [[SMAX10]], [[N_VEC17]]
+; CHECK-NEXT:    br i1 [[CMP_N18]], label [[FOR_END]], label [[VEC_EPILOG_SCALAR_PH]]
 ; CHECK:       vec.epilog.scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC23]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC17]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[LATCH:%.*]] ], [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ]
-; CHECK-NEXT:    [[T1:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[I]]
-; CHECK-NEXT:    [[T2:%.*]] = load i32, i32* [[T1]], align 8
+; CHECK-NEXT:    [[T1:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[I]]
+; CHECK-NEXT:    [[T2:%.*]] = load i32, ptr [[T1]], align 8
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[T2]], [[K]]
-; CHECK-NEXT:    store i32 [[NTRUNC]], i32* [[T1]], align 4
+; CHECK-NEXT:    store i32 [[NTRUNC]], ptr [[T1]], align 4
 ; CHECK-NEXT:    br i1 [[CMP]], label [[COND_STORE:%.*]], label [[LATCH]]
 ; CHECK:       cond_store:
-; CHECK-NEXT:    [[T3:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 [[I]]
-; CHECK-NEXT:    [[T4:%.*]] = load i32, i32* [[T3]], align 8
-; CHECK-NEXT:    store i32 [[T4]], i32* [[A]], align 4
+; CHECK-NEXT:    [[T3:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[I]]
+; CHECK-NEXT:    [[T4:%.*]] = load i32, ptr [[T3]], align 8
+; CHECK-NEXT:    store i32 [[T4]], ptr [[A]], align 4
 ; CHECK-NEXT:    br label [[LATCH]]
 ; CHECK:       latch:
 ; CHECK-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 1
@@ -356,16 +344,16 @@ entry:
 
 for.body:                                         ; preds = %for.body, %entry
   %i = phi i64 [ %i.next, %latch ], [ 0, %entry ]
-  %t1 = getelementptr inbounds i32, i32* %b, i64 %i
-  %t2 = load i32, i32* %t1, align 8
+  %t1 = getelementptr inbounds i32, ptr %b, i64 %i
+  %t2 = load i32, ptr %t1, align 8
   %cmp = icmp eq i32 %t2, %k
-  store i32 %ntrunc, i32* %t1
+  store i32 %ntrunc, ptr %t1
   br i1 %cmp, label %cond_store, label %latch
 
 cond_store:
-  %t3 = getelementptr inbounds i32, i32* %c, i64 %i
-  %t4 = load i32, i32* %t3, align 8
-  store i32 %t4, i32* %a
+  %t3 = getelementptr inbounds i32, ptr %c, i64 %i
+  %t4 = load i32, ptr %t3, align 8
+  store i32 %t4, ptr %a
   br label %latch
 
 latch:
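
The vector.memcheck blocks in this file also change shape: with opaque pointers, SCEV expansion emits byte-based bounds (hence the UGLYGEP names) instead of element-typed GEPs, scaling the element count by the element size explicitly. A sketch of the equivalence for %smax elements of i32 (placeholder names, not taken verbatim from the patch):

  ; before: end of range as an element-typed GEP
  %end = getelementptr i32, i32* %b, i64 %smax

  ; after: scale by sizeof(i32) = 4, then take an i8 GEP
  %bytes = shl i64 %smax, 2
  %end = getelementptr i8, ptr %b, i64 %bytes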

diff --git a/llvm/test/Transforms/LoopVectorize/X86/limit-vf-by-tripcount.ll b/llvm/test/Transforms/LoopVectorize/X86/limit-vf-by-tripcount.ll
index 17fafe50a759..e5aed27a9f24 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/limit-vf-by-tripcount.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/limit-vf-by-tripcount.ll
@@ -5,7 +5,7 @@ target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16
 target triple = "x86_64-unknown-linux-gnu"
 
 ; TODO: Make sure selected VF for the epilog loop doesn't exceed remaining TC.
-define void @test1(i8 * noalias %src, i8 * noalias %dst) #0 {
+define void @test1(ptr noalias %src, ptr noalias %dst) #0 {
 ; CHECK-LABEL: @test1(
 ; CHECK-NEXT:  iter.check:
 ; CHECK-NEXT:    br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
@@ -16,17 +16,15 @@ define void @test1(i8 * noalias %src, i8 * noalias %dst) #0 {
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, i8* [[SRC:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <16 x i8>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, <16 x i8>* [[TMP3]], align 64
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[DST:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i8* [[TMP5]] to <16 x i8>*
-; CHECK-NEXT:    store <16 x i8> [[WIDE_LOAD]], <16 x i8>* [[TMP6]], align 64
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 64
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0
+; CHECK-NEXT:    store <16 x i8> [[WIDE_LOAD]], ptr [[TMP4]], align 64
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
-; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
+; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 17, 16
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
@@ -36,19 +34,17 @@ define void @test1(i8 * noalias %src, i8 * noalias %dst) #0 {
 ; CHECK-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 16, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; CHECK-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
 ; CHECK:       vec.epilog.vector.body:
-; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT4:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP8:%.*]] = add i64 [[OFFSET_IDX]], 0
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, i8* [[SRC]], i64 [[TMP8]]
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, i8* [[TMP9]], i32 0
-; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i8* [[TMP10]] to <8 x i8>*
-; CHECK-NEXT:    [[WIDE_LOAD3:%.*]] = load <8 x i8>, <8 x i8>* [[TMP11]], align 64
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, i8* [[DST]], i64 [[TMP8]]
-; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i8, i8* [[TMP12]], i32 0
-; CHECK-NEXT:    [[TMP14:%.*]] = bitcast i8* [[TMP13]] to <8 x i8>*
-; CHECK-NEXT:    store <8 x i8> [[WIDE_LOAD3]], <8 x i8>* [[TMP14]], align 64
-; CHECK-NEXT:    [[INDEX_NEXT4]] = add nuw i64 [[OFFSET_IDX]], 8
-; CHECK-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT4]], 16
-; CHECK-NEXT:    br i1 [[TMP15]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK-NEXT:    [[INDEX2:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT4:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX2]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD3:%.*]] = load <8 x i8>, ptr [[TMP8]], align 64
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[TMP6]]
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i32 0
+; CHECK-NEXT:    store <8 x i8> [[WIDE_LOAD3]], ptr [[TMP10]], align 64
+; CHECK-NEXT:    [[INDEX_NEXT4]] = add nuw i64 [[INDEX2]], 8
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT4]], 16
+; CHECK-NEXT:    br i1 [[TMP11]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
 ; CHECK:       vec.epilog.middle.block:
 ; CHECK-NEXT:    [[CMP_N1:%.*]] = icmp eq i64 17, 16
 ; CHECK-NEXT:    br i1 [[CMP_N1]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]]
@@ -57,10 +53,10 @@ define void @test1(i8 * noalias %src, i8 * noalias %dst) #0 {
 ; CHECK-NEXT:    br label [[LOOP_MEMCPY_EXPANSION:%.*]]
 ; CHECK:       loop-memcpy-expansion:
 ; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
-; CHECK-NEXT:    [[LDADDR:%.*]] = getelementptr inbounds i8, i8* [[SRC]], i64 [[I]]
-; CHECK-NEXT:    [[VAL:%.*]] = load i8, i8* [[LDADDR]], align 64
-; CHECK-NEXT:    [[STADDR:%.*]] = getelementptr inbounds i8, i8* [[DST]], i64 [[I]]
-; CHECK-NEXT:    store i8 [[VAL]], i8* [[STADDR]], align 64
+; CHECK-NEXT:    [[LDADDR:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[I]]
+; CHECK-NEXT:    [[VAL:%.*]] = load i8, ptr [[LDADDR]], align 64
+; CHECK-NEXT:    [[STADDR:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[I]]
+; CHECK-NEXT:    store i8 [[VAL]], ptr [[STADDR]], align 64
 ; CHECK-NEXT:    [[I_NEXT]] = add i64 [[I]], 1
 ; CHECK-NEXT:    [[IS_NEXT:%.*]] = icmp ult i64 [[I_NEXT]], 17
 ; CHECK-NEXT:    br i1 [[IS_NEXT]], label [[LOOP_MEMCPY_EXPANSION]], label [[EXIT]], !llvm.loop [[LOOP4:![0-9]+]]
@@ -72,10 +68,10 @@ entry:
 
 loop-memcpy-expansion:
   %i = phi i64 [ 0, %entry ], [ %i.next, %loop-memcpy-expansion ]
-  %ldaddr = getelementptr inbounds i8, i8 * %src, i64 %i
-  %val = load i8, i8 * %ldaddr, align 64
-  %staddr = getelementptr inbounds i8, i8 * %dst, i64 %i
-  store i8 %val, i8 * %staddr, align 64
+  %ldaddr = getelementptr inbounds i8, ptr %src, i64 %i
+  %val = load i8, ptr %ldaddr, align 64
+  %staddr = getelementptr inbounds i8, ptr %dst, i64 %i
+  store i8 %val, ptr %staddr, align 64
   %i.next = add i64 %i, 1
   %is.next = icmp ult i64 %i.next, 17
   br i1 %is.next, label %loop-memcpy-expansion, label %exit

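The TODO at the top of this file is easy to quantify from the checks above: with a trip count of 17, the main vector loop at VF 16 runs exactly once (hence the "icmp eq i64 17, 16" in middle.block), leaving a single remaining iteration, yet the epilog loop is still vectorized at VF 8:

    trip count    = 17
    main loop VF  = 16   ; one full iteration, covering [0, 16)
    remaining TC  = 17 - 16 = 1
    epilog VF     = 8    ; 8 > 1, so the epilog vector body has no work

so the epilog vector loop is effectively dead and the final element is always handled by the scalar remainder loop.
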
diff  --git a/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll b/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll
index 8f9a9164de80..40a449f71172 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll
@@ -8,7 +8,7 @@ target triple = "x86_64-unknown-linux-gnu"
 ;; We don't need a masked.load for this due to deref facts, and can instead
 ;; use a plain vector load.
 
-declare void @init(i32* nocapture nofree)
+declare void @init(ptr nocapture nofree)
 
 ;; For ease of explanation, this one demonstrates
 ;; with a range check, but there are better lowering options specifically for
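
To make the file comment concrete: without dereferenceability facts a predicated load must stay a masked memory operation, while with them the vectorizer may load unconditionally and apply the mask afterwards, which is exactly the load-plus-select shape in the CHECK lines below. A rough sketch of the two forms, assuming a <4 x i32> access through ptr %p under mask %m:

    ; conservative: mask enforced at the memory operation itself
    %v = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %p, i32 4, <4 x i1> %m, <4 x i32> zeroinitializer)

    ; with deref facts: plain wide load, mask applied as a select
    %wide = load <4 x i32>, ptr %p, align 4
    %v = select <4 x i1> %m, <4 x i32> %wide, <4 x i32> zeroinitializer
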
@@ -18,8 +18,7 @@ define i32 @test_explicit_pred(i64 %len) {
 ; CHECK-LABEL: @test_explicit_pred(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[ALLOCA:%.*]] = alloca [4096 x i32], align 4
-; CHECK-NEXT:    [[BASE:%.*]] = bitcast [4096 x i32]* [[ALLOCA]] to i32*
-; CHECK-NEXT:    call void @init(i32* [[BASE]])
+; CHECK-NEXT:    call void @init(ptr [[ALLOCA]])
 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[LEN:%.*]], i32 0
@@ -34,10 +33,10 @@ define i32 @test_explicit_pred(i64 %len) {
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP24:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI4:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP25:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI5:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP26:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI6:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP27:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP20:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI4:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI5:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP22:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI6:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
 ; CHECK-NEXT:    [[STEP_ADD1:%.*]] = add <4 x i64> [[STEP_ADD]], <i64 4, i64 4, i64 4, i64 4>
 ; CHECK-NEXT:    [[STEP_ADD2:%.*]] = add <4 x i64> [[STEP_ADD1]], <i64 4, i64 4, i64 4, i64 4>
@@ -49,48 +48,44 @@ define i32 @test_explicit_pred(i64 %len) {
 ; CHECK-NEXT:    [[TMP5:%.*]] = icmp slt <4 x i64> [[STEP_ADD]], [[BROADCAST_SPLAT8]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = icmp slt <4 x i64> [[STEP_ADD1]], [[BROADCAST_SPLAT10]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = icmp slt <4 x i64> [[STEP_ADD2]], [[BROADCAST_SPLAT12]]
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP2]]
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP3]]
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr i32, i32* [[TMP8]], i32 0
-; CHECK-NEXT:    [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP13]], align 4
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr i32, i32* [[TMP8]], i32 4
-; CHECK-NEXT:    [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD13:%.*]] = load <4 x i32>, <4 x i32>* [[TMP15]], align 4
-; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr i32, i32* [[TMP8]], i32 8
-; CHECK-NEXT:    [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD14:%.*]] = load <4 x i32>, <4 x i32>* [[TMP17]], align 4
-; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr i32, i32* [[TMP8]], i32 12
-; CHECK-NEXT:    [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD15:%.*]] = load <4 x i32>, <4 x i32>* [[TMP19]], align 4
-; CHECK-NEXT:    [[TMP20:%.*]] = xor <4 x i1> [[TMP4]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT:    [[TMP21:%.*]] = xor <4 x i1> [[TMP5]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT:    [[TMP22:%.*]] = xor <4 x i1> [[TMP6]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT:    [[TMP23:%.*]] = xor <4 x i1> [[TMP7]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP2]]
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr i32, ptr [[TMP8]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP12]], align 4
+; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr i32, ptr [[TMP8]], i32 4
+; CHECK-NEXT:    [[WIDE_LOAD13:%.*]] = load <4 x i32>, ptr [[TMP13]], align 4
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr i32, ptr [[TMP8]], i32 8
+; CHECK-NEXT:    [[WIDE_LOAD14:%.*]] = load <4 x i32>, ptr [[TMP14]], align 4
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr i32, ptr [[TMP8]], i32 12
+; CHECK-NEXT:    [[WIDE_LOAD15:%.*]] = load <4 x i32>, ptr [[TMP15]], align 4
+; CHECK-NEXT:    [[TMP16:%.*]] = xor <4 x i1> [[TMP4]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT:    [[TMP17:%.*]] = xor <4 x i1> [[TMP5]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT:    [[TMP18:%.*]] = xor <4 x i1> [[TMP6]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT:    [[TMP19:%.*]] = xor <4 x i1> [[TMP7]], <i1 true, i1 true, i1 true, i1 true>
 ; CHECK-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[TMP4]], <4 x i32> [[WIDE_LOAD]], <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[PREDPHI16:%.*]] = select <4 x i1> [[TMP5]], <4 x i32> [[WIDE_LOAD13]], <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[PREDPHI17:%.*]] = select <4 x i1> [[TMP6]], <4 x i32> [[WIDE_LOAD14]], <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[PREDPHI18:%.*]] = select <4 x i1> [[TMP7]], <4 x i32> [[WIDE_LOAD15]], <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP24]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]]
-; CHECK-NEXT:    [[TMP25]] = add <4 x i32> [[VEC_PHI4]], [[PREDPHI16]]
-; CHECK-NEXT:    [[TMP26]] = add <4 x i32> [[VEC_PHI5]], [[PREDPHI17]]
-; CHECK-NEXT:    [[TMP27]] = add <4 x i32> [[VEC_PHI6]], [[PREDPHI18]]
+; CHECK-NEXT:    [[TMP20]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]]
+; CHECK-NEXT:    [[TMP21]] = add <4 x i32> [[VEC_PHI4]], [[PREDPHI16]]
+; CHECK-NEXT:    [[TMP22]] = add <4 x i32> [[VEC_PHI5]], [[PREDPHI17]]
+; CHECK-NEXT:    [[TMP23]] = add <4 x i32> [[VEC_PHI6]], [[PREDPHI18]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD2]], <i64 4, i64 4, i64 4, i64 4>
-; CHECK-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
-; CHECK-NEXT:    br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
+; CHECK-NEXT:    br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[BIN_RDX:%.*]] = add <4 x i32> [[TMP25]], [[TMP24]]
-; CHECK-NEXT:    [[BIN_RDX19:%.*]] = add <4 x i32> [[TMP26]], [[BIN_RDX]]
-; CHECK-NEXT:    [[BIN_RDX20:%.*]] = add <4 x i32> [[TMP27]], [[BIN_RDX19]]
-; CHECK-NEXT:    [[TMP29:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX20]])
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = add <4 x i32> [[TMP21]], [[TMP20]]
+; CHECK-NEXT:    [[BIN_RDX19:%.*]] = add <4 x i32> [[TMP22]], [[BIN_RDX]]
+; CHECK-NEXT:    [[BIN_RDX20:%.*]] = add <4 x i32> [[TMP23]], [[BIN_RDX19]]
+; CHECK-NEXT:    [[TMP25:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX20]])
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 4096, 4096
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP29]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP25]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ]
@@ -99,8 +94,8 @@ define i32 @test_explicit_pred(i64 %len) {
 ; CHECK-NEXT:    [[EARLYCND:%.*]] = icmp slt i64 [[IV]], [[LEN]]
 ; CHECK-NEXT:    br i1 [[EARLYCND]], label [[PRED:%.*]], label [[LATCH]]
 ; CHECK:       pred:
-; CHECK-NEXT:    [[ADDR:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[IV]]
-; CHECK-NEXT:    [[VAL:%.*]] = load i32, i32* [[ADDR]], align 4
+; CHECK-NEXT:    [[ADDR:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[IV]]
+; CHECK-NEXT:    [[VAL:%.*]] = load i32, ptr [[ADDR]], align 4
 ; CHECK-NEXT:    br label [[LATCH]]
 ; CHECK:       latch:
 ; CHECK-NEXT:    [[VAL_PHI:%.*]] = phi i32 [ 0, [[LOOP]] ], [ [[VAL]], [[PRED]] ]
@@ -108,13 +103,12 @@ define i32 @test_explicit_pred(i64 %len) {
 ; CHECK-NEXT:    [[EXIT:%.*]] = icmp ugt i64 [[IV]], 4094
 ; CHECK-NEXT:    br i1 [[EXIT]], label [[LOOP_EXIT]], label [[LOOP]], !llvm.loop [[LOOP2:![0-9]+]]
 ; CHECK:       loop_exit:
-; CHECK-NEXT:    [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP29]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP25]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    ret i32 [[ACCUM_NEXT_LCSSA]]
 ;
 entry:
   %alloca = alloca [4096 x i32]
-  %base = bitcast [4096 x i32]* %alloca to i32*
-  call void @init(i32* %base)
+  call void @init(ptr %alloca)
   br label %loop
 loop:
   %iv = phi i64 [ 0, %entry ], [ %iv.next, %latch ]
@@ -123,8 +117,8 @@ loop:
   %earlycnd = icmp slt i64 %iv, %len
   br i1 %earlycnd, label %pred, label %latch
 pred:
-  %addr = getelementptr inbounds i32, i32* %base, i64 %iv
-  %val = load i32, i32* %addr
+  %addr = getelementptr inbounds i32, ptr %alloca, i64 %iv
+  %val = load i32, ptr %addr
   br label %latch
 latch:
   %val.phi = phi i32 [0, %loop], [%val, %pred]
@@ -137,21 +131,20 @@ loop_exit:
 }
 
 ;; Similar to the above, but without an analyzable condition.
-define i32 @test_explicit_pred_generic(i64 %len, i1* %test_base) {
+define i32 @test_explicit_pred_generic(i64 %len, ptr %test_base) {
 ; CHECK-LABEL: @test_explicit_pred_generic(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[ALLOCA:%.*]] = alloca [4096 x i32], align 4
-; CHECK-NEXT:    [[BASE:%.*]] = bitcast [4096 x i32]* [[ALLOCA]] to i32*
-; CHECK-NEXT:    call void @init(i32* [[BASE]])
+; CHECK-NEXT:    call void @init(ptr [[ALLOCA]])
 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP80:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP81:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP82:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP83:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP76:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP77:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP78:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP79:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 2
@@ -168,106 +161,102 @@ define i32 @test_explicit_pred_generic(i64 %len, i1* %test_base) {
 ; CHECK-NEXT:    [[TMP13:%.*]] = add i64 [[INDEX]], 13
 ; CHECK-NEXT:    [[TMP14:%.*]] = add i64 [[INDEX]], 14
 ; CHECK-NEXT:    [[TMP15:%.*]] = add i64 [[INDEX]], 15
-; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP2]]
-; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP3]]
-; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP5]]
-; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP6]]
-; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP7]]
-; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP8]]
-; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP9]]
-; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP10]]
-; CHECK-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP11]]
-; CHECK-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP12]]
-; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP13]]
-; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP14]]
-; CHECK-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP15]]
-; CHECK-NEXT:    [[TMP32:%.*]] = load i1, i1* [[TMP16]], align 1
-; CHECK-NEXT:    [[TMP33:%.*]] = load i1, i1* [[TMP17]], align 1
-; CHECK-NEXT:    [[TMP34:%.*]] = load i1, i1* [[TMP18]], align 1
-; CHECK-NEXT:    [[TMP35:%.*]] = load i1, i1* [[TMP19]], align 1
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP2]]
+; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP5]]
+; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP6]]
+; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP7]]
+; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP8]]
+; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP9]]
+; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP10]]
+; CHECK-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP11]]
+; CHECK-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP12]]
+; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP13]]
+; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP14]]
+; CHECK-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP15]]
+; CHECK-NEXT:    [[TMP32:%.*]] = load i1, ptr [[TMP16]], align 1
+; CHECK-NEXT:    [[TMP33:%.*]] = load i1, ptr [[TMP17]], align 1
+; CHECK-NEXT:    [[TMP34:%.*]] = load i1, ptr [[TMP18]], align 1
+; CHECK-NEXT:    [[TMP35:%.*]] = load i1, ptr [[TMP19]], align 1
 ; CHECK-NEXT:    [[TMP36:%.*]] = insertelement <4 x i1> poison, i1 [[TMP32]], i32 0
 ; CHECK-NEXT:    [[TMP37:%.*]] = insertelement <4 x i1> [[TMP36]], i1 [[TMP33]], i32 1
 ; CHECK-NEXT:    [[TMP38:%.*]] = insertelement <4 x i1> [[TMP37]], i1 [[TMP34]], i32 2
 ; CHECK-NEXT:    [[TMP39:%.*]] = insertelement <4 x i1> [[TMP38]], i1 [[TMP35]], i32 3
-; CHECK-NEXT:    [[TMP40:%.*]] = load i1, i1* [[TMP20]], align 1
-; CHECK-NEXT:    [[TMP41:%.*]] = load i1, i1* [[TMP21]], align 1
-; CHECK-NEXT:    [[TMP42:%.*]] = load i1, i1* [[TMP22]], align 1
-; CHECK-NEXT:    [[TMP43:%.*]] = load i1, i1* [[TMP23]], align 1
+; CHECK-NEXT:    [[TMP40:%.*]] = load i1, ptr [[TMP20]], align 1
+; CHECK-NEXT:    [[TMP41:%.*]] = load i1, ptr [[TMP21]], align 1
+; CHECK-NEXT:    [[TMP42:%.*]] = load i1, ptr [[TMP22]], align 1
+; CHECK-NEXT:    [[TMP43:%.*]] = load i1, ptr [[TMP23]], align 1
 ; CHECK-NEXT:    [[TMP44:%.*]] = insertelement <4 x i1> poison, i1 [[TMP40]], i32 0
 ; CHECK-NEXT:    [[TMP45:%.*]] = insertelement <4 x i1> [[TMP44]], i1 [[TMP41]], i32 1
 ; CHECK-NEXT:    [[TMP46:%.*]] = insertelement <4 x i1> [[TMP45]], i1 [[TMP42]], i32 2
 ; CHECK-NEXT:    [[TMP47:%.*]] = insertelement <4 x i1> [[TMP46]], i1 [[TMP43]], i32 3
-; CHECK-NEXT:    [[TMP48:%.*]] = load i1, i1* [[TMP24]], align 1
-; CHECK-NEXT:    [[TMP49:%.*]] = load i1, i1* [[TMP25]], align 1
-; CHECK-NEXT:    [[TMP50:%.*]] = load i1, i1* [[TMP26]], align 1
-; CHECK-NEXT:    [[TMP51:%.*]] = load i1, i1* [[TMP27]], align 1
+; CHECK-NEXT:    [[TMP48:%.*]] = load i1, ptr [[TMP24]], align 1
+; CHECK-NEXT:    [[TMP49:%.*]] = load i1, ptr [[TMP25]], align 1
+; CHECK-NEXT:    [[TMP50:%.*]] = load i1, ptr [[TMP26]], align 1
+; CHECK-NEXT:    [[TMP51:%.*]] = load i1, ptr [[TMP27]], align 1
 ; CHECK-NEXT:    [[TMP52:%.*]] = insertelement <4 x i1> poison, i1 [[TMP48]], i32 0
 ; CHECK-NEXT:    [[TMP53:%.*]] = insertelement <4 x i1> [[TMP52]], i1 [[TMP49]], i32 1
 ; CHECK-NEXT:    [[TMP54:%.*]] = insertelement <4 x i1> [[TMP53]], i1 [[TMP50]], i32 2
 ; CHECK-NEXT:    [[TMP55:%.*]] = insertelement <4 x i1> [[TMP54]], i1 [[TMP51]], i32 3
-; CHECK-NEXT:    [[TMP56:%.*]] = load i1, i1* [[TMP28]], align 1
-; CHECK-NEXT:    [[TMP57:%.*]] = load i1, i1* [[TMP29]], align 1
-; CHECK-NEXT:    [[TMP58:%.*]] = load i1, i1* [[TMP30]], align 1
-; CHECK-NEXT:    [[TMP59:%.*]] = load i1, i1* [[TMP31]], align 1
+; CHECK-NEXT:    [[TMP56:%.*]] = load i1, ptr [[TMP28]], align 1
+; CHECK-NEXT:    [[TMP57:%.*]] = load i1, ptr [[TMP29]], align 1
+; CHECK-NEXT:    [[TMP58:%.*]] = load i1, ptr [[TMP30]], align 1
+; CHECK-NEXT:    [[TMP59:%.*]] = load i1, ptr [[TMP31]], align 1
 ; CHECK-NEXT:    [[TMP60:%.*]] = insertelement <4 x i1> poison, i1 [[TMP56]], i32 0
 ; CHECK-NEXT:    [[TMP61:%.*]] = insertelement <4 x i1> [[TMP60]], i1 [[TMP57]], i32 1
 ; CHECK-NEXT:    [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 2
 ; CHECK-NEXT:    [[TMP63:%.*]] = insertelement <4 x i1> [[TMP62]], i1 [[TMP59]], i32 3
-; CHECK-NEXT:    [[TMP64:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP65:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP66:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP8]]
-; CHECK-NEXT:    [[TMP67:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP12]]
-; CHECK-NEXT:    [[TMP68:%.*]] = getelementptr i32, i32* [[TMP64]], i32 0
-; CHECK-NEXT:    [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP69]], align 4
-; CHECK-NEXT:    [[TMP70:%.*]] = getelementptr i32, i32* [[TMP64]], i32 4
-; CHECK-NEXT:    [[TMP71:%.*]] = bitcast i32* [[TMP70]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP71]], align 4
-; CHECK-NEXT:    [[TMP72:%.*]] = getelementptr i32, i32* [[TMP64]], i32 8
-; CHECK-NEXT:    [[TMP73:%.*]] = bitcast i32* [[TMP72]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP73]], align 4
-; CHECK-NEXT:    [[TMP74:%.*]] = getelementptr i32, i32* [[TMP64]], i32 12
-; CHECK-NEXT:    [[TMP75:%.*]] = bitcast i32* [[TMP74]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP75]], align 4
-; CHECK-NEXT:    [[TMP76:%.*]] = xor <4 x i1> [[TMP39]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT:    [[TMP77:%.*]] = xor <4 x i1> [[TMP47]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT:    [[TMP78:%.*]] = xor <4 x i1> [[TMP55]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT:    [[TMP79:%.*]] = xor <4 x i1> [[TMP63]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT:    [[TMP64:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP65:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP66:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP8]]
+; CHECK-NEXT:    [[TMP67:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP12]]
+; CHECK-NEXT:    [[TMP68:%.*]] = getelementptr i32, ptr [[TMP64]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP68]], align 4
+; CHECK-NEXT:    [[TMP69:%.*]] = getelementptr i32, ptr [[TMP64]], i32 4
+; CHECK-NEXT:    [[WIDE_LOAD4:%.*]] = load <4 x i32>, ptr [[TMP69]], align 4
+; CHECK-NEXT:    [[TMP70:%.*]] = getelementptr i32, ptr [[TMP64]], i32 8
+; CHECK-NEXT:    [[WIDE_LOAD5:%.*]] = load <4 x i32>, ptr [[TMP70]], align 4
+; CHECK-NEXT:    [[TMP71:%.*]] = getelementptr i32, ptr [[TMP64]], i32 12
+; CHECK-NEXT:    [[WIDE_LOAD6:%.*]] = load <4 x i32>, ptr [[TMP71]], align 4
+; CHECK-NEXT:    [[TMP72:%.*]] = xor <4 x i1> [[TMP39]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT:    [[TMP73:%.*]] = xor <4 x i1> [[TMP47]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT:    [[TMP74:%.*]] = xor <4 x i1> [[TMP55]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT:    [[TMP75:%.*]] = xor <4 x i1> [[TMP63]], <i1 true, i1 true, i1 true, i1 true>
 ; CHECK-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[TMP39]], <4 x i32> [[WIDE_LOAD]], <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[PREDPHI7:%.*]] = select <4 x i1> [[TMP47]], <4 x i32> [[WIDE_LOAD4]], <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[PREDPHI8:%.*]] = select <4 x i1> [[TMP55]], <4 x i32> [[WIDE_LOAD5]], <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[PREDPHI9:%.*]] = select <4 x i1> [[TMP63]], <4 x i32> [[WIDE_LOAD6]], <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP80]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]]
-; CHECK-NEXT:    [[TMP81]] = add <4 x i32> [[VEC_PHI1]], [[PREDPHI7]]
-; CHECK-NEXT:    [[TMP82]] = add <4 x i32> [[VEC_PHI2]], [[PREDPHI8]]
-; CHECK-NEXT:    [[TMP83]] = add <4 x i32> [[VEC_PHI3]], [[PREDPHI9]]
+; CHECK-NEXT:    [[TMP76]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]]
+; CHECK-NEXT:    [[TMP77]] = add <4 x i32> [[VEC_PHI1]], [[PREDPHI7]]
+; CHECK-NEXT:    [[TMP78]] = add <4 x i32> [[VEC_PHI2]], [[PREDPHI8]]
+; CHECK-NEXT:    [[TMP79]] = add <4 x i32> [[VEC_PHI3]], [[PREDPHI9]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; CHECK-NEXT:    [[TMP84:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
-; CHECK-NEXT:    br i1 [[TMP84]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT:    [[TMP80:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
+; CHECK-NEXT:    br i1 [[TMP80]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[BIN_RDX:%.*]] = add <4 x i32> [[TMP81]], [[TMP80]]
-; CHECK-NEXT:    [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP82]], [[BIN_RDX]]
-; CHECK-NEXT:    [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP83]], [[BIN_RDX10]]
-; CHECK-NEXT:    [[TMP85:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]])
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = add <4 x i32> [[TMP77]], [[TMP76]]
+; CHECK-NEXT:    [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP78]], [[BIN_RDX]]
+; CHECK-NEXT:    [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP79]], [[BIN_RDX10]]
+; CHECK-NEXT:    [[TMP81:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]])
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 4096, 4096
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP85]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP81]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ]
 ; CHECK-NEXT:    [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[LATCH]] ]
 ; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT:    [[TEST_ADDR:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[IV]]
-; CHECK-NEXT:    [[EARLYCND:%.*]] = load i1, i1* [[TEST_ADDR]], align 1
+; CHECK-NEXT:    [[TEST_ADDR:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[IV]]
+; CHECK-NEXT:    [[EARLYCND:%.*]] = load i1, ptr [[TEST_ADDR]], align 1
 ; CHECK-NEXT:    br i1 [[EARLYCND]], label [[PRED:%.*]], label [[LATCH]]
 ; CHECK:       pred:
-; CHECK-NEXT:    [[ADDR:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[IV]]
-; CHECK-NEXT:    [[VAL:%.*]] = load i32, i32* [[ADDR]], align 4
+; CHECK-NEXT:    [[ADDR:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[IV]]
+; CHECK-NEXT:    [[VAL:%.*]] = load i32, ptr [[ADDR]], align 4
 ; CHECK-NEXT:    br label [[LATCH]]
 ; CHECK:       latch:
 ; CHECK-NEXT:    [[VAL_PHI:%.*]] = phi i32 [ 0, [[LOOP]] ], [ [[VAL]], [[PRED]] ]
@@ -275,24 +264,23 @@ define i32 @test_explicit_pred_generic(i64 %len, i1* %test_base) {
 ; CHECK-NEXT:    [[EXIT:%.*]] = icmp ugt i64 [[IV]], 4094
 ; CHECK-NEXT:    br i1 [[EXIT]], label [[LOOP_EXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
 ; CHECK:       loop_exit:
-; CHECK-NEXT:    [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP85]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP81]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    ret i32 [[ACCUM_NEXT_LCSSA]]
 ;
 entry:
   %alloca = alloca [4096 x i32]
-  %base = bitcast [4096 x i32]* %alloca to i32*
-  call void @init(i32* %base)
+  call void @init(ptr %alloca)
   br label %loop
 loop:
   %iv = phi i64 [ 0, %entry ], [ %iv.next, %latch ]
   %accum = phi i32 [ 0, %entry ], [ %accum.next, %latch ]
   %iv.next = add i64 %iv, 1
-  %test_addr = getelementptr inbounds i1, i1* %test_base, i64 %iv
-  %earlycnd = load i1, i1* %test_addr
+  %test_addr = getelementptr inbounds i1, ptr %test_base, i64 %iv
+  %earlycnd = load i1, ptr %test_addr
   br i1 %earlycnd, label %pred, label %latch
 pred:
-  %addr = getelementptr inbounds i32, i32* %base, i64 %iv
-  %val = load i32, i32* %addr
+  %addr = getelementptr inbounds i32, ptr %alloca, i64 %iv
+  %val = load i32, ptr %addr
   br label %latch
 latch:
   %val.phi = phi i32 [0, %loop], [%val, %pred]
@@ -308,12 +296,11 @@ loop_exit:
 ; there are better lowerings, this is a test of robustness of vectorization,
 ; nothing more.)
 ; TODO: currently shows predication which can be removed
-define i32 @test_invariant_address(i64 %len, i1* %test_base) {
+define i32 @test_invariant_address(i64 %len, ptr %test_base) {
 ; CHECK-LABEL: @test_invariant_address(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[ALLOCA:%.*]] = alloca [4096 x i32], align 4
-; CHECK-NEXT:    [[BASE:%.*]] = bitcast [4096 x i32]* [[ALLOCA]] to i32*
-; CHECK-NEXT:    call void @init(i32* [[BASE]])
+; CHECK-NEXT:    call void @init(ptr [[ALLOCA]])
 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
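
On the TODO above: the checks that follow show the loop-invariant load scalarized per lane (sixteen identical "load i32, ptr [[ALLOCA]]" instructions reassembled into vectors with insertelement), where a single scalar load plus a splat would do. A sketch of the simpler shape, using the usual splat idiom rather than anything the test currently checks for:

    %v     = load i32, ptr %alloca, align 4
    %ins   = insertelement <4 x i32> poison, i32 %v, i32 0
    %splat = shufflevector <4 x i32> %ins, <4 x i32> poison, <4 x i32> zeroinitializer
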
@@ -339,82 +326,82 @@ define i32 @test_invariant_address(i64 %len, i1* %test_base) {
 ; CHECK-NEXT:    [[TMP13:%.*]] = add i64 [[INDEX]], 13
 ; CHECK-NEXT:    [[TMP14:%.*]] = add i64 [[INDEX]], 14
 ; CHECK-NEXT:    [[TMP15:%.*]] = add i64 [[INDEX]], 15
-; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP2]]
-; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP3]]
-; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP5]]
-; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP6]]
-; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP7]]
-; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP8]]
-; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP9]]
-; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP10]]
-; CHECK-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP11]]
-; CHECK-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP12]]
-; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP13]]
-; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP14]]
-; CHECK-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP15]]
-; CHECK-NEXT:    [[TMP32:%.*]] = load i1, i1* [[TMP16]], align 1
-; CHECK-NEXT:    [[TMP33:%.*]] = load i1, i1* [[TMP17]], align 1
-; CHECK-NEXT:    [[TMP34:%.*]] = load i1, i1* [[TMP18]], align 1
-; CHECK-NEXT:    [[TMP35:%.*]] = load i1, i1* [[TMP19]], align 1
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP2]]
+; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP5]]
+; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP6]]
+; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP7]]
+; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP8]]
+; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP9]]
+; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP10]]
+; CHECK-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP11]]
+; CHECK-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP12]]
+; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP13]]
+; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP14]]
+; CHECK-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP15]]
+; CHECK-NEXT:    [[TMP32:%.*]] = load i1, ptr [[TMP16]], align 1
+; CHECK-NEXT:    [[TMP33:%.*]] = load i1, ptr [[TMP17]], align 1
+; CHECK-NEXT:    [[TMP34:%.*]] = load i1, ptr [[TMP18]], align 1
+; CHECK-NEXT:    [[TMP35:%.*]] = load i1, ptr [[TMP19]], align 1
 ; CHECK-NEXT:    [[TMP36:%.*]] = insertelement <4 x i1> poison, i1 [[TMP32]], i32 0
 ; CHECK-NEXT:    [[TMP37:%.*]] = insertelement <4 x i1> [[TMP36]], i1 [[TMP33]], i32 1
 ; CHECK-NEXT:    [[TMP38:%.*]] = insertelement <4 x i1> [[TMP37]], i1 [[TMP34]], i32 2
 ; CHECK-NEXT:    [[TMP39:%.*]] = insertelement <4 x i1> [[TMP38]], i1 [[TMP35]], i32 3
-; CHECK-NEXT:    [[TMP40:%.*]] = load i1, i1* [[TMP20]], align 1
-; CHECK-NEXT:    [[TMP41:%.*]] = load i1, i1* [[TMP21]], align 1
-; CHECK-NEXT:    [[TMP42:%.*]] = load i1, i1* [[TMP22]], align 1
-; CHECK-NEXT:    [[TMP43:%.*]] = load i1, i1* [[TMP23]], align 1
+; CHECK-NEXT:    [[TMP40:%.*]] = load i1, ptr [[TMP20]], align 1
+; CHECK-NEXT:    [[TMP41:%.*]] = load i1, ptr [[TMP21]], align 1
+; CHECK-NEXT:    [[TMP42:%.*]] = load i1, ptr [[TMP22]], align 1
+; CHECK-NEXT:    [[TMP43:%.*]] = load i1, ptr [[TMP23]], align 1
 ; CHECK-NEXT:    [[TMP44:%.*]] = insertelement <4 x i1> poison, i1 [[TMP40]], i32 0
 ; CHECK-NEXT:    [[TMP45:%.*]] = insertelement <4 x i1> [[TMP44]], i1 [[TMP41]], i32 1
 ; CHECK-NEXT:    [[TMP46:%.*]] = insertelement <4 x i1> [[TMP45]], i1 [[TMP42]], i32 2
 ; CHECK-NEXT:    [[TMP47:%.*]] = insertelement <4 x i1> [[TMP46]], i1 [[TMP43]], i32 3
-; CHECK-NEXT:    [[TMP48:%.*]] = load i1, i1* [[TMP24]], align 1
-; CHECK-NEXT:    [[TMP49:%.*]] = load i1, i1* [[TMP25]], align 1
-; CHECK-NEXT:    [[TMP50:%.*]] = load i1, i1* [[TMP26]], align 1
-; CHECK-NEXT:    [[TMP51:%.*]] = load i1, i1* [[TMP27]], align 1
+; CHECK-NEXT:    [[TMP48:%.*]] = load i1, ptr [[TMP24]], align 1
+; CHECK-NEXT:    [[TMP49:%.*]] = load i1, ptr [[TMP25]], align 1
+; CHECK-NEXT:    [[TMP50:%.*]] = load i1, ptr [[TMP26]], align 1
+; CHECK-NEXT:    [[TMP51:%.*]] = load i1, ptr [[TMP27]], align 1
 ; CHECK-NEXT:    [[TMP52:%.*]] = insertelement <4 x i1> poison, i1 [[TMP48]], i32 0
 ; CHECK-NEXT:    [[TMP53:%.*]] = insertelement <4 x i1> [[TMP52]], i1 [[TMP49]], i32 1
 ; CHECK-NEXT:    [[TMP54:%.*]] = insertelement <4 x i1> [[TMP53]], i1 [[TMP50]], i32 2
 ; CHECK-NEXT:    [[TMP55:%.*]] = insertelement <4 x i1> [[TMP54]], i1 [[TMP51]], i32 3
-; CHECK-NEXT:    [[TMP56:%.*]] = load i1, i1* [[TMP28]], align 1
-; CHECK-NEXT:    [[TMP57:%.*]] = load i1, i1* [[TMP29]], align 1
-; CHECK-NEXT:    [[TMP58:%.*]] = load i1, i1* [[TMP30]], align 1
-; CHECK-NEXT:    [[TMP59:%.*]] = load i1, i1* [[TMP31]], align 1
+; CHECK-NEXT:    [[TMP56:%.*]] = load i1, ptr [[TMP28]], align 1
+; CHECK-NEXT:    [[TMP57:%.*]] = load i1, ptr [[TMP29]], align 1
+; CHECK-NEXT:    [[TMP58:%.*]] = load i1, ptr [[TMP30]], align 1
+; CHECK-NEXT:    [[TMP59:%.*]] = load i1, ptr [[TMP31]], align 1
 ; CHECK-NEXT:    [[TMP60:%.*]] = insertelement <4 x i1> poison, i1 [[TMP56]], i32 0
 ; CHECK-NEXT:    [[TMP61:%.*]] = insertelement <4 x i1> [[TMP60]], i1 [[TMP57]], i32 1
 ; CHECK-NEXT:    [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 2
 ; CHECK-NEXT:    [[TMP63:%.*]] = insertelement <4 x i1> [[TMP62]], i1 [[TMP59]], i32 3
-; CHECK-NEXT:    [[TMP64:%.*]] = load i32, i32* [[BASE]], align 4
-; CHECK-NEXT:    [[TMP65:%.*]] = load i32, i32* [[BASE]], align 4
-; CHECK-NEXT:    [[TMP66:%.*]] = load i32, i32* [[BASE]], align 4
-; CHECK-NEXT:    [[TMP67:%.*]] = load i32, i32* [[BASE]], align 4
+; CHECK-NEXT:    [[TMP64:%.*]] = load i32, ptr [[ALLOCA]], align 4
+; CHECK-NEXT:    [[TMP65:%.*]] = load i32, ptr [[ALLOCA]], align 4
+; CHECK-NEXT:    [[TMP66:%.*]] = load i32, ptr [[ALLOCA]], align 4
+; CHECK-NEXT:    [[TMP67:%.*]] = load i32, ptr [[ALLOCA]], align 4
 ; CHECK-NEXT:    [[TMP68:%.*]] = insertelement <4 x i32> poison, i32 [[TMP64]], i32 0
 ; CHECK-NEXT:    [[TMP69:%.*]] = insertelement <4 x i32> [[TMP68]], i32 [[TMP65]], i32 1
 ; CHECK-NEXT:    [[TMP70:%.*]] = insertelement <4 x i32> [[TMP69]], i32 [[TMP66]], i32 2
 ; CHECK-NEXT:    [[TMP71:%.*]] = insertelement <4 x i32> [[TMP70]], i32 [[TMP67]], i32 3
-; CHECK-NEXT:    [[TMP72:%.*]] = load i32, i32* [[BASE]], align 4
-; CHECK-NEXT:    [[TMP73:%.*]] = load i32, i32* [[BASE]], align 4
-; CHECK-NEXT:    [[TMP74:%.*]] = load i32, i32* [[BASE]], align 4
-; CHECK-NEXT:    [[TMP75:%.*]] = load i32, i32* [[BASE]], align 4
+; CHECK-NEXT:    [[TMP72:%.*]] = load i32, ptr [[ALLOCA]], align 4
+; CHECK-NEXT:    [[TMP73:%.*]] = load i32, ptr [[ALLOCA]], align 4
+; CHECK-NEXT:    [[TMP74:%.*]] = load i32, ptr [[ALLOCA]], align 4
+; CHECK-NEXT:    [[TMP75:%.*]] = load i32, ptr [[ALLOCA]], align 4
 ; CHECK-NEXT:    [[TMP76:%.*]] = insertelement <4 x i32> poison, i32 [[TMP72]], i32 0
 ; CHECK-NEXT:    [[TMP77:%.*]] = insertelement <4 x i32> [[TMP76]], i32 [[TMP73]], i32 1
 ; CHECK-NEXT:    [[TMP78:%.*]] = insertelement <4 x i32> [[TMP77]], i32 [[TMP74]], i32 2
 ; CHECK-NEXT:    [[TMP79:%.*]] = insertelement <4 x i32> [[TMP78]], i32 [[TMP75]], i32 3
-; CHECK-NEXT:    [[TMP80:%.*]] = load i32, i32* [[BASE]], align 4
-; CHECK-NEXT:    [[TMP81:%.*]] = load i32, i32* [[BASE]], align 4
-; CHECK-NEXT:    [[TMP82:%.*]] = load i32, i32* [[BASE]], align 4
-; CHECK-NEXT:    [[TMP83:%.*]] = load i32, i32* [[BASE]], align 4
+; CHECK-NEXT:    [[TMP80:%.*]] = load i32, ptr [[ALLOCA]], align 4
+; CHECK-NEXT:    [[TMP81:%.*]] = load i32, ptr [[ALLOCA]], align 4
+; CHECK-NEXT:    [[TMP82:%.*]] = load i32, ptr [[ALLOCA]], align 4
+; CHECK-NEXT:    [[TMP83:%.*]] = load i32, ptr [[ALLOCA]], align 4
 ; CHECK-NEXT:    [[TMP84:%.*]] = insertelement <4 x i32> poison, i32 [[TMP80]], i32 0
 ; CHECK-NEXT:    [[TMP85:%.*]] = insertelement <4 x i32> [[TMP84]], i32 [[TMP81]], i32 1
 ; CHECK-NEXT:    [[TMP86:%.*]] = insertelement <4 x i32> [[TMP85]], i32 [[TMP82]], i32 2
 ; CHECK-NEXT:    [[TMP87:%.*]] = insertelement <4 x i32> [[TMP86]], i32 [[TMP83]], i32 3
-; CHECK-NEXT:    [[TMP88:%.*]] = load i32, i32* [[BASE]], align 4
-; CHECK-NEXT:    [[TMP89:%.*]] = load i32, i32* [[BASE]], align 4
-; CHECK-NEXT:    [[TMP90:%.*]] = load i32, i32* [[BASE]], align 4
-; CHECK-NEXT:    [[TMP91:%.*]] = load i32, i32* [[BASE]], align 4
+; CHECK-NEXT:    [[TMP88:%.*]] = load i32, ptr [[ALLOCA]], align 4
+; CHECK-NEXT:    [[TMP89:%.*]] = load i32, ptr [[ALLOCA]], align 4
+; CHECK-NEXT:    [[TMP90:%.*]] = load i32, ptr [[ALLOCA]], align 4
+; CHECK-NEXT:    [[TMP91:%.*]] = load i32, ptr [[ALLOCA]], align 4
 ; CHECK-NEXT:    [[TMP92:%.*]] = insertelement <4 x i32> poison, i32 [[TMP88]], i32 0
 ; CHECK-NEXT:    [[TMP93:%.*]] = insertelement <4 x i32> [[TMP92]], i32 [[TMP89]], i32 1
 ; CHECK-NEXT:    [[TMP94:%.*]] = insertelement <4 x i32> [[TMP93]], i32 [[TMP90]], i32 2
@@ -449,11 +436,11 @@ define i32 @test_invariant_address(i64 %len, i1* %test_base) {
 ; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ]
 ; CHECK-NEXT:    [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[LATCH]] ]
 ; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT:    [[TEST_ADDR:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[IV]]
-; CHECK-NEXT:    [[EARLYCND:%.*]] = load i1, i1* [[TEST_ADDR]], align 1
+; CHECK-NEXT:    [[TEST_ADDR:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[IV]]
+; CHECK-NEXT:    [[EARLYCND:%.*]] = load i1, ptr [[TEST_ADDR]], align 1
 ; CHECK-NEXT:    br i1 [[EARLYCND]], label [[PRED:%.*]], label [[LATCH]]
 ; CHECK:       pred:
-; CHECK-NEXT:    [[VAL:%.*]] = load i32, i32* [[BASE]], align 4
+; CHECK-NEXT:    [[VAL:%.*]] = load i32, ptr [[ALLOCA]], align 4
 ; CHECK-NEXT:    br label [[LATCH]]
 ; CHECK:       latch:
 ; CHECK-NEXT:    [[VAL_PHI:%.*]] = phi i32 [ 0, [[LOOP]] ], [ [[VAL]], [[PRED]] ]
@@ -466,18 +453,17 @@ define i32 @test_invariant_address(i64 %len, i1* %test_base) {
 ;
 entry:
   %alloca = alloca [4096 x i32]
-  %base = bitcast [4096 x i32]* %alloca to i32*
-  call void @init(i32* %base)
+  call void @init(ptr %alloca)
   br label %loop
 loop:
   %iv = phi i64 [ 0, %entry ], [ %iv.next, %latch ]
   %accum = phi i32 [ 0, %entry ], [ %accum.next, %latch ]
   %iv.next = add i64 %iv, 1
-  %test_addr = getelementptr inbounds i1, i1* %test_base, i64 %iv
-  %earlycnd = load i1, i1* %test_addr
+  %test_addr = getelementptr inbounds i1, ptr %test_base, i64 %iv
+  %earlycnd = load i1, ptr %test_addr
   br i1 %earlycnd, label %pred, label %latch
 pred:
-  %val = load i32, i32* %base
+  %val = load i32, ptr %alloca
   br label %latch
 latch:
   %val.phi = phi i32 [0, %loop], [%val, %pred]
@@ -490,21 +476,20 @@ loop_exit:
 }
 
 ; Overlapping loads - Fails alignment checking, not dereferenceability
-define i32 @test_step_narrower_than_access(i64 %len, i1* %test_base) {
+define i32 @test_step_narrower_than_access(i64 %len, ptr %test_base) {
 ; CHECK-LABEL: @test_step_narrower_than_access(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[ALLOCA:%.*]] = alloca [4096 x i32], align 4
-; CHECK-NEXT:    [[BASE:%.*]] = bitcast [4096 x i32]* [[ALLOCA]] to i32*
-; CHECK-NEXT:    call void @init(i32* [[BASE]])
+; CHECK-NEXT:    call void @init(ptr [[ALLOCA]])
 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE33:%.*]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP180:%.*]], [[PRED_LOAD_CONTINUE33]] ]
-; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP181:%.*]], [[PRED_LOAD_CONTINUE33]] ]
-; CHECK-NEXT:    [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP182:%.*]], [[PRED_LOAD_CONTINUE33]] ]
-; CHECK-NEXT:    [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP183:%.*]], [[PRED_LOAD_CONTINUE33]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP148:%.*]], [[PRED_LOAD_CONTINUE33]] ]
+; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP149:%.*]], [[PRED_LOAD_CONTINUE33]] ]
+; CHECK-NEXT:    [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP150:%.*]], [[PRED_LOAD_CONTINUE33]] ]
+; CHECK-NEXT:    [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP151:%.*]], [[PRED_LOAD_CONTINUE33]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 2
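
To unpack the comment on this test: the induction variable steps in i16 units while each predicated block loads a full i32, so iteration %iv touches bytes [2*iv, 2*iv + 4) of the alloca. Consecutive iterations therefore overlap by two bytes, and odd values of %iv give an address that is only 2-byte aligned, so speculating the loads fails the alignment check while dereferenceability itself holds (every access stays inside the [4096 x i32] alloca). The per-iteration address arithmetic, as it appears in the pred.load blocks below:

    %addr = getelementptr inbounds i16, ptr %alloca, i64 %iv   ; byte offset 2 * %iv
    %val  = load i32, ptr %addr, align 4                       ; reads 4 bytes
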
@@ -521,50 +506,50 @@ define i32 @test_step_narrower_than_access(i64 %len, i1* %test_base) {
 ; CHECK-NEXT:    [[TMP13:%.*]] = add i64 [[INDEX]], 13
 ; CHECK-NEXT:    [[TMP14:%.*]] = add i64 [[INDEX]], 14
 ; CHECK-NEXT:    [[TMP15:%.*]] = add i64 [[INDEX]], 15
-; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP2]]
-; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP3]]
-; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP5]]
-; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP6]]
-; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP7]]
-; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP8]]
-; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP9]]
-; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP10]]
-; CHECK-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP11]]
-; CHECK-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP12]]
-; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP13]]
-; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP14]]
-; CHECK-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP15]]
-; CHECK-NEXT:    [[TMP32:%.*]] = load i1, i1* [[TMP16]], align 1
-; CHECK-NEXT:    [[TMP33:%.*]] = load i1, i1* [[TMP17]], align 1
-; CHECK-NEXT:    [[TMP34:%.*]] = load i1, i1* [[TMP18]], align 1
-; CHECK-NEXT:    [[TMP35:%.*]] = load i1, i1* [[TMP19]], align 1
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP2]]
+; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP5]]
+; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP6]]
+; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP7]]
+; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP8]]
+; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP9]]
+; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP10]]
+; CHECK-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP11]]
+; CHECK-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP12]]
+; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP13]]
+; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP14]]
+; CHECK-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP15]]
+; CHECK-NEXT:    [[TMP32:%.*]] = load i1, ptr [[TMP16]], align 1
+; CHECK-NEXT:    [[TMP33:%.*]] = load i1, ptr [[TMP17]], align 1
+; CHECK-NEXT:    [[TMP34:%.*]] = load i1, ptr [[TMP18]], align 1
+; CHECK-NEXT:    [[TMP35:%.*]] = load i1, ptr [[TMP19]], align 1
 ; CHECK-NEXT:    [[TMP36:%.*]] = insertelement <4 x i1> poison, i1 [[TMP32]], i32 0
 ; CHECK-NEXT:    [[TMP37:%.*]] = insertelement <4 x i1> [[TMP36]], i1 [[TMP33]], i32 1
 ; CHECK-NEXT:    [[TMP38:%.*]] = insertelement <4 x i1> [[TMP37]], i1 [[TMP34]], i32 2
 ; CHECK-NEXT:    [[TMP39:%.*]] = insertelement <4 x i1> [[TMP38]], i1 [[TMP35]], i32 3
-; CHECK-NEXT:    [[TMP40:%.*]] = load i1, i1* [[TMP20]], align 1
-; CHECK-NEXT:    [[TMP41:%.*]] = load i1, i1* [[TMP21]], align 1
-; CHECK-NEXT:    [[TMP42:%.*]] = load i1, i1* [[TMP22]], align 1
-; CHECK-NEXT:    [[TMP43:%.*]] = load i1, i1* [[TMP23]], align 1
+; CHECK-NEXT:    [[TMP40:%.*]] = load i1, ptr [[TMP20]], align 1
+; CHECK-NEXT:    [[TMP41:%.*]] = load i1, ptr [[TMP21]], align 1
+; CHECK-NEXT:    [[TMP42:%.*]] = load i1, ptr [[TMP22]], align 1
+; CHECK-NEXT:    [[TMP43:%.*]] = load i1, ptr [[TMP23]], align 1
 ; CHECK-NEXT:    [[TMP44:%.*]] = insertelement <4 x i1> poison, i1 [[TMP40]], i32 0
 ; CHECK-NEXT:    [[TMP45:%.*]] = insertelement <4 x i1> [[TMP44]], i1 [[TMP41]], i32 1
 ; CHECK-NEXT:    [[TMP46:%.*]] = insertelement <4 x i1> [[TMP45]], i1 [[TMP42]], i32 2
 ; CHECK-NEXT:    [[TMP47:%.*]] = insertelement <4 x i1> [[TMP46]], i1 [[TMP43]], i32 3
-; CHECK-NEXT:    [[TMP48:%.*]] = load i1, i1* [[TMP24]], align 1
-; CHECK-NEXT:    [[TMP49:%.*]] = load i1, i1* [[TMP25]], align 1
-; CHECK-NEXT:    [[TMP50:%.*]] = load i1, i1* [[TMP26]], align 1
-; CHECK-NEXT:    [[TMP51:%.*]] = load i1, i1* [[TMP27]], align 1
+; CHECK-NEXT:    [[TMP48:%.*]] = load i1, ptr [[TMP24]], align 1
+; CHECK-NEXT:    [[TMP49:%.*]] = load i1, ptr [[TMP25]], align 1
+; CHECK-NEXT:    [[TMP50:%.*]] = load i1, ptr [[TMP26]], align 1
+; CHECK-NEXT:    [[TMP51:%.*]] = load i1, ptr [[TMP27]], align 1
 ; CHECK-NEXT:    [[TMP52:%.*]] = insertelement <4 x i1> poison, i1 [[TMP48]], i32 0
 ; CHECK-NEXT:    [[TMP53:%.*]] = insertelement <4 x i1> [[TMP52]], i1 [[TMP49]], i32 1
 ; CHECK-NEXT:    [[TMP54:%.*]] = insertelement <4 x i1> [[TMP53]], i1 [[TMP50]], i32 2
 ; CHECK-NEXT:    [[TMP55:%.*]] = insertelement <4 x i1> [[TMP54]], i1 [[TMP51]], i32 3
-; CHECK-NEXT:    [[TMP56:%.*]] = load i1, i1* [[TMP28]], align 1
-; CHECK-NEXT:    [[TMP57:%.*]] = load i1, i1* [[TMP29]], align 1
-; CHECK-NEXT:    [[TMP58:%.*]] = load i1, i1* [[TMP30]], align 1
-; CHECK-NEXT:    [[TMP59:%.*]] = load i1, i1* [[TMP31]], align 1
+; CHECK-NEXT:    [[TMP56:%.*]] = load i1, ptr [[TMP28]], align 1
+; CHECK-NEXT:    [[TMP57:%.*]] = load i1, ptr [[TMP29]], align 1
+; CHECK-NEXT:    [[TMP58:%.*]] = load i1, ptr [[TMP30]], align 1
+; CHECK-NEXT:    [[TMP59:%.*]] = load i1, ptr [[TMP31]], align 1
 ; CHECK-NEXT:    [[TMP60:%.*]] = insertelement <4 x i1> poison, i1 [[TMP56]], i32 0
 ; CHECK-NEXT:    [[TMP61:%.*]] = insertelement <4 x i1> [[TMP60]], i1 [[TMP57]], i32 1
 ; CHECK-NEXT:    [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 2
@@ -572,217 +557,183 @@ define i32 @test_step_narrower_than_access(i64 %len, i1* %test_base) {
 ; CHECK-NEXT:    [[TMP64:%.*]] = extractelement <4 x i1> [[TMP39]], i32 0
 ; CHECK-NEXT:    br i1 [[TMP64]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
 ; CHECK:       pred.load.if:
-; CHECK-NEXT:    [[TMP65:%.*]] = bitcast i32* [[BASE]] to i16*
-; CHECK-NEXT:    [[TMP66:%.*]] = getelementptr inbounds i16, i16* [[TMP65]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP67:%.*]] = bitcast i16* [[TMP66]] to i32*
-; CHECK-NEXT:    [[TMP68:%.*]] = load i32, i32* [[TMP67]], align 4
-; CHECK-NEXT:    [[TMP69:%.*]] = insertelement <4 x i32> poison, i32 [[TMP68]], i32 0
+; CHECK-NEXT:    [[TMP65:%.*]] = getelementptr inbounds i16, ptr [[ALLOCA]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP66:%.*]] = load i32, ptr [[TMP65]], align 4
+; CHECK-NEXT:    [[TMP67:%.*]] = insertelement <4 x i32> poison, i32 [[TMP66]], i32 0
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE]]
 ; CHECK:       pred.load.continue:
-; CHECK-NEXT:    [[TMP70:%.*]] = phi <4 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP69]], [[PRED_LOAD_IF]] ]
-; CHECK-NEXT:    [[TMP71:%.*]] = extractelement <4 x i1> [[TMP39]], i32 1
-; CHECK-NEXT:    br i1 [[TMP71]], label [[PRED_LOAD_IF4:%.*]], label [[PRED_LOAD_CONTINUE5:%.*]]
+; CHECK-NEXT:    [[TMP68:%.*]] = phi <4 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP67]], [[PRED_LOAD_IF]] ]
+; CHECK-NEXT:    [[TMP69:%.*]] = extractelement <4 x i1> [[TMP39]], i32 1
+; CHECK-NEXT:    br i1 [[TMP69]], label [[PRED_LOAD_IF4:%.*]], label [[PRED_LOAD_CONTINUE5:%.*]]
 ; CHECK:       pred.load.if4:
-; CHECK-NEXT:    [[TMP72:%.*]] = bitcast i32* [[BASE]] to i16*
-; CHECK-NEXT:    [[TMP73:%.*]] = getelementptr inbounds i16, i16* [[TMP72]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP74:%.*]] = bitcast i16* [[TMP73]] to i32*
-; CHECK-NEXT:    [[TMP75:%.*]] = load i32, i32* [[TMP74]], align 4
-; CHECK-NEXT:    [[TMP76:%.*]] = insertelement <4 x i32> [[TMP70]], i32 [[TMP75]], i32 1
+; CHECK-NEXT:    [[TMP70:%.*]] = getelementptr inbounds i16, ptr [[ALLOCA]], i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP71:%.*]] = load i32, ptr [[TMP70]], align 4
+; CHECK-NEXT:    [[TMP72:%.*]] = insertelement <4 x i32> [[TMP68]], i32 [[TMP71]], i32 1
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE5]]
 ; CHECK:       pred.load.continue5:
-; CHECK-NEXT:    [[TMP77:%.*]] = phi <4 x i32> [ [[TMP70]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP76]], [[PRED_LOAD_IF4]] ]
-; CHECK-NEXT:    [[TMP78:%.*]] = extractelement <4 x i1> [[TMP39]], i32 2
-; CHECK-NEXT:    br i1 [[TMP78]], label [[PRED_LOAD_IF6:%.*]], label [[PRED_LOAD_CONTINUE7:%.*]]
+; CHECK-NEXT:    [[TMP73:%.*]] = phi <4 x i32> [ [[TMP68]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP72]], [[PRED_LOAD_IF4]] ]
+; CHECK-NEXT:    [[TMP74:%.*]] = extractelement <4 x i1> [[TMP39]], i32 2
+; CHECK-NEXT:    br i1 [[TMP74]], label [[PRED_LOAD_IF6:%.*]], label [[PRED_LOAD_CONTINUE7:%.*]]
 ; CHECK:       pred.load.if6:
-; CHECK-NEXT:    [[TMP79:%.*]] = bitcast i32* [[BASE]] to i16*
-; CHECK-NEXT:    [[TMP80:%.*]] = getelementptr inbounds i16, i16* [[TMP79]], i64 [[TMP2]]
-; CHECK-NEXT:    [[TMP81:%.*]] = bitcast i16* [[TMP80]] to i32*
-; CHECK-NEXT:    [[TMP82:%.*]] = load i32, i32* [[TMP81]], align 4
-; CHECK-NEXT:    [[TMP83:%.*]] = insertelement <4 x i32> [[TMP77]], i32 [[TMP82]], i32 2
+; CHECK-NEXT:    [[TMP75:%.*]] = getelementptr inbounds i16, ptr [[ALLOCA]], i64 [[TMP2]]
+; CHECK-NEXT:    [[TMP76:%.*]] = load i32, ptr [[TMP75]], align 4
+; CHECK-NEXT:    [[TMP77:%.*]] = insertelement <4 x i32> [[TMP73]], i32 [[TMP76]], i32 2
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE7]]
 ; CHECK:       pred.load.continue7:
-; CHECK-NEXT:    [[TMP84:%.*]] = phi <4 x i32> [ [[TMP77]], [[PRED_LOAD_CONTINUE5]] ], [ [[TMP83]], [[PRED_LOAD_IF6]] ]
-; CHECK-NEXT:    [[TMP85:%.*]] = extractelement <4 x i1> [[TMP39]], i32 3
-; CHECK-NEXT:    br i1 [[TMP85]], label [[PRED_LOAD_IF8:%.*]], label [[PRED_LOAD_CONTINUE9:%.*]]
+; CHECK-NEXT:    [[TMP78:%.*]] = phi <4 x i32> [ [[TMP73]], [[PRED_LOAD_CONTINUE5]] ], [ [[TMP77]], [[PRED_LOAD_IF6]] ]
+; CHECK-NEXT:    [[TMP79:%.*]] = extractelement <4 x i1> [[TMP39]], i32 3
+; CHECK-NEXT:    br i1 [[TMP79]], label [[PRED_LOAD_IF8:%.*]], label [[PRED_LOAD_CONTINUE9:%.*]]
 ; CHECK:       pred.load.if8:
-; CHECK-NEXT:    [[TMP86:%.*]] = bitcast i32* [[BASE]] to i16*
-; CHECK-NEXT:    [[TMP87:%.*]] = getelementptr inbounds i16, i16* [[TMP86]], i64 [[TMP3]]
-; CHECK-NEXT:    [[TMP88:%.*]] = bitcast i16* [[TMP87]] to i32*
-; CHECK-NEXT:    [[TMP89:%.*]] = load i32, i32* [[TMP88]], align 4
-; CHECK-NEXT:    [[TMP90:%.*]] = insertelement <4 x i32> [[TMP84]], i32 [[TMP89]], i32 3
+; CHECK-NEXT:    [[TMP80:%.*]] = getelementptr inbounds i16, ptr [[ALLOCA]], i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP81:%.*]] = load i32, ptr [[TMP80]], align 4
+; CHECK-NEXT:    [[TMP82:%.*]] = insertelement <4 x i32> [[TMP78]], i32 [[TMP81]], i32 3
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE9]]
 ; CHECK:       pred.load.continue9:
-; CHECK-NEXT:    [[TMP91:%.*]] = phi <4 x i32> [ [[TMP84]], [[PRED_LOAD_CONTINUE7]] ], [ [[TMP90]], [[PRED_LOAD_IF8]] ]
-; CHECK-NEXT:    [[TMP92:%.*]] = extractelement <4 x i1> [[TMP47]], i32 0
-; CHECK-NEXT:    br i1 [[TMP92]], label [[PRED_LOAD_IF10:%.*]], label [[PRED_LOAD_CONTINUE11:%.*]]
+; CHECK-NEXT:    [[TMP83:%.*]] = phi <4 x i32> [ [[TMP78]], [[PRED_LOAD_CONTINUE7]] ], [ [[TMP82]], [[PRED_LOAD_IF8]] ]
+; CHECK-NEXT:    [[TMP84:%.*]] = extractelement <4 x i1> [[TMP47]], i32 0
+; CHECK-NEXT:    br i1 [[TMP84]], label [[PRED_LOAD_IF10:%.*]], label [[PRED_LOAD_CONTINUE11:%.*]]
 ; CHECK:       pred.load.if10:
-; CHECK-NEXT:    [[TMP93:%.*]] = bitcast i32* [[BASE]] to i16*
-; CHECK-NEXT:    [[TMP94:%.*]] = getelementptr inbounds i16, i16* [[TMP93]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP95:%.*]] = bitcast i16* [[TMP94]] to i32*
-; CHECK-NEXT:    [[TMP96:%.*]] = load i32, i32* [[TMP95]], align 4
-; CHECK-NEXT:    [[TMP97:%.*]] = insertelement <4 x i32> poison, i32 [[TMP96]], i32 0
+; CHECK-NEXT:    [[TMP85:%.*]] = getelementptr inbounds i16, ptr [[ALLOCA]], i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP86:%.*]] = load i32, ptr [[TMP85]], align 4
+; CHECK-NEXT:    [[TMP87:%.*]] = insertelement <4 x i32> poison, i32 [[TMP86]], i32 0
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE11]]
 ; CHECK:       pred.load.continue11:
-; CHECK-NEXT:    [[TMP98:%.*]] = phi <4 x i32> [ poison, [[PRED_LOAD_CONTINUE9]] ], [ [[TMP97]], [[PRED_LOAD_IF10]] ]
-; CHECK-NEXT:    [[TMP99:%.*]] = extractelement <4 x i1> [[TMP47]], i32 1
-; CHECK-NEXT:    br i1 [[TMP99]], label [[PRED_LOAD_IF12:%.*]], label [[PRED_LOAD_CONTINUE13:%.*]]
+; CHECK-NEXT:    [[TMP88:%.*]] = phi <4 x i32> [ poison, [[PRED_LOAD_CONTINUE9]] ], [ [[TMP87]], [[PRED_LOAD_IF10]] ]
+; CHECK-NEXT:    [[TMP89:%.*]] = extractelement <4 x i1> [[TMP47]], i32 1
+; CHECK-NEXT:    br i1 [[TMP89]], label [[PRED_LOAD_IF12:%.*]], label [[PRED_LOAD_CONTINUE13:%.*]]
 ; CHECK:       pred.load.if12:
-; CHECK-NEXT:    [[TMP100:%.*]] = bitcast i32* [[BASE]] to i16*
-; CHECK-NEXT:    [[TMP101:%.*]] = getelementptr inbounds i16, i16* [[TMP100]], i64 [[TMP5]]
-; CHECK-NEXT:    [[TMP102:%.*]] = bitcast i16* [[TMP101]] to i32*
-; CHECK-NEXT:    [[TMP103:%.*]] = load i32, i32* [[TMP102]], align 4
-; CHECK-NEXT:    [[TMP104:%.*]] = insertelement <4 x i32> [[TMP98]], i32 [[TMP103]], i32 1
+; CHECK-NEXT:    [[TMP90:%.*]] = getelementptr inbounds i16, ptr [[ALLOCA]], i64 [[TMP5]]
+; CHECK-NEXT:    [[TMP91:%.*]] = load i32, ptr [[TMP90]], align 4
+; CHECK-NEXT:    [[TMP92:%.*]] = insertelement <4 x i32> [[TMP88]], i32 [[TMP91]], i32 1
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE13]]
 ; CHECK:       pred.load.continue13:
-; CHECK-NEXT:    [[TMP105:%.*]] = phi <4 x i32> [ [[TMP98]], [[PRED_LOAD_CONTINUE11]] ], [ [[TMP104]], [[PRED_LOAD_IF12]] ]
-; CHECK-NEXT:    [[TMP106:%.*]] = extractelement <4 x i1> [[TMP47]], i32 2
-; CHECK-NEXT:    br i1 [[TMP106]], label [[PRED_LOAD_IF14:%.*]], label [[PRED_LOAD_CONTINUE15:%.*]]
+; CHECK-NEXT:    [[TMP93:%.*]] = phi <4 x i32> [ [[TMP88]], [[PRED_LOAD_CONTINUE11]] ], [ [[TMP92]], [[PRED_LOAD_IF12]] ]
+; CHECK-NEXT:    [[TMP94:%.*]] = extractelement <4 x i1> [[TMP47]], i32 2
+; CHECK-NEXT:    br i1 [[TMP94]], label [[PRED_LOAD_IF14:%.*]], label [[PRED_LOAD_CONTINUE15:%.*]]
 ; CHECK:       pred.load.if14:
-; CHECK-NEXT:    [[TMP107:%.*]] = bitcast i32* [[BASE]] to i16*
-; CHECK-NEXT:    [[TMP108:%.*]] = getelementptr inbounds i16, i16* [[TMP107]], i64 [[TMP6]]
-; CHECK-NEXT:    [[TMP109:%.*]] = bitcast i16* [[TMP108]] to i32*
-; CHECK-NEXT:    [[TMP110:%.*]] = load i32, i32* [[TMP109]], align 4
-; CHECK-NEXT:    [[TMP111:%.*]] = insertelement <4 x i32> [[TMP105]], i32 [[TMP110]], i32 2
+; CHECK-NEXT:    [[TMP95:%.*]] = getelementptr inbounds i16, ptr [[ALLOCA]], i64 [[TMP6]]
+; CHECK-NEXT:    [[TMP96:%.*]] = load i32, ptr [[TMP95]], align 4
+; CHECK-NEXT:    [[TMP97:%.*]] = insertelement <4 x i32> [[TMP93]], i32 [[TMP96]], i32 2
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE15]]
 ; CHECK:       pred.load.continue15:
-; CHECK-NEXT:    [[TMP112:%.*]] = phi <4 x i32> [ [[TMP105]], [[PRED_LOAD_CONTINUE13]] ], [ [[TMP111]], [[PRED_LOAD_IF14]] ]
-; CHECK-NEXT:    [[TMP113:%.*]] = extractelement <4 x i1> [[TMP47]], i32 3
-; CHECK-NEXT:    br i1 [[TMP113]], label [[PRED_LOAD_IF16:%.*]], label [[PRED_LOAD_CONTINUE17:%.*]]
+; CHECK-NEXT:    [[TMP98:%.*]] = phi <4 x i32> [ [[TMP93]], [[PRED_LOAD_CONTINUE13]] ], [ [[TMP97]], [[PRED_LOAD_IF14]] ]
+; CHECK-NEXT:    [[TMP99:%.*]] = extractelement <4 x i1> [[TMP47]], i32 3
+; CHECK-NEXT:    br i1 [[TMP99]], label [[PRED_LOAD_IF16:%.*]], label [[PRED_LOAD_CONTINUE17:%.*]]
 ; CHECK:       pred.load.if16:
-; CHECK-NEXT:    [[TMP114:%.*]] = bitcast i32* [[BASE]] to i16*
-; CHECK-NEXT:    [[TMP115:%.*]] = getelementptr inbounds i16, i16* [[TMP114]], i64 [[TMP7]]
-; CHECK-NEXT:    [[TMP116:%.*]] = bitcast i16* [[TMP115]] to i32*
-; CHECK-NEXT:    [[TMP117:%.*]] = load i32, i32* [[TMP116]], align 4
-; CHECK-NEXT:    [[TMP118:%.*]] = insertelement <4 x i32> [[TMP112]], i32 [[TMP117]], i32 3
+; CHECK-NEXT:    [[TMP100:%.*]] = getelementptr inbounds i16, ptr [[ALLOCA]], i64 [[TMP7]]
+; CHECK-NEXT:    [[TMP101:%.*]] = load i32, ptr [[TMP100]], align 4
+; CHECK-NEXT:    [[TMP102:%.*]] = insertelement <4 x i32> [[TMP98]], i32 [[TMP101]], i32 3
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE17]]
 ; CHECK:       pred.load.continue17:
-; CHECK-NEXT:    [[TMP119:%.*]] = phi <4 x i32> [ [[TMP112]], [[PRED_LOAD_CONTINUE15]] ], [ [[TMP118]], [[PRED_LOAD_IF16]] ]
-; CHECK-NEXT:    [[TMP120:%.*]] = extractelement <4 x i1> [[TMP55]], i32 0
-; CHECK-NEXT:    br i1 [[TMP120]], label [[PRED_LOAD_IF18:%.*]], label [[PRED_LOAD_CONTINUE19:%.*]]
+; CHECK-NEXT:    [[TMP103:%.*]] = phi <4 x i32> [ [[TMP98]], [[PRED_LOAD_CONTINUE15]] ], [ [[TMP102]], [[PRED_LOAD_IF16]] ]
+; CHECK-NEXT:    [[TMP104:%.*]] = extractelement <4 x i1> [[TMP55]], i32 0
+; CHECK-NEXT:    br i1 [[TMP104]], label [[PRED_LOAD_IF18:%.*]], label [[PRED_LOAD_CONTINUE19:%.*]]
 ; CHECK:       pred.load.if18:
-; CHECK-NEXT:    [[TMP121:%.*]] = bitcast i32* [[BASE]] to i16*
-; CHECK-NEXT:    [[TMP122:%.*]] = getelementptr inbounds i16, i16* [[TMP121]], i64 [[TMP8]]
-; CHECK-NEXT:    [[TMP123:%.*]] = bitcast i16* [[TMP122]] to i32*
-; CHECK-NEXT:    [[TMP124:%.*]] = load i32, i32* [[TMP123]], align 4
-; CHECK-NEXT:    [[TMP125:%.*]] = insertelement <4 x i32> poison, i32 [[TMP124]], i32 0
+; CHECK-NEXT:    [[TMP105:%.*]] = getelementptr inbounds i16, ptr [[ALLOCA]], i64 [[TMP8]]
+; CHECK-NEXT:    [[TMP106:%.*]] = load i32, ptr [[TMP105]], align 4
+; CHECK-NEXT:    [[TMP107:%.*]] = insertelement <4 x i32> poison, i32 [[TMP106]], i32 0
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE19]]
 ; CHECK:       pred.load.continue19:
-; CHECK-NEXT:    [[TMP126:%.*]] = phi <4 x i32> [ poison, [[PRED_LOAD_CONTINUE17]] ], [ [[TMP125]], [[PRED_LOAD_IF18]] ]
-; CHECK-NEXT:    [[TMP127:%.*]] = extractelement <4 x i1> [[TMP55]], i32 1
-; CHECK-NEXT:    br i1 [[TMP127]], label [[PRED_LOAD_IF20:%.*]], label [[PRED_LOAD_CONTINUE21:%.*]]
+; CHECK-NEXT:    [[TMP108:%.*]] = phi <4 x i32> [ poison, [[PRED_LOAD_CONTINUE17]] ], [ [[TMP107]], [[PRED_LOAD_IF18]] ]
+; CHECK-NEXT:    [[TMP109:%.*]] = extractelement <4 x i1> [[TMP55]], i32 1
+; CHECK-NEXT:    br i1 [[TMP109]], label [[PRED_LOAD_IF20:%.*]], label [[PRED_LOAD_CONTINUE21:%.*]]
 ; CHECK:       pred.load.if20:
-; CHECK-NEXT:    [[TMP128:%.*]] = bitcast i32* [[BASE]] to i16*
-; CHECK-NEXT:    [[TMP129:%.*]] = getelementptr inbounds i16, i16* [[TMP128]], i64 [[TMP9]]
-; CHECK-NEXT:    [[TMP130:%.*]] = bitcast i16* [[TMP129]] to i32*
-; CHECK-NEXT:    [[TMP131:%.*]] = load i32, i32* [[TMP130]], align 4
-; CHECK-NEXT:    [[TMP132:%.*]] = insertelement <4 x i32> [[TMP126]], i32 [[TMP131]], i32 1
+; CHECK-NEXT:    [[TMP110:%.*]] = getelementptr inbounds i16, ptr [[ALLOCA]], i64 [[TMP9]]
+; CHECK-NEXT:    [[TMP111:%.*]] = load i32, ptr [[TMP110]], align 4
+; CHECK-NEXT:    [[TMP112:%.*]] = insertelement <4 x i32> [[TMP108]], i32 [[TMP111]], i32 1
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE21]]
 ; CHECK:       pred.load.continue21:
-; CHECK-NEXT:    [[TMP133:%.*]] = phi <4 x i32> [ [[TMP126]], [[PRED_LOAD_CONTINUE19]] ], [ [[TMP132]], [[PRED_LOAD_IF20]] ]
-; CHECK-NEXT:    [[TMP134:%.*]] = extractelement <4 x i1> [[TMP55]], i32 2
-; CHECK-NEXT:    br i1 [[TMP134]], label [[PRED_LOAD_IF22:%.*]], label [[PRED_LOAD_CONTINUE23:%.*]]
+; CHECK-NEXT:    [[TMP113:%.*]] = phi <4 x i32> [ [[TMP108]], [[PRED_LOAD_CONTINUE19]] ], [ [[TMP112]], [[PRED_LOAD_IF20]] ]
+; CHECK-NEXT:    [[TMP114:%.*]] = extractelement <4 x i1> [[TMP55]], i32 2
+; CHECK-NEXT:    br i1 [[TMP114]], label [[PRED_LOAD_IF22:%.*]], label [[PRED_LOAD_CONTINUE23:%.*]]
 ; CHECK:       pred.load.if22:
-; CHECK-NEXT:    [[TMP135:%.*]] = bitcast i32* [[BASE]] to i16*
-; CHECK-NEXT:    [[TMP136:%.*]] = getelementptr inbounds i16, i16* [[TMP135]], i64 [[TMP10]]
-; CHECK-NEXT:    [[TMP137:%.*]] = bitcast i16* [[TMP136]] to i32*
-; CHECK-NEXT:    [[TMP138:%.*]] = load i32, i32* [[TMP137]], align 4
-; CHECK-NEXT:    [[TMP139:%.*]] = insertelement <4 x i32> [[TMP133]], i32 [[TMP138]], i32 2
+; CHECK-NEXT:    [[TMP115:%.*]] = getelementptr inbounds i16, ptr [[ALLOCA]], i64 [[TMP10]]
+; CHECK-NEXT:    [[TMP116:%.*]] = load i32, ptr [[TMP115]], align 4
+; CHECK-NEXT:    [[TMP117:%.*]] = insertelement <4 x i32> [[TMP113]], i32 [[TMP116]], i32 2
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE23]]
 ; CHECK:       pred.load.continue23:
-; CHECK-NEXT:    [[TMP140:%.*]] = phi <4 x i32> [ [[TMP133]], [[PRED_LOAD_CONTINUE21]] ], [ [[TMP139]], [[PRED_LOAD_IF22]] ]
-; CHECK-NEXT:    [[TMP141:%.*]] = extractelement <4 x i1> [[TMP55]], i32 3
-; CHECK-NEXT:    br i1 [[TMP141]], label [[PRED_LOAD_IF24:%.*]], label [[PRED_LOAD_CONTINUE25:%.*]]
+; CHECK-NEXT:    [[TMP118:%.*]] = phi <4 x i32> [ [[TMP113]], [[PRED_LOAD_CONTINUE21]] ], [ [[TMP117]], [[PRED_LOAD_IF22]] ]
+; CHECK-NEXT:    [[TMP119:%.*]] = extractelement <4 x i1> [[TMP55]], i32 3
+; CHECK-NEXT:    br i1 [[TMP119]], label [[PRED_LOAD_IF24:%.*]], label [[PRED_LOAD_CONTINUE25:%.*]]
 ; CHECK:       pred.load.if24:
-; CHECK-NEXT:    [[TMP142:%.*]] = bitcast i32* [[BASE]] to i16*
-; CHECK-NEXT:    [[TMP143:%.*]] = getelementptr inbounds i16, i16* [[TMP142]], i64 [[TMP11]]
-; CHECK-NEXT:    [[TMP144:%.*]] = bitcast i16* [[TMP143]] to i32*
-; CHECK-NEXT:    [[TMP145:%.*]] = load i32, i32* [[TMP144]], align 4
-; CHECK-NEXT:    [[TMP146:%.*]] = insertelement <4 x i32> [[TMP140]], i32 [[TMP145]], i32 3
+; CHECK-NEXT:    [[TMP120:%.*]] = getelementptr inbounds i16, ptr [[ALLOCA]], i64 [[TMP11]]
+; CHECK-NEXT:    [[TMP121:%.*]] = load i32, ptr [[TMP120]], align 4
+; CHECK-NEXT:    [[TMP122:%.*]] = insertelement <4 x i32> [[TMP118]], i32 [[TMP121]], i32 3
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE25]]
 ; CHECK:       pred.load.continue25:
-; CHECK-NEXT:    [[TMP147:%.*]] = phi <4 x i32> [ [[TMP140]], [[PRED_LOAD_CONTINUE23]] ], [ [[TMP146]], [[PRED_LOAD_IF24]] ]
-; CHECK-NEXT:    [[TMP148:%.*]] = extractelement <4 x i1> [[TMP63]], i32 0
-; CHECK-NEXT:    br i1 [[TMP148]], label [[PRED_LOAD_IF26:%.*]], label [[PRED_LOAD_CONTINUE27:%.*]]
+; CHECK-NEXT:    [[TMP123:%.*]] = phi <4 x i32> [ [[TMP118]], [[PRED_LOAD_CONTINUE23]] ], [ [[TMP122]], [[PRED_LOAD_IF24]] ]
+; CHECK-NEXT:    [[TMP124:%.*]] = extractelement <4 x i1> [[TMP63]], i32 0
+; CHECK-NEXT:    br i1 [[TMP124]], label [[PRED_LOAD_IF26:%.*]], label [[PRED_LOAD_CONTINUE27:%.*]]
 ; CHECK:       pred.load.if26:
-; CHECK-NEXT:    [[TMP149:%.*]] = bitcast i32* [[BASE]] to i16*
-; CHECK-NEXT:    [[TMP150:%.*]] = getelementptr inbounds i16, i16* [[TMP149]], i64 [[TMP12]]
-; CHECK-NEXT:    [[TMP151:%.*]] = bitcast i16* [[TMP150]] to i32*
-; CHECK-NEXT:    [[TMP152:%.*]] = load i32, i32* [[TMP151]], align 4
-; CHECK-NEXT:    [[TMP153:%.*]] = insertelement <4 x i32> poison, i32 [[TMP152]], i32 0
+; CHECK-NEXT:    [[TMP125:%.*]] = getelementptr inbounds i16, ptr [[ALLOCA]], i64 [[TMP12]]
+; CHECK-NEXT:    [[TMP126:%.*]] = load i32, ptr [[TMP125]], align 4
+; CHECK-NEXT:    [[TMP127:%.*]] = insertelement <4 x i32> poison, i32 [[TMP126]], i32 0
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE27]]
 ; CHECK:       pred.load.continue27:
-; CHECK-NEXT:    [[TMP154:%.*]] = phi <4 x i32> [ poison, [[PRED_LOAD_CONTINUE25]] ], [ [[TMP153]], [[PRED_LOAD_IF26]] ]
-; CHECK-NEXT:    [[TMP155:%.*]] = extractelement <4 x i1> [[TMP63]], i32 1
-; CHECK-NEXT:    br i1 [[TMP155]], label [[PRED_LOAD_IF28:%.*]], label [[PRED_LOAD_CONTINUE29:%.*]]
+; CHECK-NEXT:    [[TMP128:%.*]] = phi <4 x i32> [ poison, [[PRED_LOAD_CONTINUE25]] ], [ [[TMP127]], [[PRED_LOAD_IF26]] ]
+; CHECK-NEXT:    [[TMP129:%.*]] = extractelement <4 x i1> [[TMP63]], i32 1
+; CHECK-NEXT:    br i1 [[TMP129]], label [[PRED_LOAD_IF28:%.*]], label [[PRED_LOAD_CONTINUE29:%.*]]
 ; CHECK:       pred.load.if28:
-; CHECK-NEXT:    [[TMP156:%.*]] = bitcast i32* [[BASE]] to i16*
-; CHECK-NEXT:    [[TMP157:%.*]] = getelementptr inbounds i16, i16* [[TMP156]], i64 [[TMP13]]
-; CHECK-NEXT:    [[TMP158:%.*]] = bitcast i16* [[TMP157]] to i32*
-; CHECK-NEXT:    [[TMP159:%.*]] = load i32, i32* [[TMP158]], align 4
-; CHECK-NEXT:    [[TMP160:%.*]] = insertelement <4 x i32> [[TMP154]], i32 [[TMP159]], i32 1
+; CHECK-NEXT:    [[TMP130:%.*]] = getelementptr inbounds i16, ptr [[ALLOCA]], i64 [[TMP13]]
+; CHECK-NEXT:    [[TMP131:%.*]] = load i32, ptr [[TMP130]], align 4
+; CHECK-NEXT:    [[TMP132:%.*]] = insertelement <4 x i32> [[TMP128]], i32 [[TMP131]], i32 1
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE29]]
 ; CHECK:       pred.load.continue29:
-; CHECK-NEXT:    [[TMP161:%.*]] = phi <4 x i32> [ [[TMP154]], [[PRED_LOAD_CONTINUE27]] ], [ [[TMP160]], [[PRED_LOAD_IF28]] ]
-; CHECK-NEXT:    [[TMP162:%.*]] = extractelement <4 x i1> [[TMP63]], i32 2
-; CHECK-NEXT:    br i1 [[TMP162]], label [[PRED_LOAD_IF30:%.*]], label [[PRED_LOAD_CONTINUE31:%.*]]
+; CHECK-NEXT:    [[TMP133:%.*]] = phi <4 x i32> [ [[TMP128]], [[PRED_LOAD_CONTINUE27]] ], [ [[TMP132]], [[PRED_LOAD_IF28]] ]
+; CHECK-NEXT:    [[TMP134:%.*]] = extractelement <4 x i1> [[TMP63]], i32 2
+; CHECK-NEXT:    br i1 [[TMP134]], label [[PRED_LOAD_IF30:%.*]], label [[PRED_LOAD_CONTINUE31:%.*]]
 ; CHECK:       pred.load.if30:
-; CHECK-NEXT:    [[TMP163:%.*]] = bitcast i32* [[BASE]] to i16*
-; CHECK-NEXT:    [[TMP164:%.*]] = getelementptr inbounds i16, i16* [[TMP163]], i64 [[TMP14]]
-; CHECK-NEXT:    [[TMP165:%.*]] = bitcast i16* [[TMP164]] to i32*
-; CHECK-NEXT:    [[TMP166:%.*]] = load i32, i32* [[TMP165]], align 4
-; CHECK-NEXT:    [[TMP167:%.*]] = insertelement <4 x i32> [[TMP161]], i32 [[TMP166]], i32 2
+; CHECK-NEXT:    [[TMP135:%.*]] = getelementptr inbounds i16, ptr [[ALLOCA]], i64 [[TMP14]]
+; CHECK-NEXT:    [[TMP136:%.*]] = load i32, ptr [[TMP135]], align 4
+; CHECK-NEXT:    [[TMP137:%.*]] = insertelement <4 x i32> [[TMP133]], i32 [[TMP136]], i32 2
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE31]]
 ; CHECK:       pred.load.continue31:
-; CHECK-NEXT:    [[TMP168:%.*]] = phi <4 x i32> [ [[TMP161]], [[PRED_LOAD_CONTINUE29]] ], [ [[TMP167]], [[PRED_LOAD_IF30]] ]
-; CHECK-NEXT:    [[TMP169:%.*]] = extractelement <4 x i1> [[TMP63]], i32 3
-; CHECK-NEXT:    br i1 [[TMP169]], label [[PRED_LOAD_IF32:%.*]], label [[PRED_LOAD_CONTINUE33]]
+; CHECK-NEXT:    [[TMP138:%.*]] = phi <4 x i32> [ [[TMP133]], [[PRED_LOAD_CONTINUE29]] ], [ [[TMP137]], [[PRED_LOAD_IF30]] ]
+; CHECK-NEXT:    [[TMP139:%.*]] = extractelement <4 x i1> [[TMP63]], i32 3
+; CHECK-NEXT:    br i1 [[TMP139]], label [[PRED_LOAD_IF32:%.*]], label [[PRED_LOAD_CONTINUE33]]
 ; CHECK:       pred.load.if32:
-; CHECK-NEXT:    [[TMP170:%.*]] = bitcast i32* [[BASE]] to i16*
-; CHECK-NEXT:    [[TMP171:%.*]] = getelementptr inbounds i16, i16* [[TMP170]], i64 [[TMP15]]
-; CHECK-NEXT:    [[TMP172:%.*]] = bitcast i16* [[TMP171]] to i32*
-; CHECK-NEXT:    [[TMP173:%.*]] = load i32, i32* [[TMP172]], align 4
-; CHECK-NEXT:    [[TMP174:%.*]] = insertelement <4 x i32> [[TMP168]], i32 [[TMP173]], i32 3
+; CHECK-NEXT:    [[TMP140:%.*]] = getelementptr inbounds i16, ptr [[ALLOCA]], i64 [[TMP15]]
+; CHECK-NEXT:    [[TMP141:%.*]] = load i32, ptr [[TMP140]], align 4
+; CHECK-NEXT:    [[TMP142:%.*]] = insertelement <4 x i32> [[TMP138]], i32 [[TMP141]], i32 3
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE33]]
 ; CHECK:       pred.load.continue33:
-; CHECK-NEXT:    [[TMP175:%.*]] = phi <4 x i32> [ [[TMP168]], [[PRED_LOAD_CONTINUE31]] ], [ [[TMP174]], [[PRED_LOAD_IF32]] ]
-; CHECK-NEXT:    [[TMP176:%.*]] = xor <4 x i1> [[TMP39]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT:    [[TMP177:%.*]] = xor <4 x i1> [[TMP47]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT:    [[TMP178:%.*]] = xor <4 x i1> [[TMP55]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT:    [[TMP179:%.*]] = xor <4 x i1> [[TMP63]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[TMP39]], <4 x i32> [[TMP91]], <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[PREDPHI34:%.*]] = select <4 x i1> [[TMP47]], <4 x i32> [[TMP119]], <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[PREDPHI35:%.*]] = select <4 x i1> [[TMP55]], <4 x i32> [[TMP147]], <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[PREDPHI36:%.*]] = select <4 x i1> [[TMP63]], <4 x i32> [[TMP175]], <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP180]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]]
-; CHECK-NEXT:    [[TMP181]] = add <4 x i32> [[VEC_PHI1]], [[PREDPHI34]]
-; CHECK-NEXT:    [[TMP182]] = add <4 x i32> [[VEC_PHI2]], [[PREDPHI35]]
-; CHECK-NEXT:    [[TMP183]] = add <4 x i32> [[VEC_PHI3]], [[PREDPHI36]]
+; CHECK-NEXT:    [[TMP143:%.*]] = phi <4 x i32> [ [[TMP138]], [[PRED_LOAD_CONTINUE31]] ], [ [[TMP142]], [[PRED_LOAD_IF32]] ]
+; CHECK-NEXT:    [[TMP144:%.*]] = xor <4 x i1> [[TMP39]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT:    [[TMP145:%.*]] = xor <4 x i1> [[TMP47]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT:    [[TMP146:%.*]] = xor <4 x i1> [[TMP55]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT:    [[TMP147:%.*]] = xor <4 x i1> [[TMP63]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[TMP39]], <4 x i32> [[TMP83]], <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[PREDPHI34:%.*]] = select <4 x i1> [[TMP47]], <4 x i32> [[TMP103]], <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[PREDPHI35:%.*]] = select <4 x i1> [[TMP55]], <4 x i32> [[TMP123]], <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[PREDPHI36:%.*]] = select <4 x i1> [[TMP63]], <4 x i32> [[TMP143]], <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP148]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]]
+; CHECK-NEXT:    [[TMP149]] = add <4 x i32> [[VEC_PHI1]], [[PREDPHI34]]
+; CHECK-NEXT:    [[TMP150]] = add <4 x i32> [[VEC_PHI2]], [[PREDPHI35]]
+; CHECK-NEXT:    [[TMP151]] = add <4 x i32> [[VEC_PHI3]], [[PREDPHI36]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; CHECK-NEXT:    [[TMP184:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
-; CHECK-NEXT:    br i1 [[TMP184]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-NEXT:    [[TMP152:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
+; CHECK-NEXT:    br i1 [[TMP152]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[BIN_RDX:%.*]] = add <4 x i32> [[TMP181]], [[TMP180]]
-; CHECK-NEXT:    [[BIN_RDX37:%.*]] = add <4 x i32> [[TMP182]], [[BIN_RDX]]
-; CHECK-NEXT:    [[BIN_RDX38:%.*]] = add <4 x i32> [[TMP183]], [[BIN_RDX37]]
-; CHECK-NEXT:    [[TMP185:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX38]])
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = add <4 x i32> [[TMP149]], [[TMP148]]
+; CHECK-NEXT:    [[BIN_RDX37:%.*]] = add <4 x i32> [[TMP150]], [[BIN_RDX]]
+; CHECK-NEXT:    [[BIN_RDX38:%.*]] = add <4 x i32> [[TMP151]], [[BIN_RDX37]]
+; CHECK-NEXT:    [[TMP153:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX38]])
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 4096, 4096
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP185]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP153]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ]
 ; CHECK-NEXT:    [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[LATCH]] ]
 ; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT:    [[TEST_ADDR:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[IV]]
-; CHECK-NEXT:    [[EARLYCND:%.*]] = load i1, i1* [[TEST_ADDR]], align 1
+; CHECK-NEXT:    [[TEST_ADDR:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[IV]]
+; CHECK-NEXT:    [[EARLYCND:%.*]] = load i1, ptr [[TEST_ADDR]], align 1
 ; CHECK-NEXT:    br i1 [[EARLYCND]], label [[PRED:%.*]], label [[LATCH]]
 ; CHECK:       pred:
-; CHECK-NEXT:    [[BASE_I16P:%.*]] = bitcast i32* [[BASE]] to i16*
-; CHECK-NEXT:    [[ADDR_I16P:%.*]] = getelementptr inbounds i16, i16* [[BASE_I16P]], i64 [[IV]]
-; CHECK-NEXT:    [[ADDR:%.*]] = bitcast i16* [[ADDR_I16P]] to i32*
-; CHECK-NEXT:    [[VAL:%.*]] = load i32, i32* [[ADDR]], align 4
+; CHECK-NEXT:    [[ADDR_I16P:%.*]] = getelementptr inbounds i16, ptr [[ALLOCA]], i64 [[IV]]
+; CHECK-NEXT:    [[VAL:%.*]] = load i32, ptr [[ADDR_I16P]], align 4
 ; CHECK-NEXT:    br label [[LATCH]]
 ; CHECK:       latch:
 ; CHECK-NEXT:    [[VAL_PHI:%.*]] = phi i32 [ 0, [[LOOP]] ], [ [[VAL]], [[PRED]] ]
@@ -790,26 +741,23 @@ define i32 @test_step_narrower_than_access(i64 %len, i1* %test_base) {
 ; CHECK-NEXT:    [[EXIT:%.*]] = icmp ugt i64 [[IV]], 4094
 ; CHECK-NEXT:    br i1 [[EXIT]], label [[LOOP_EXIT]], label [[LOOP]], !llvm.loop [[LOOP9:![0-9]+]]
 ; CHECK:       loop_exit:
-; CHECK-NEXT:    [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP185]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP153]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    ret i32 [[ACCUM_NEXT_LCSSA]]
 ;
 entry:
   %alloca = alloca [4096 x i32]
-  %base = bitcast [4096 x i32]* %alloca to i32*
-  call void @init(i32* %base)
+  call void @init(ptr %alloca)
   br label %loop
 loop:
   %iv = phi i64 [ 0, %entry ], [ %iv.next, %latch ]
   %accum = phi i32 [ 0, %entry ], [ %accum.next, %latch ]
   %iv.next = add i64 %iv, 1
-  %test_addr = getelementptr inbounds i1, i1* %test_base, i64 %iv
-  %earlycnd = load i1, i1* %test_addr
+  %test_addr = getelementptr inbounds i1, ptr %test_base, i64 %iv
+  %earlycnd = load i1, ptr %test_addr
   br i1 %earlycnd, label %pred, label %latch
 pred:
-  %base.i16p = bitcast i32* %base to i16*
-  %addr.i16p = getelementptr inbounds i16, i16* %base.i16p, i64 %iv
-  %addr = bitcast i16* %addr.i16p to i32*
-  %val = load i32, i32* %addr
+  %addr.i16p = getelementptr inbounds i16, ptr %alloca, i64 %iv
+  %val = load i32, ptr %addr.i16p
   br label %latch
 latch:
   %val.phi = phi i32 [0, %loop], [%val, %pred]
@@ -821,12 +769,11 @@ loop_exit:
   ret i32 %accum.next
 }
 
-define i32 @test_max_trip_count(i64 %len, i1* %test_base, i64 %n) {
+define i32 @test_max_trip_count(i64 %len, ptr %test_base, i64 %n) {
 ; CHECK-LABEL: @test_max_trip_count(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[ALLOCA:%.*]] = alloca [4096 x i32], align 4
-; CHECK-NEXT:    [[BASE:%.*]] = bitcast [4096 x i32]* [[ALLOCA]] to i32*
-; CHECK-NEXT:    call void @init(i32* [[BASE]])
+; CHECK-NEXT:    call void @init(ptr [[ALLOCA]])
 ; CHECK-NEXT:    [[MIN_CMP:%.*]] = icmp ult i64 4096, [[N:%.*]]
 ; CHECK-NEXT:    [[MIN_N:%.*]] = select i1 [[MIN_CMP]], i64 4096, i64 [[N]]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add nuw nsw i64 [[MIN_N]], 2
@@ -838,10 +785,10 @@ define i32 @test_max_trip_count(i64 %len, i1* %test_base, i64 %n) {
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP81:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP82:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP83:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP84:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP77:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP78:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP79:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP80:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 1
 ; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 2
@@ -858,106 +805,102 @@ define i32 @test_max_trip_count(i64 %len, i1* %test_base, i64 %n) {
 ; CHECK-NEXT:    [[TMP14:%.*]] = add i64 [[INDEX]], 13
 ; CHECK-NEXT:    [[TMP15:%.*]] = add i64 [[INDEX]], 14
 ; CHECK-NEXT:    [[TMP16:%.*]] = add i64 [[INDEX]], 15
-; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE:%.*]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP2]]
-; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP3]]
-; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP5]]
-; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP6]]
-; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP7]]
-; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP8]]
-; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP9]]
-; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP10]]
-; CHECK-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP11]]
-; CHECK-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP12]]
-; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP13]]
-; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP14]]
-; CHECK-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP15]]
-; CHECK-NEXT:    [[TMP32:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP16]]
-; CHECK-NEXT:    [[TMP33:%.*]] = load i1, i1* [[TMP17]], align 1
-; CHECK-NEXT:    [[TMP34:%.*]] = load i1, i1* [[TMP18]], align 1
-; CHECK-NEXT:    [[TMP35:%.*]] = load i1, i1* [[TMP19]], align 1
-; CHECK-NEXT:    [[TMP36:%.*]] = load i1, i1* [[TMP20]], align 1
+; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE:%.*]], i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP2]]
+; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP5]]
+; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP6]]
+; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP7]]
+; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP8]]
+; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP9]]
+; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP10]]
+; CHECK-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP11]]
+; CHECK-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP12]]
+; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP13]]
+; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP14]]
+; CHECK-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP15]]
+; CHECK-NEXT:    [[TMP32:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP16]]
+; CHECK-NEXT:    [[TMP33:%.*]] = load i1, ptr [[TMP17]], align 1
+; CHECK-NEXT:    [[TMP34:%.*]] = load i1, ptr [[TMP18]], align 1
+; CHECK-NEXT:    [[TMP35:%.*]] = load i1, ptr [[TMP19]], align 1
+; CHECK-NEXT:    [[TMP36:%.*]] = load i1, ptr [[TMP20]], align 1
 ; CHECK-NEXT:    [[TMP37:%.*]] = insertelement <4 x i1> poison, i1 [[TMP33]], i32 0
 ; CHECK-NEXT:    [[TMP38:%.*]] = insertelement <4 x i1> [[TMP37]], i1 [[TMP34]], i32 1
 ; CHECK-NEXT:    [[TMP39:%.*]] = insertelement <4 x i1> [[TMP38]], i1 [[TMP35]], i32 2
 ; CHECK-NEXT:    [[TMP40:%.*]] = insertelement <4 x i1> [[TMP39]], i1 [[TMP36]], i32 3
-; CHECK-NEXT:    [[TMP41:%.*]] = load i1, i1* [[TMP21]], align 1
-; CHECK-NEXT:    [[TMP42:%.*]] = load i1, i1* [[TMP22]], align 1
-; CHECK-NEXT:    [[TMP43:%.*]] = load i1, i1* [[TMP23]], align 1
-; CHECK-NEXT:    [[TMP44:%.*]] = load i1, i1* [[TMP24]], align 1
+; CHECK-NEXT:    [[TMP41:%.*]] = load i1, ptr [[TMP21]], align 1
+; CHECK-NEXT:    [[TMP42:%.*]] = load i1, ptr [[TMP22]], align 1
+; CHECK-NEXT:    [[TMP43:%.*]] = load i1, ptr [[TMP23]], align 1
+; CHECK-NEXT:    [[TMP44:%.*]] = load i1, ptr [[TMP24]], align 1
 ; CHECK-NEXT:    [[TMP45:%.*]] = insertelement <4 x i1> poison, i1 [[TMP41]], i32 0
 ; CHECK-NEXT:    [[TMP46:%.*]] = insertelement <4 x i1> [[TMP45]], i1 [[TMP42]], i32 1
 ; CHECK-NEXT:    [[TMP47:%.*]] = insertelement <4 x i1> [[TMP46]], i1 [[TMP43]], i32 2
 ; CHECK-NEXT:    [[TMP48:%.*]] = insertelement <4 x i1> [[TMP47]], i1 [[TMP44]], i32 3
-; CHECK-NEXT:    [[TMP49:%.*]] = load i1, i1* [[TMP25]], align 1
-; CHECK-NEXT:    [[TMP50:%.*]] = load i1, i1* [[TMP26]], align 1
-; CHECK-NEXT:    [[TMP51:%.*]] = load i1, i1* [[TMP27]], align 1
-; CHECK-NEXT:    [[TMP52:%.*]] = load i1, i1* [[TMP28]], align 1
+; CHECK-NEXT:    [[TMP49:%.*]] = load i1, ptr [[TMP25]], align 1
+; CHECK-NEXT:    [[TMP50:%.*]] = load i1, ptr [[TMP26]], align 1
+; CHECK-NEXT:    [[TMP51:%.*]] = load i1, ptr [[TMP27]], align 1
+; CHECK-NEXT:    [[TMP52:%.*]] = load i1, ptr [[TMP28]], align 1
 ; CHECK-NEXT:    [[TMP53:%.*]] = insertelement <4 x i1> poison, i1 [[TMP49]], i32 0
 ; CHECK-NEXT:    [[TMP54:%.*]] = insertelement <4 x i1> [[TMP53]], i1 [[TMP50]], i32 1
 ; CHECK-NEXT:    [[TMP55:%.*]] = insertelement <4 x i1> [[TMP54]], i1 [[TMP51]], i32 2
 ; CHECK-NEXT:    [[TMP56:%.*]] = insertelement <4 x i1> [[TMP55]], i1 [[TMP52]], i32 3
-; CHECK-NEXT:    [[TMP57:%.*]] = load i1, i1* [[TMP29]], align 1
-; CHECK-NEXT:    [[TMP58:%.*]] = load i1, i1* [[TMP30]], align 1
-; CHECK-NEXT:    [[TMP59:%.*]] = load i1, i1* [[TMP31]], align 1
-; CHECK-NEXT:    [[TMP60:%.*]] = load i1, i1* [[TMP32]], align 1
+; CHECK-NEXT:    [[TMP57:%.*]] = load i1, ptr [[TMP29]], align 1
+; CHECK-NEXT:    [[TMP58:%.*]] = load i1, ptr [[TMP30]], align 1
+; CHECK-NEXT:    [[TMP59:%.*]] = load i1, ptr [[TMP31]], align 1
+; CHECK-NEXT:    [[TMP60:%.*]] = load i1, ptr [[TMP32]], align 1
 ; CHECK-NEXT:    [[TMP61:%.*]] = insertelement <4 x i1> poison, i1 [[TMP57]], i32 0
 ; CHECK-NEXT:    [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 1
 ; CHECK-NEXT:    [[TMP63:%.*]] = insertelement <4 x i1> [[TMP62]], i1 [[TMP59]], i32 2
 ; CHECK-NEXT:    [[TMP64:%.*]] = insertelement <4 x i1> [[TMP63]], i1 [[TMP60]], i32 3
-; CHECK-NEXT:    [[TMP65:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP66:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP5]]
-; CHECK-NEXT:    [[TMP67:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP9]]
-; CHECK-NEXT:    [[TMP68:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP13]]
-; CHECK-NEXT:    [[TMP69:%.*]] = getelementptr i32, i32* [[TMP65]], i32 0
-; CHECK-NEXT:    [[TMP70:%.*]] = bitcast i32* [[TMP69]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP70]], i32 4, <4 x i1> [[TMP40]], <4 x i32> poison)
-; CHECK-NEXT:    [[TMP71:%.*]] = getelementptr i32, i32* [[TMP65]], i32 4
-; CHECK-NEXT:    [[TMP72:%.*]] = bitcast i32* [[TMP71]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP72]], i32 4, <4 x i1> [[TMP48]], <4 x i32> poison)
-; CHECK-NEXT:    [[TMP73:%.*]] = getelementptr i32, i32* [[TMP65]], i32 8
-; CHECK-NEXT:    [[TMP74:%.*]] = bitcast i32* [[TMP73]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP74]], i32 4, <4 x i1> [[TMP56]], <4 x i32> poison)
-; CHECK-NEXT:    [[TMP75:%.*]] = getelementptr i32, i32* [[TMP65]], i32 12
-; CHECK-NEXT:    [[TMP76:%.*]] = bitcast i32* [[TMP75]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP76]], i32 4, <4 x i1> [[TMP64]], <4 x i32> poison)
-; CHECK-NEXT:    [[TMP77:%.*]] = xor <4 x i1> [[TMP40]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT:    [[TMP78:%.*]] = xor <4 x i1> [[TMP48]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT:    [[TMP79:%.*]] = xor <4 x i1> [[TMP56]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT:    [[TMP80:%.*]] = xor <4 x i1> [[TMP64]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT:    [[TMP65:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP66:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP5]]
+; CHECK-NEXT:    [[TMP67:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP9]]
+; CHECK-NEXT:    [[TMP68:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP13]]
+; CHECK-NEXT:    [[TMP69:%.*]] = getelementptr i32, ptr [[TMP65]], i32 0
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP69]], i32 4, <4 x i1> [[TMP40]], <4 x i32> poison)
+; CHECK-NEXT:    [[TMP70:%.*]] = getelementptr i32, ptr [[TMP65]], i32 4
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP70]], i32 4, <4 x i1> [[TMP48]], <4 x i32> poison)
+; CHECK-NEXT:    [[TMP71:%.*]] = getelementptr i32, ptr [[TMP65]], i32 8
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP71]], i32 4, <4 x i1> [[TMP56]], <4 x i32> poison)
+; CHECK-NEXT:    [[TMP72:%.*]] = getelementptr i32, ptr [[TMP65]], i32 12
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP72]], i32 4, <4 x i1> [[TMP64]], <4 x i32> poison)
+; CHECK-NEXT:    [[TMP73:%.*]] = xor <4 x i1> [[TMP40]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT:    [[TMP74:%.*]] = xor <4 x i1> [[TMP48]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT:    [[TMP75:%.*]] = xor <4 x i1> [[TMP56]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT:    [[TMP76:%.*]] = xor <4 x i1> [[TMP64]], <i1 true, i1 true, i1 true, i1 true>
 ; CHECK-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[TMP40]], <4 x i32> [[WIDE_MASKED_LOAD]], <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[PREDPHI7:%.*]] = select <4 x i1> [[TMP48]], <4 x i32> [[WIDE_MASKED_LOAD4]], <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[PREDPHI8:%.*]] = select <4 x i1> [[TMP56]], <4 x i32> [[WIDE_MASKED_LOAD5]], <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[PREDPHI9:%.*]] = select <4 x i1> [[TMP64]], <4 x i32> [[WIDE_MASKED_LOAD6]], <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP81]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]]
-; CHECK-NEXT:    [[TMP82]] = add <4 x i32> [[VEC_PHI1]], [[PREDPHI7]]
-; CHECK-NEXT:    [[TMP83]] = add <4 x i32> [[VEC_PHI2]], [[PREDPHI8]]
-; CHECK-NEXT:    [[TMP84]] = add <4 x i32> [[VEC_PHI3]], [[PREDPHI9]]
+; CHECK-NEXT:    [[TMP77]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]]
+; CHECK-NEXT:    [[TMP78]] = add <4 x i32> [[VEC_PHI1]], [[PREDPHI7]]
+; CHECK-NEXT:    [[TMP79]] = add <4 x i32> [[VEC_PHI2]], [[PREDPHI8]]
+; CHECK-NEXT:    [[TMP80]] = add <4 x i32> [[VEC_PHI3]], [[PREDPHI9]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; CHECK-NEXT:    [[TMP85:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP85]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK-NEXT:    [[TMP81:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP81]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[BIN_RDX:%.*]] = add <4 x i32> [[TMP82]], [[TMP81]]
-; CHECK-NEXT:    [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP83]], [[BIN_RDX]]
-; CHECK-NEXT:    [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP84]], [[BIN_RDX10]]
-; CHECK-NEXT:    [[TMP86:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]])
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = add <4 x i32> [[TMP78]], [[TMP77]]
+; CHECK-NEXT:    [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP79]], [[BIN_RDX]]
+; CHECK-NEXT:    [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP80]], [[BIN_RDX10]]
+; CHECK-NEXT:    [[TMP82:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]])
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP86]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP82]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ]
 ; CHECK-NEXT:    [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[LATCH]] ]
 ; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT:    [[TEST_ADDR:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[IV]]
-; CHECK-NEXT:    [[EARLYCND:%.*]] = load i1, i1* [[TEST_ADDR]], align 1
+; CHECK-NEXT:    [[TEST_ADDR:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[IV]]
+; CHECK-NEXT:    [[EARLYCND:%.*]] = load i1, ptr [[TEST_ADDR]], align 1
 ; CHECK-NEXT:    br i1 [[EARLYCND]], label [[PRED:%.*]], label [[LATCH]]
 ; CHECK:       pred:
-; CHECK-NEXT:    [[ADDR:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[IV]]
-; CHECK-NEXT:    [[VAL:%.*]] = load i32, i32* [[ADDR]], align 4
+; CHECK-NEXT:    [[ADDR:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[IV]]
+; CHECK-NEXT:    [[VAL:%.*]] = load i32, ptr [[ADDR]], align 4
 ; CHECK-NEXT:    br label [[LATCH]]
 ; CHECK:       latch:
 ; CHECK-NEXT:    [[VAL_PHI:%.*]] = phi i32 [ 0, [[LOOP]] ], [ [[VAL]], [[PRED]] ]
@@ -965,13 +908,12 @@ define i32 @test_max_trip_count(i64 %len, i1* %test_base, i64 %n) {
 ; CHECK-NEXT:    [[EXIT:%.*]] = icmp ugt i64 [[IV]], [[MIN_N]]
 ; CHECK-NEXT:    br i1 [[EXIT]], label [[LOOP_EXIT]], label [[LOOP]], !llvm.loop [[LOOP11:![0-9]+]]
 ; CHECK:       loop_exit:
-; CHECK-NEXT:    [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP86]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP82]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    ret i32 [[ACCUM_NEXT_LCSSA]]
 ;
 entry:
   %alloca = alloca [4096 x i32]
-  %base = bitcast [4096 x i32]* %alloca to i32*
-  call void @init(i32* %base)
+  call void @init(ptr %alloca)
   %min.cmp = icmp ult i64 4096, %n
   %min.n = select i1 %min.cmp, i64 4096, i64 %n
   br label %loop
@@ -979,12 +921,12 @@ loop:
   %iv = phi i64 [ 0, %entry ], [ %iv.next, %latch ]
   %accum = phi i32 [ 0, %entry ], [ %accum.next, %latch ]
   %iv.next = add i64 %iv, 1
-  %test_addr = getelementptr inbounds i1, i1* %test_base, i64 %iv
-  %earlycnd = load i1, i1* %test_addr
+  %test_addr = getelementptr inbounds i1, ptr %test_base, i64 %iv
+  %earlycnd = load i1, ptr %test_addr
   br i1 %earlycnd, label %pred, label %latch
 pred:
-  %addr = getelementptr inbounds i32, i32* %base, i64 %iv
-  %val = load i32, i32* %addr
+  %addr = getelementptr inbounds i32, ptr %alloca, i64 %iv
+  %val = load i32, ptr %addr
   br label %latch
 latch:
   %val.phi = phi i32 [0, %loop], [%val, %pred]
@@ -999,21 +941,20 @@ loop_exit:
 
 
 
-define i32 @test_non_zero_start(i64 %len, i1* %test_base) {
+define i32 @test_non_zero_start(i64 %len, ptr %test_base) {
 ; CHECK-LABEL: @test_non_zero_start(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[ALLOCA:%.*]] = alloca [4096 x i32], align 4
-; CHECK-NEXT:    [[BASE:%.*]] = bitcast [4096 x i32]* [[ALLOCA]] to i32*
-; CHECK-NEXT:    call void @init(i32* [[BASE]])
+; CHECK-NEXT:    call void @init(ptr [[ALLOCA]])
 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP80:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP81:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP82:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP83:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP76:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP77:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP78:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP79:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = add i64 1024, [[INDEX]]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 1
@@ -1031,106 +972,102 @@ define i32 @test_non_zero_start(i64 %len, i1* %test_base) {
 ; CHECK-NEXT:    [[TMP13:%.*]] = add i64 [[OFFSET_IDX]], 13
 ; CHECK-NEXT:    [[TMP14:%.*]] = add i64 [[OFFSET_IDX]], 14
 ; CHECK-NEXT:    [[TMP15:%.*]] = add i64 [[OFFSET_IDX]], 15
-; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP2]]
-; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP3]]
-; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP5]]
-; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP6]]
-; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP7]]
-; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP8]]
-; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP9]]
-; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP10]]
-; CHECK-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP11]]
-; CHECK-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP12]]
-; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP13]]
-; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP14]]
-; CHECK-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP15]]
-; CHECK-NEXT:    [[TMP32:%.*]] = load i1, i1* [[TMP16]], align 1
-; CHECK-NEXT:    [[TMP33:%.*]] = load i1, i1* [[TMP17]], align 1
-; CHECK-NEXT:    [[TMP34:%.*]] = load i1, i1* [[TMP18]], align 1
-; CHECK-NEXT:    [[TMP35:%.*]] = load i1, i1* [[TMP19]], align 1
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP2]]
+; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP5]]
+; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP6]]
+; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP7]]
+; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP8]]
+; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP9]]
+; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP10]]
+; CHECK-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP11]]
+; CHECK-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP12]]
+; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP13]]
+; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP14]]
+; CHECK-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP15]]
+; CHECK-NEXT:    [[TMP32:%.*]] = load i1, ptr [[TMP16]], align 1
+; CHECK-NEXT:    [[TMP33:%.*]] = load i1, ptr [[TMP17]], align 1
+; CHECK-NEXT:    [[TMP34:%.*]] = load i1, ptr [[TMP18]], align 1
+; CHECK-NEXT:    [[TMP35:%.*]] = load i1, ptr [[TMP19]], align 1
 ; CHECK-NEXT:    [[TMP36:%.*]] = insertelement <4 x i1> poison, i1 [[TMP32]], i32 0
 ; CHECK-NEXT:    [[TMP37:%.*]] = insertelement <4 x i1> [[TMP36]], i1 [[TMP33]], i32 1
 ; CHECK-NEXT:    [[TMP38:%.*]] = insertelement <4 x i1> [[TMP37]], i1 [[TMP34]], i32 2
 ; CHECK-NEXT:    [[TMP39:%.*]] = insertelement <4 x i1> [[TMP38]], i1 [[TMP35]], i32 3
-; CHECK-NEXT:    [[TMP40:%.*]] = load i1, i1* [[TMP20]], align 1
-; CHECK-NEXT:    [[TMP41:%.*]] = load i1, i1* [[TMP21]], align 1
-; CHECK-NEXT:    [[TMP42:%.*]] = load i1, i1* [[TMP22]], align 1
-; CHECK-NEXT:    [[TMP43:%.*]] = load i1, i1* [[TMP23]], align 1
+; CHECK-NEXT:    [[TMP40:%.*]] = load i1, ptr [[TMP20]], align 1
+; CHECK-NEXT:    [[TMP41:%.*]] = load i1, ptr [[TMP21]], align 1
+; CHECK-NEXT:    [[TMP42:%.*]] = load i1, ptr [[TMP22]], align 1
+; CHECK-NEXT:    [[TMP43:%.*]] = load i1, ptr [[TMP23]], align 1
 ; CHECK-NEXT:    [[TMP44:%.*]] = insertelement <4 x i1> poison, i1 [[TMP40]], i32 0
 ; CHECK-NEXT:    [[TMP45:%.*]] = insertelement <4 x i1> [[TMP44]], i1 [[TMP41]], i32 1
 ; CHECK-NEXT:    [[TMP46:%.*]] = insertelement <4 x i1> [[TMP45]], i1 [[TMP42]], i32 2
 ; CHECK-NEXT:    [[TMP47:%.*]] = insertelement <4 x i1> [[TMP46]], i1 [[TMP43]], i32 3
-; CHECK-NEXT:    [[TMP48:%.*]] = load i1, i1* [[TMP24]], align 1
-; CHECK-NEXT:    [[TMP49:%.*]] = load i1, i1* [[TMP25]], align 1
-; CHECK-NEXT:    [[TMP50:%.*]] = load i1, i1* [[TMP26]], align 1
-; CHECK-NEXT:    [[TMP51:%.*]] = load i1, i1* [[TMP27]], align 1
+; CHECK-NEXT:    [[TMP48:%.*]] = load i1, ptr [[TMP24]], align 1
+; CHECK-NEXT:    [[TMP49:%.*]] = load i1, ptr [[TMP25]], align 1
+; CHECK-NEXT:    [[TMP50:%.*]] = load i1, ptr [[TMP26]], align 1
+; CHECK-NEXT:    [[TMP51:%.*]] = load i1, ptr [[TMP27]], align 1
 ; CHECK-NEXT:    [[TMP52:%.*]] = insertelement <4 x i1> poison, i1 [[TMP48]], i32 0
 ; CHECK-NEXT:    [[TMP53:%.*]] = insertelement <4 x i1> [[TMP52]], i1 [[TMP49]], i32 1
 ; CHECK-NEXT:    [[TMP54:%.*]] = insertelement <4 x i1> [[TMP53]], i1 [[TMP50]], i32 2
 ; CHECK-NEXT:    [[TMP55:%.*]] = insertelement <4 x i1> [[TMP54]], i1 [[TMP51]], i32 3
-; CHECK-NEXT:    [[TMP56:%.*]] = load i1, i1* [[TMP28]], align 1
-; CHECK-NEXT:    [[TMP57:%.*]] = load i1, i1* [[TMP29]], align 1
-; CHECK-NEXT:    [[TMP58:%.*]] = load i1, i1* [[TMP30]], align 1
-; CHECK-NEXT:    [[TMP59:%.*]] = load i1, i1* [[TMP31]], align 1
+; CHECK-NEXT:    [[TMP56:%.*]] = load i1, ptr [[TMP28]], align 1
+; CHECK-NEXT:    [[TMP57:%.*]] = load i1, ptr [[TMP29]], align 1
+; CHECK-NEXT:    [[TMP58:%.*]] = load i1, ptr [[TMP30]], align 1
+; CHECK-NEXT:    [[TMP59:%.*]] = load i1, ptr [[TMP31]], align 1
 ; CHECK-NEXT:    [[TMP60:%.*]] = insertelement <4 x i1> poison, i1 [[TMP56]], i32 0
 ; CHECK-NEXT:    [[TMP61:%.*]] = insertelement <4 x i1> [[TMP60]], i1 [[TMP57]], i32 1
 ; CHECK-NEXT:    [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 2
 ; CHECK-NEXT:    [[TMP63:%.*]] = insertelement <4 x i1> [[TMP62]], i1 [[TMP59]], i32 3
-; CHECK-NEXT:    [[TMP64:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP65:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP66:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP8]]
-; CHECK-NEXT:    [[TMP67:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP12]]
-; CHECK-NEXT:    [[TMP68:%.*]] = getelementptr i32, i32* [[TMP64]], i32 0
-; CHECK-NEXT:    [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP69]], i32 4, <4 x i1> [[TMP39]], <4 x i32> poison)
-; CHECK-NEXT:    [[TMP70:%.*]] = getelementptr i32, i32* [[TMP64]], i32 4
-; CHECK-NEXT:    [[TMP71:%.*]] = bitcast i32* [[TMP70]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP71]], i32 4, <4 x i1> [[TMP47]], <4 x i32> poison)
-; CHECK-NEXT:    [[TMP72:%.*]] = getelementptr i32, i32* [[TMP64]], i32 8
-; CHECK-NEXT:    [[TMP73:%.*]] = bitcast i32* [[TMP72]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP73]], i32 4, <4 x i1> [[TMP55]], <4 x i32> poison)
-; CHECK-NEXT:    [[TMP74:%.*]] = getelementptr i32, i32* [[TMP64]], i32 12
-; CHECK-NEXT:    [[TMP75:%.*]] = bitcast i32* [[TMP74]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP75]], i32 4, <4 x i1> [[TMP63]], <4 x i32> poison)
-; CHECK-NEXT:    [[TMP76:%.*]] = xor <4 x i1> [[TMP39]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT:    [[TMP77:%.*]] = xor <4 x i1> [[TMP47]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT:    [[TMP78:%.*]] = xor <4 x i1> [[TMP55]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT:    [[TMP79:%.*]] = xor <4 x i1> [[TMP63]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT:    [[TMP64:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP65:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP66:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP8]]
+; CHECK-NEXT:    [[TMP67:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP12]]
+; CHECK-NEXT:    [[TMP68:%.*]] = getelementptr i32, ptr [[TMP64]], i32 0
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP68]], i32 4, <4 x i1> [[TMP39]], <4 x i32> poison)
+; CHECK-NEXT:    [[TMP69:%.*]] = getelementptr i32, ptr [[TMP64]], i32 4
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP69]], i32 4, <4 x i1> [[TMP47]], <4 x i32> poison)
+; CHECK-NEXT:    [[TMP70:%.*]] = getelementptr i32, ptr [[TMP64]], i32 8
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP70]], i32 4, <4 x i1> [[TMP55]], <4 x i32> poison)
+; CHECK-NEXT:    [[TMP71:%.*]] = getelementptr i32, ptr [[TMP64]], i32 12
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP71]], i32 4, <4 x i1> [[TMP63]], <4 x i32> poison)
+; CHECK-NEXT:    [[TMP72:%.*]] = xor <4 x i1> [[TMP39]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT:    [[TMP73:%.*]] = xor <4 x i1> [[TMP47]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT:    [[TMP74:%.*]] = xor <4 x i1> [[TMP55]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT:    [[TMP75:%.*]] = xor <4 x i1> [[TMP63]], <i1 true, i1 true, i1 true, i1 true>
 ; CHECK-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[TMP39]], <4 x i32> [[WIDE_MASKED_LOAD]], <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[PREDPHI7:%.*]] = select <4 x i1> [[TMP47]], <4 x i32> [[WIDE_MASKED_LOAD4]], <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[PREDPHI8:%.*]] = select <4 x i1> [[TMP55]], <4 x i32> [[WIDE_MASKED_LOAD5]], <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[PREDPHI9:%.*]] = select <4 x i1> [[TMP63]], <4 x i32> [[WIDE_MASKED_LOAD6]], <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP80]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]]
-; CHECK-NEXT:    [[TMP81]] = add <4 x i32> [[VEC_PHI1]], [[PREDPHI7]]
-; CHECK-NEXT:    [[TMP82]] = add <4 x i32> [[VEC_PHI2]], [[PREDPHI8]]
-; CHECK-NEXT:    [[TMP83]] = add <4 x i32> [[VEC_PHI3]], [[PREDPHI9]]
+; CHECK-NEXT:    [[TMP76]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]]
+; CHECK-NEXT:    [[TMP77]] = add <4 x i32> [[VEC_PHI1]], [[PREDPHI7]]
+; CHECK-NEXT:    [[TMP78]] = add <4 x i32> [[VEC_PHI2]], [[PREDPHI8]]
+; CHECK-NEXT:    [[TMP79]] = add <4 x i32> [[VEC_PHI3]], [[PREDPHI9]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; CHECK-NEXT:    [[TMP84:%.*]] = icmp eq i64 [[INDEX_NEXT]], 3072
-; CHECK-NEXT:    br i1 [[TMP84]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK-NEXT:    [[TMP80:%.*]] = icmp eq i64 [[INDEX_NEXT]], 3072
+; CHECK-NEXT:    br i1 [[TMP80]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[BIN_RDX:%.*]] = add <4 x i32> [[TMP81]], [[TMP80]]
-; CHECK-NEXT:    [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP82]], [[BIN_RDX]]
-; CHECK-NEXT:    [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP83]], [[BIN_RDX10]]
-; CHECK-NEXT:    [[TMP85:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]])
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = add <4 x i32> [[TMP77]], [[TMP76]]
+; CHECK-NEXT:    [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP78]], [[BIN_RDX]]
+; CHECK-NEXT:    [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP79]], [[BIN_RDX10]]
+; CHECK-NEXT:    [[TMP81:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]])
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 3072, 3072
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 1024, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP85]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP81]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ]
 ; CHECK-NEXT:    [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[LATCH]] ]
 ; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT:    [[TEST_ADDR:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[IV]]
-; CHECK-NEXT:    [[EARLYCND:%.*]] = load i1, i1* [[TEST_ADDR]], align 1
+; CHECK-NEXT:    [[TEST_ADDR:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[IV]]
+; CHECK-NEXT:    [[EARLYCND:%.*]] = load i1, ptr [[TEST_ADDR]], align 1
 ; CHECK-NEXT:    br i1 [[EARLYCND]], label [[PRED:%.*]], label [[LATCH]]
 ; CHECK:       pred:
-; CHECK-NEXT:    [[ADDR:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[IV]]
-; CHECK-NEXT:    [[VAL:%.*]] = load i32, i32* [[ADDR]], align 4
+; CHECK-NEXT:    [[ADDR:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[IV]]
+; CHECK-NEXT:    [[VAL:%.*]] = load i32, ptr [[ADDR]], align 4
 ; CHECK-NEXT:    br label [[LATCH]]
 ; CHECK:       latch:
 ; CHECK-NEXT:    [[VAL_PHI:%.*]] = phi i32 [ 0, [[LOOP]] ], [ [[VAL]], [[PRED]] ]
@@ -1138,24 +1075,23 @@ define i32 @test_non_zero_start(i64 %len, i1* %test_base) {
 ; CHECK-NEXT:    [[EXIT:%.*]] = icmp ugt i64 [[IV]], 4094
 ; CHECK-NEXT:    br i1 [[EXIT]], label [[LOOP_EXIT]], label [[LOOP]], !llvm.loop [[LOOP13:![0-9]+]]
 ; CHECK:       loop_exit:
-; CHECK-NEXT:    [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP85]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP81]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    ret i32 [[ACCUM_NEXT_LCSSA]]
 ;
 entry:
   %alloca = alloca [4096 x i32]
-  %base = bitcast [4096 x i32]* %alloca to i32*
-  call void @init(i32* %base)
+  call void @init(ptr %alloca)
   br label %loop
 loop:
   %iv = phi i64 [ 1024, %entry ], [ %iv.next, %latch ]
   %accum = phi i32 [ 0, %entry ], [ %accum.next, %latch ]
   %iv.next = add i64 %iv, 1
-  %test_addr = getelementptr inbounds i1, i1* %test_base, i64 %iv
-  %earlycnd = load i1, i1* %test_addr
+  %test_addr = getelementptr inbounds i1, ptr %test_base, i64 %iv
+  %earlycnd = load i1, ptr %test_addr
   br i1 %earlycnd, label %pred, label %latch
 pred:
-  %addr = getelementptr inbounds i32, i32* %base, i64 %iv
-  %val = load i32, i32* %addr
+  %addr = getelementptr inbounds i32, ptr %alloca, i64 %iv
+  %val = load i32, ptr %addr
   br label %latch
 latch:
   %val.phi = phi i32 [0, %loop], [%val, %pred]
@@ -1167,23 +1103,22 @@ loop_exit:
   ret i32 %accum.next
 }
 
-define i32 @neg_out_of_bounds_start(i64 %len, i1* %test_base) {
+define i32 @neg_out_of_bounds_start(i64 %len, ptr %test_base) {
 ; CHECK-LABEL: @neg_out_of_bounds_start(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[ALLOCA:%.*]] = alloca [4096 x i32], align 4
-; CHECK-NEXT:    [[BASE:%.*]] = bitcast [4096 x i32]* [[ALLOCA]] to i32*
-; CHECK-NEXT:    call void @init(i32* [[BASE]])
+; CHECK-NEXT:    call void @init(ptr [[ALLOCA]])
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ -10, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ]
 ; CHECK-NEXT:    [[ACCUM:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[ACCUM_NEXT:%.*]], [[LATCH]] ]
 ; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT:    [[TEST_ADDR:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE:%.*]], i64 [[IV]]
-; CHECK-NEXT:    [[EARLYCND:%.*]] = load i1, i1* [[TEST_ADDR]], align 1
+; CHECK-NEXT:    [[TEST_ADDR:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE:%.*]], i64 [[IV]]
+; CHECK-NEXT:    [[EARLYCND:%.*]] = load i1, ptr [[TEST_ADDR]], align 1
 ; CHECK-NEXT:    br i1 [[EARLYCND]], label [[PRED:%.*]], label [[LATCH]]
 ; CHECK:       pred:
-; CHECK-NEXT:    [[ADDR:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[IV]]
-; CHECK-NEXT:    [[VAL:%.*]] = load i32, i32* [[ADDR]], align 4
+; CHECK-NEXT:    [[ADDR:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[IV]]
+; CHECK-NEXT:    [[VAL:%.*]] = load i32, ptr [[ADDR]], align 4
 ; CHECK-NEXT:    br label [[LATCH]]
 ; CHECK:       latch:
 ; CHECK-NEXT:    [[VAL_PHI:%.*]] = phi i32 [ 0, [[LOOP]] ], [ [[VAL]], [[PRED]] ]
@@ -1196,19 +1131,18 @@ define i32 @neg_out_of_bounds_start(i64 %len, i1* %test_base) {
 ;
 entry:
   %alloca = alloca [4096 x i32]
-  %base = bitcast [4096 x i32]* %alloca to i32*
-  call void @init(i32* %base)
+  call void @init(ptr %alloca)
   br label %loop
 loop:
   %iv = phi i64 [ -10, %entry ], [ %iv.next, %latch ]
   %accum = phi i32 [ 0, %entry ], [ %accum.next, %latch ]
   %iv.next = add i64 %iv, 1
-  %test_addr = getelementptr inbounds i1, i1* %test_base, i64 %iv
-  %earlycnd = load i1, i1* %test_addr
+  %test_addr = getelementptr inbounds i1, ptr %test_base, i64 %iv
+  %earlycnd = load i1, ptr %test_addr
   br i1 %earlycnd, label %pred, label %latch
 pred:
-  %addr = getelementptr inbounds i32, i32* %base, i64 %iv
-  %val = load i32, i32* %addr
+  %addr = getelementptr inbounds i32, ptr %alloca, i64 %iv
+  %val = load i32, ptr %addr
   br label %latch
 latch:
   %val.phi = phi i32 [0, %loop], [%val, %pred]
@@ -1222,12 +1156,11 @@ loop_exit:
 
 
 ;; TODO: handle non-unit strides
-define i32 @test_non_unit_stride(i64 %len, i1* %test_base) {
+define i32 @test_non_unit_stride(i64 %len, ptr %test_base) {
 ; CHECK-LABEL: @test_non_unit_stride(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[ALLOCA:%.*]] = alloca [4096 x i32], align 4
-; CHECK-NEXT:    [[BASE:%.*]] = bitcast [4096 x i32]* [[ALLOCA]] to i32*
-; CHECK-NEXT:    call void @init(i32* [[BASE]])
+; CHECK-NEXT:    call void @init(ptr [[ALLOCA]])
 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
@@ -1254,50 +1187,50 @@ define i32 @test_non_unit_stride(i64 %len, i1* %test_base) {
 ; CHECK-NEXT:    [[TMP13:%.*]] = add i64 [[OFFSET_IDX]], 26
 ; CHECK-NEXT:    [[TMP14:%.*]] = add i64 [[OFFSET_IDX]], 28
 ; CHECK-NEXT:    [[TMP15:%.*]] = add i64 [[OFFSET_IDX]], 30
-; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP2]]
-; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP3]]
-; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP5]]
-; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP6]]
-; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP7]]
-; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP8]]
-; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP9]]
-; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP10]]
-; CHECK-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP11]]
-; CHECK-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP12]]
-; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP13]]
-; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP14]]
-; CHECK-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP15]]
-; CHECK-NEXT:    [[TMP32:%.*]] = load i1, i1* [[TMP16]], align 1
-; CHECK-NEXT:    [[TMP33:%.*]] = load i1, i1* [[TMP17]], align 1
-; CHECK-NEXT:    [[TMP34:%.*]] = load i1, i1* [[TMP18]], align 1
-; CHECK-NEXT:    [[TMP35:%.*]] = load i1, i1* [[TMP19]], align 1
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP2]]
+; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP5]]
+; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP6]]
+; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP7]]
+; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP8]]
+; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP9]]
+; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP10]]
+; CHECK-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP11]]
+; CHECK-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP12]]
+; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP13]]
+; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP14]]
+; CHECK-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP15]]
+; CHECK-NEXT:    [[TMP32:%.*]] = load i1, ptr [[TMP16]], align 1
+; CHECK-NEXT:    [[TMP33:%.*]] = load i1, ptr [[TMP17]], align 1
+; CHECK-NEXT:    [[TMP34:%.*]] = load i1, ptr [[TMP18]], align 1
+; CHECK-NEXT:    [[TMP35:%.*]] = load i1, ptr [[TMP19]], align 1
 ; CHECK-NEXT:    [[TMP36:%.*]] = insertelement <4 x i1> poison, i1 [[TMP32]], i32 0
 ; CHECK-NEXT:    [[TMP37:%.*]] = insertelement <4 x i1> [[TMP36]], i1 [[TMP33]], i32 1
 ; CHECK-NEXT:    [[TMP38:%.*]] = insertelement <4 x i1> [[TMP37]], i1 [[TMP34]], i32 2
 ; CHECK-NEXT:    [[TMP39:%.*]] = insertelement <4 x i1> [[TMP38]], i1 [[TMP35]], i32 3
-; CHECK-NEXT:    [[TMP40:%.*]] = load i1, i1* [[TMP20]], align 1
-; CHECK-NEXT:    [[TMP41:%.*]] = load i1, i1* [[TMP21]], align 1
-; CHECK-NEXT:    [[TMP42:%.*]] = load i1, i1* [[TMP22]], align 1
-; CHECK-NEXT:    [[TMP43:%.*]] = load i1, i1* [[TMP23]], align 1
+; CHECK-NEXT:    [[TMP40:%.*]] = load i1, ptr [[TMP20]], align 1
+; CHECK-NEXT:    [[TMP41:%.*]] = load i1, ptr [[TMP21]], align 1
+; CHECK-NEXT:    [[TMP42:%.*]] = load i1, ptr [[TMP22]], align 1
+; CHECK-NEXT:    [[TMP43:%.*]] = load i1, ptr [[TMP23]], align 1
 ; CHECK-NEXT:    [[TMP44:%.*]] = insertelement <4 x i1> poison, i1 [[TMP40]], i32 0
 ; CHECK-NEXT:    [[TMP45:%.*]] = insertelement <4 x i1> [[TMP44]], i1 [[TMP41]], i32 1
 ; CHECK-NEXT:    [[TMP46:%.*]] = insertelement <4 x i1> [[TMP45]], i1 [[TMP42]], i32 2
 ; CHECK-NEXT:    [[TMP47:%.*]] = insertelement <4 x i1> [[TMP46]], i1 [[TMP43]], i32 3
-; CHECK-NEXT:    [[TMP48:%.*]] = load i1, i1* [[TMP24]], align 1
-; CHECK-NEXT:    [[TMP49:%.*]] = load i1, i1* [[TMP25]], align 1
-; CHECK-NEXT:    [[TMP50:%.*]] = load i1, i1* [[TMP26]], align 1
-; CHECK-NEXT:    [[TMP51:%.*]] = load i1, i1* [[TMP27]], align 1
+; CHECK-NEXT:    [[TMP48:%.*]] = load i1, ptr [[TMP24]], align 1
+; CHECK-NEXT:    [[TMP49:%.*]] = load i1, ptr [[TMP25]], align 1
+; CHECK-NEXT:    [[TMP50:%.*]] = load i1, ptr [[TMP26]], align 1
+; CHECK-NEXT:    [[TMP51:%.*]] = load i1, ptr [[TMP27]], align 1
 ; CHECK-NEXT:    [[TMP52:%.*]] = insertelement <4 x i1> poison, i1 [[TMP48]], i32 0
 ; CHECK-NEXT:    [[TMP53:%.*]] = insertelement <4 x i1> [[TMP52]], i1 [[TMP49]], i32 1
 ; CHECK-NEXT:    [[TMP54:%.*]] = insertelement <4 x i1> [[TMP53]], i1 [[TMP50]], i32 2
 ; CHECK-NEXT:    [[TMP55:%.*]] = insertelement <4 x i1> [[TMP54]], i1 [[TMP51]], i32 3
-; CHECK-NEXT:    [[TMP56:%.*]] = load i1, i1* [[TMP28]], align 1
-; CHECK-NEXT:    [[TMP57:%.*]] = load i1, i1* [[TMP29]], align 1
-; CHECK-NEXT:    [[TMP58:%.*]] = load i1, i1* [[TMP30]], align 1
-; CHECK-NEXT:    [[TMP59:%.*]] = load i1, i1* [[TMP31]], align 1
+; CHECK-NEXT:    [[TMP56:%.*]] = load i1, ptr [[TMP28]], align 1
+; CHECK-NEXT:    [[TMP57:%.*]] = load i1, ptr [[TMP29]], align 1
+; CHECK-NEXT:    [[TMP58:%.*]] = load i1, ptr [[TMP30]], align 1
+; CHECK-NEXT:    [[TMP59:%.*]] = load i1, ptr [[TMP31]], align 1
 ; CHECK-NEXT:    [[TMP60:%.*]] = insertelement <4 x i1> poison, i1 [[TMP56]], i32 0
 ; CHECK-NEXT:    [[TMP61:%.*]] = insertelement <4 x i1> [[TMP60]], i1 [[TMP57]], i32 1
 ; CHECK-NEXT:    [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 2
@@ -1305,8 +1238,8 @@ define i32 @test_non_unit_stride(i64 %len, i1* %test_base) {
 ; CHECK-NEXT:    [[TMP64:%.*]] = extractelement <4 x i1> [[TMP39]], i32 0
 ; CHECK-NEXT:    br i1 [[TMP64]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
 ; CHECK:       pred.load.if:
-; CHECK-NEXT:    [[TMP65:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP66:%.*]] = load i32, i32* [[TMP65]], align 4
+; CHECK-NEXT:    [[TMP65:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP66:%.*]] = load i32, ptr [[TMP65]], align 4
 ; CHECK-NEXT:    [[TMP67:%.*]] = insertelement <4 x i32> poison, i32 [[TMP66]], i32 0
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE]]
 ; CHECK:       pred.load.continue:
@@ -1314,8 +1247,8 @@ define i32 @test_non_unit_stride(i64 %len, i1* %test_base) {
 ; CHECK-NEXT:    [[TMP69:%.*]] = extractelement <4 x i1> [[TMP39]], i32 1
 ; CHECK-NEXT:    br i1 [[TMP69]], label [[PRED_LOAD_IF4:%.*]], label [[PRED_LOAD_CONTINUE5:%.*]]
 ; CHECK:       pred.load.if4:
-; CHECK-NEXT:    [[TMP70:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP71:%.*]] = load i32, i32* [[TMP70]], align 4
+; CHECK-NEXT:    [[TMP70:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP71:%.*]] = load i32, ptr [[TMP70]], align 4
 ; CHECK-NEXT:    [[TMP72:%.*]] = insertelement <4 x i32> [[TMP68]], i32 [[TMP71]], i32 1
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE5]]
 ; CHECK:       pred.load.continue5:
@@ -1323,8 +1256,8 @@ define i32 @test_non_unit_stride(i64 %len, i1* %test_base) {
 ; CHECK-NEXT:    [[TMP74:%.*]] = extractelement <4 x i1> [[TMP39]], i32 2
 ; CHECK-NEXT:    br i1 [[TMP74]], label [[PRED_LOAD_IF6:%.*]], label [[PRED_LOAD_CONTINUE7:%.*]]
 ; CHECK:       pred.load.if6:
-; CHECK-NEXT:    [[TMP75:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP2]]
-; CHECK-NEXT:    [[TMP76:%.*]] = load i32, i32* [[TMP75]], align 4
+; CHECK-NEXT:    [[TMP75:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP2]]
+; CHECK-NEXT:    [[TMP76:%.*]] = load i32, ptr [[TMP75]], align 4
 ; CHECK-NEXT:    [[TMP77:%.*]] = insertelement <4 x i32> [[TMP73]], i32 [[TMP76]], i32 2
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE7]]
 ; CHECK:       pred.load.continue7:
@@ -1332,8 +1265,8 @@ define i32 @test_non_unit_stride(i64 %len, i1* %test_base) {
 ; CHECK-NEXT:    [[TMP79:%.*]] = extractelement <4 x i1> [[TMP39]], i32 3
 ; CHECK-NEXT:    br i1 [[TMP79]], label [[PRED_LOAD_IF8:%.*]], label [[PRED_LOAD_CONTINUE9:%.*]]
 ; CHECK:       pred.load.if8:
-; CHECK-NEXT:    [[TMP80:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP3]]
-; CHECK-NEXT:    [[TMP81:%.*]] = load i32, i32* [[TMP80]], align 4
+; CHECK-NEXT:    [[TMP80:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP81:%.*]] = load i32, ptr [[TMP80]], align 4
 ; CHECK-NEXT:    [[TMP82:%.*]] = insertelement <4 x i32> [[TMP78]], i32 [[TMP81]], i32 3
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE9]]
 ; CHECK:       pred.load.continue9:
@@ -1341,8 +1274,8 @@ define i32 @test_non_unit_stride(i64 %len, i1* %test_base) {
 ; CHECK-NEXT:    [[TMP84:%.*]] = extractelement <4 x i1> [[TMP47]], i32 0
 ; CHECK-NEXT:    br i1 [[TMP84]], label [[PRED_LOAD_IF10:%.*]], label [[PRED_LOAD_CONTINUE11:%.*]]
 ; CHECK:       pred.load.if10:
-; CHECK-NEXT:    [[TMP85:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP86:%.*]] = load i32, i32* [[TMP85]], align 4
+; CHECK-NEXT:    [[TMP85:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP86:%.*]] = load i32, ptr [[TMP85]], align 4
 ; CHECK-NEXT:    [[TMP87:%.*]] = insertelement <4 x i32> poison, i32 [[TMP86]], i32 0
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE11]]
 ; CHECK:       pred.load.continue11:
@@ -1350,8 +1283,8 @@ define i32 @test_non_unit_stride(i64 %len, i1* %test_base) {
 ; CHECK-NEXT:    [[TMP89:%.*]] = extractelement <4 x i1> [[TMP47]], i32 1
 ; CHECK-NEXT:    br i1 [[TMP89]], label [[PRED_LOAD_IF12:%.*]], label [[PRED_LOAD_CONTINUE13:%.*]]
 ; CHECK:       pred.load.if12:
-; CHECK-NEXT:    [[TMP90:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP5]]
-; CHECK-NEXT:    [[TMP91:%.*]] = load i32, i32* [[TMP90]], align 4
+; CHECK-NEXT:    [[TMP90:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP5]]
+; CHECK-NEXT:    [[TMP91:%.*]] = load i32, ptr [[TMP90]], align 4
 ; CHECK-NEXT:    [[TMP92:%.*]] = insertelement <4 x i32> [[TMP88]], i32 [[TMP91]], i32 1
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE13]]
 ; CHECK:       pred.load.continue13:
@@ -1359,8 +1292,8 @@ define i32 @test_non_unit_stride(i64 %len, i1* %test_base) {
 ; CHECK-NEXT:    [[TMP94:%.*]] = extractelement <4 x i1> [[TMP47]], i32 2
 ; CHECK-NEXT:    br i1 [[TMP94]], label [[PRED_LOAD_IF14:%.*]], label [[PRED_LOAD_CONTINUE15:%.*]]
 ; CHECK:       pred.load.if14:
-; CHECK-NEXT:    [[TMP95:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP6]]
-; CHECK-NEXT:    [[TMP96:%.*]] = load i32, i32* [[TMP95]], align 4
+; CHECK-NEXT:    [[TMP95:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP6]]
+; CHECK-NEXT:    [[TMP96:%.*]] = load i32, ptr [[TMP95]], align 4
 ; CHECK-NEXT:    [[TMP97:%.*]] = insertelement <4 x i32> [[TMP93]], i32 [[TMP96]], i32 2
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE15]]
 ; CHECK:       pred.load.continue15:
@@ -1368,8 +1301,8 @@ define i32 @test_non_unit_stride(i64 %len, i1* %test_base) {
 ; CHECK-NEXT:    [[TMP99:%.*]] = extractelement <4 x i1> [[TMP47]], i32 3
 ; CHECK-NEXT:    br i1 [[TMP99]], label [[PRED_LOAD_IF16:%.*]], label [[PRED_LOAD_CONTINUE17:%.*]]
 ; CHECK:       pred.load.if16:
-; CHECK-NEXT:    [[TMP100:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP7]]
-; CHECK-NEXT:    [[TMP101:%.*]] = load i32, i32* [[TMP100]], align 4
+; CHECK-NEXT:    [[TMP100:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP7]]
+; CHECK-NEXT:    [[TMP101:%.*]] = load i32, ptr [[TMP100]], align 4
 ; CHECK-NEXT:    [[TMP102:%.*]] = insertelement <4 x i32> [[TMP98]], i32 [[TMP101]], i32 3
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE17]]
 ; CHECK:       pred.load.continue17:
@@ -1377,8 +1310,8 @@ define i32 @test_non_unit_stride(i64 %len, i1* %test_base) {
 ; CHECK-NEXT:    [[TMP104:%.*]] = extractelement <4 x i1> [[TMP55]], i32 0
 ; CHECK-NEXT:    br i1 [[TMP104]], label [[PRED_LOAD_IF18:%.*]], label [[PRED_LOAD_CONTINUE19:%.*]]
 ; CHECK:       pred.load.if18:
-; CHECK-NEXT:    [[TMP105:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP8]]
-; CHECK-NEXT:    [[TMP106:%.*]] = load i32, i32* [[TMP105]], align 4
+; CHECK-NEXT:    [[TMP105:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP8]]
+; CHECK-NEXT:    [[TMP106:%.*]] = load i32, ptr [[TMP105]], align 4
 ; CHECK-NEXT:    [[TMP107:%.*]] = insertelement <4 x i32> poison, i32 [[TMP106]], i32 0
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE19]]
 ; CHECK:       pred.load.continue19:
@@ -1386,8 +1319,8 @@ define i32 @test_non_unit_stride(i64 %len, i1* %test_base) {
 ; CHECK-NEXT:    [[TMP109:%.*]] = extractelement <4 x i1> [[TMP55]], i32 1
 ; CHECK-NEXT:    br i1 [[TMP109]], label [[PRED_LOAD_IF20:%.*]], label [[PRED_LOAD_CONTINUE21:%.*]]
 ; CHECK:       pred.load.if20:
-; CHECK-NEXT:    [[TMP110:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP9]]
-; CHECK-NEXT:    [[TMP111:%.*]] = load i32, i32* [[TMP110]], align 4
+; CHECK-NEXT:    [[TMP110:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP9]]
+; CHECK-NEXT:    [[TMP111:%.*]] = load i32, ptr [[TMP110]], align 4
 ; CHECK-NEXT:    [[TMP112:%.*]] = insertelement <4 x i32> [[TMP108]], i32 [[TMP111]], i32 1
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE21]]
 ; CHECK:       pred.load.continue21:
@@ -1395,8 +1328,8 @@ define i32 @test_non_unit_stride(i64 %len, i1* %test_base) {
 ; CHECK-NEXT:    [[TMP114:%.*]] = extractelement <4 x i1> [[TMP55]], i32 2
 ; CHECK-NEXT:    br i1 [[TMP114]], label [[PRED_LOAD_IF22:%.*]], label [[PRED_LOAD_CONTINUE23:%.*]]
 ; CHECK:       pred.load.if22:
-; CHECK-NEXT:    [[TMP115:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP10]]
-; CHECK-NEXT:    [[TMP116:%.*]] = load i32, i32* [[TMP115]], align 4
+; CHECK-NEXT:    [[TMP115:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP10]]
+; CHECK-NEXT:    [[TMP116:%.*]] = load i32, ptr [[TMP115]], align 4
 ; CHECK-NEXT:    [[TMP117:%.*]] = insertelement <4 x i32> [[TMP113]], i32 [[TMP116]], i32 2
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE23]]
 ; CHECK:       pred.load.continue23:
@@ -1404,8 +1337,8 @@ define i32 @test_non_unit_stride(i64 %len, i1* %test_base) {
 ; CHECK-NEXT:    [[TMP119:%.*]] = extractelement <4 x i1> [[TMP55]], i32 3
 ; CHECK-NEXT:    br i1 [[TMP119]], label [[PRED_LOAD_IF24:%.*]], label [[PRED_LOAD_CONTINUE25:%.*]]
 ; CHECK:       pred.load.if24:
-; CHECK-NEXT:    [[TMP120:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP11]]
-; CHECK-NEXT:    [[TMP121:%.*]] = load i32, i32* [[TMP120]], align 4
+; CHECK-NEXT:    [[TMP120:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP11]]
+; CHECK-NEXT:    [[TMP121:%.*]] = load i32, ptr [[TMP120]], align 4
 ; CHECK-NEXT:    [[TMP122:%.*]] = insertelement <4 x i32> [[TMP118]], i32 [[TMP121]], i32 3
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE25]]
 ; CHECK:       pred.load.continue25:
@@ -1413,8 +1346,8 @@ define i32 @test_non_unit_stride(i64 %len, i1* %test_base) {
 ; CHECK-NEXT:    [[TMP124:%.*]] = extractelement <4 x i1> [[TMP63]], i32 0
 ; CHECK-NEXT:    br i1 [[TMP124]], label [[PRED_LOAD_IF26:%.*]], label [[PRED_LOAD_CONTINUE27:%.*]]
 ; CHECK:       pred.load.if26:
-; CHECK-NEXT:    [[TMP125:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP12]]
-; CHECK-NEXT:    [[TMP126:%.*]] = load i32, i32* [[TMP125]], align 4
+; CHECK-NEXT:    [[TMP125:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP12]]
+; CHECK-NEXT:    [[TMP126:%.*]] = load i32, ptr [[TMP125]], align 4
 ; CHECK-NEXT:    [[TMP127:%.*]] = insertelement <4 x i32> poison, i32 [[TMP126]], i32 0
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE27]]
 ; CHECK:       pred.load.continue27:
@@ -1422,8 +1355,8 @@ define i32 @test_non_unit_stride(i64 %len, i1* %test_base) {
 ; CHECK-NEXT:    [[TMP129:%.*]] = extractelement <4 x i1> [[TMP63]], i32 1
 ; CHECK-NEXT:    br i1 [[TMP129]], label [[PRED_LOAD_IF28:%.*]], label [[PRED_LOAD_CONTINUE29:%.*]]
 ; CHECK:       pred.load.if28:
-; CHECK-NEXT:    [[TMP130:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP13]]
-; CHECK-NEXT:    [[TMP131:%.*]] = load i32, i32* [[TMP130]], align 4
+; CHECK-NEXT:    [[TMP130:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP13]]
+; CHECK-NEXT:    [[TMP131:%.*]] = load i32, ptr [[TMP130]], align 4
 ; CHECK-NEXT:    [[TMP132:%.*]] = insertelement <4 x i32> [[TMP128]], i32 [[TMP131]], i32 1
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE29]]
 ; CHECK:       pred.load.continue29:
@@ -1431,8 +1364,8 @@ define i32 @test_non_unit_stride(i64 %len, i1* %test_base) {
 ; CHECK-NEXT:    [[TMP134:%.*]] = extractelement <4 x i1> [[TMP63]], i32 2
 ; CHECK-NEXT:    br i1 [[TMP134]], label [[PRED_LOAD_IF30:%.*]], label [[PRED_LOAD_CONTINUE31:%.*]]
 ; CHECK:       pred.load.if30:
-; CHECK-NEXT:    [[TMP135:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP14]]
-; CHECK-NEXT:    [[TMP136:%.*]] = load i32, i32* [[TMP135]], align 4
+; CHECK-NEXT:    [[TMP135:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP14]]
+; CHECK-NEXT:    [[TMP136:%.*]] = load i32, ptr [[TMP135]], align 4
 ; CHECK-NEXT:    [[TMP137:%.*]] = insertelement <4 x i32> [[TMP133]], i32 [[TMP136]], i32 2
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE31]]
 ; CHECK:       pred.load.continue31:
@@ -1440,8 +1373,8 @@ define i32 @test_non_unit_stride(i64 %len, i1* %test_base) {
 ; CHECK-NEXT:    [[TMP139:%.*]] = extractelement <4 x i1> [[TMP63]], i32 3
 ; CHECK-NEXT:    br i1 [[TMP139]], label [[PRED_LOAD_IF32:%.*]], label [[PRED_LOAD_CONTINUE33]]
 ; CHECK:       pred.load.if32:
-; CHECK-NEXT:    [[TMP140:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP15]]
-; CHECK-NEXT:    [[TMP141:%.*]] = load i32, i32* [[TMP140]], align 4
+; CHECK-NEXT:    [[TMP140:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP15]]
+; CHECK-NEXT:    [[TMP141:%.*]] = load i32, ptr [[TMP140]], align 4
 ; CHECK-NEXT:    [[TMP142:%.*]] = insertelement <4 x i32> [[TMP138]], i32 [[TMP141]], i32 3
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE33]]
 ; CHECK:       pred.load.continue33:
@@ -1476,12 +1409,12 @@ define i32 @test_non_unit_stride(i64 %len, i1* %test_base) {
 ; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ]
 ; CHECK-NEXT:    [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[LATCH]] ]
 ; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 2
-; CHECK-NEXT:    [[TEST_ADDR:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[IV]]
-; CHECK-NEXT:    [[EARLYCND:%.*]] = load i1, i1* [[TEST_ADDR]], align 1
+; CHECK-NEXT:    [[TEST_ADDR:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[IV]]
+; CHECK-NEXT:    [[EARLYCND:%.*]] = load i1, ptr [[TEST_ADDR]], align 1
 ; CHECK-NEXT:    br i1 [[EARLYCND]], label [[PRED:%.*]], label [[LATCH]]
 ; CHECK:       pred:
-; CHECK-NEXT:    [[ADDR:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[IV]]
-; CHECK-NEXT:    [[VAL:%.*]] = load i32, i32* [[ADDR]], align 4
+; CHECK-NEXT:    [[ADDR:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[IV]]
+; CHECK-NEXT:    [[VAL:%.*]] = load i32, ptr [[ADDR]], align 4
 ; CHECK-NEXT:    br label [[LATCH]]
 ; CHECK:       latch:
 ; CHECK-NEXT:    [[VAL_PHI:%.*]] = phi i32 [ 0, [[LOOP]] ], [ [[VAL]], [[PRED]] ]
@@ -1494,19 +1427,18 @@ define i32 @test_non_unit_stride(i64 %len, i1* %test_base) {
 ;
 entry:
   %alloca = alloca [4096 x i32]
-  %base = bitcast [4096 x i32]* %alloca to i32*
-  call void @init(i32* %base)
+  call void @init(ptr %alloca)
   br label %loop
 loop:
   %iv = phi i64 [ 0, %entry ], [ %iv.next, %latch ]
   %accum = phi i32 [ 0, %entry ], [ %accum.next, %latch ]
   %iv.next = add i64 %iv, 2
-  %test_addr = getelementptr inbounds i1, i1* %test_base, i64 %iv
-  %earlycnd = load i1, i1* %test_addr
+  %test_addr = getelementptr inbounds i1, ptr %test_base, i64 %iv
+  %earlycnd = load i1, ptr %test_addr
   br i1 %earlycnd, label %pred, label %latch
 pred:
-  %addr = getelementptr inbounds i32, i32* %base, i64 %iv
-  %val = load i32, i32* %addr
+  %addr = getelementptr inbounds i32, ptr %alloca, i64 %iv
+  %val = load i32, ptr %addr
   br label %latch
 latch:
   %val.phi = phi i32 [0, %loop], [%val, %pred]
@@ -1518,21 +1450,20 @@ loop_exit:
   ret i32 %accum.next
 }
 
-define i32 @neg_off_by_many(i64 %len, i1* %test_base) {
+define i32 @neg_off_by_many(i64 %len, ptr %test_base) {
 ; CHECK-LABEL: @neg_off_by_many(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[ALLOCA:%.*]] = alloca [1024 x i32], align 4
-; CHECK-NEXT:    [[BASE:%.*]] = bitcast [1024 x i32]* [[ALLOCA]] to i32*
-; CHECK-NEXT:    call void @init(i32* [[BASE]])
+; CHECK-NEXT:    call void @init(ptr [[ALLOCA]])
 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP80:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP81:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP82:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP83:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP76:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP77:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP78:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP79:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 2
@@ -1549,106 +1480,102 @@ define i32 @neg_off_by_many(i64 %len, i1* %test_base) {
 ; CHECK-NEXT:    [[TMP13:%.*]] = add i64 [[INDEX]], 13
 ; CHECK-NEXT:    [[TMP14:%.*]] = add i64 [[INDEX]], 14
 ; CHECK-NEXT:    [[TMP15:%.*]] = add i64 [[INDEX]], 15
-; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP2]]
-; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP3]]
-; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP5]]
-; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP6]]
-; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP7]]
-; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP8]]
-; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP9]]
-; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP10]]
-; CHECK-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP11]]
-; CHECK-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP12]]
-; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP13]]
-; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP14]]
-; CHECK-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP15]]
-; CHECK-NEXT:    [[TMP32:%.*]] = load i1, i1* [[TMP16]], align 1
-; CHECK-NEXT:    [[TMP33:%.*]] = load i1, i1* [[TMP17]], align 1
-; CHECK-NEXT:    [[TMP34:%.*]] = load i1, i1* [[TMP18]], align 1
-; CHECK-NEXT:    [[TMP35:%.*]] = load i1, i1* [[TMP19]], align 1
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP2]]
+; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP5]]
+; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP6]]
+; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP7]]
+; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP8]]
+; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP9]]
+; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP10]]
+; CHECK-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP11]]
+; CHECK-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP12]]
+; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP13]]
+; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP14]]
+; CHECK-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP15]]
+; CHECK-NEXT:    [[TMP32:%.*]] = load i1, ptr [[TMP16]], align 1
+; CHECK-NEXT:    [[TMP33:%.*]] = load i1, ptr [[TMP17]], align 1
+; CHECK-NEXT:    [[TMP34:%.*]] = load i1, ptr [[TMP18]], align 1
+; CHECK-NEXT:    [[TMP35:%.*]] = load i1, ptr [[TMP19]], align 1
 ; CHECK-NEXT:    [[TMP36:%.*]] = insertelement <4 x i1> poison, i1 [[TMP32]], i32 0
 ; CHECK-NEXT:    [[TMP37:%.*]] = insertelement <4 x i1> [[TMP36]], i1 [[TMP33]], i32 1
 ; CHECK-NEXT:    [[TMP38:%.*]] = insertelement <4 x i1> [[TMP37]], i1 [[TMP34]], i32 2
 ; CHECK-NEXT:    [[TMP39:%.*]] = insertelement <4 x i1> [[TMP38]], i1 [[TMP35]], i32 3
-; CHECK-NEXT:    [[TMP40:%.*]] = load i1, i1* [[TMP20]], align 1
-; CHECK-NEXT:    [[TMP41:%.*]] = load i1, i1* [[TMP21]], align 1
-; CHECK-NEXT:    [[TMP42:%.*]] = load i1, i1* [[TMP22]], align 1
-; CHECK-NEXT:    [[TMP43:%.*]] = load i1, i1* [[TMP23]], align 1
+; CHECK-NEXT:    [[TMP40:%.*]] = load i1, ptr [[TMP20]], align 1
+; CHECK-NEXT:    [[TMP41:%.*]] = load i1, ptr [[TMP21]], align 1
+; CHECK-NEXT:    [[TMP42:%.*]] = load i1, ptr [[TMP22]], align 1
+; CHECK-NEXT:    [[TMP43:%.*]] = load i1, ptr [[TMP23]], align 1
 ; CHECK-NEXT:    [[TMP44:%.*]] = insertelement <4 x i1> poison, i1 [[TMP40]], i32 0
 ; CHECK-NEXT:    [[TMP45:%.*]] = insertelement <4 x i1> [[TMP44]], i1 [[TMP41]], i32 1
 ; CHECK-NEXT:    [[TMP46:%.*]] = insertelement <4 x i1> [[TMP45]], i1 [[TMP42]], i32 2
 ; CHECK-NEXT:    [[TMP47:%.*]] = insertelement <4 x i1> [[TMP46]], i1 [[TMP43]], i32 3
-; CHECK-NEXT:    [[TMP48:%.*]] = load i1, i1* [[TMP24]], align 1
-; CHECK-NEXT:    [[TMP49:%.*]] = load i1, i1* [[TMP25]], align 1
-; CHECK-NEXT:    [[TMP50:%.*]] = load i1, i1* [[TMP26]], align 1
-; CHECK-NEXT:    [[TMP51:%.*]] = load i1, i1* [[TMP27]], align 1
+; CHECK-NEXT:    [[TMP48:%.*]] = load i1, ptr [[TMP24]], align 1
+; CHECK-NEXT:    [[TMP49:%.*]] = load i1, ptr [[TMP25]], align 1
+; CHECK-NEXT:    [[TMP50:%.*]] = load i1, ptr [[TMP26]], align 1
+; CHECK-NEXT:    [[TMP51:%.*]] = load i1, ptr [[TMP27]], align 1
 ; CHECK-NEXT:    [[TMP52:%.*]] = insertelement <4 x i1> poison, i1 [[TMP48]], i32 0
 ; CHECK-NEXT:    [[TMP53:%.*]] = insertelement <4 x i1> [[TMP52]], i1 [[TMP49]], i32 1
 ; CHECK-NEXT:    [[TMP54:%.*]] = insertelement <4 x i1> [[TMP53]], i1 [[TMP50]], i32 2
 ; CHECK-NEXT:    [[TMP55:%.*]] = insertelement <4 x i1> [[TMP54]], i1 [[TMP51]], i32 3
-; CHECK-NEXT:    [[TMP56:%.*]] = load i1, i1* [[TMP28]], align 1
-; CHECK-NEXT:    [[TMP57:%.*]] = load i1, i1* [[TMP29]], align 1
-; CHECK-NEXT:    [[TMP58:%.*]] = load i1, i1* [[TMP30]], align 1
-; CHECK-NEXT:    [[TMP59:%.*]] = load i1, i1* [[TMP31]], align 1
+; CHECK-NEXT:    [[TMP56:%.*]] = load i1, ptr [[TMP28]], align 1
+; CHECK-NEXT:    [[TMP57:%.*]] = load i1, ptr [[TMP29]], align 1
+; CHECK-NEXT:    [[TMP58:%.*]] = load i1, ptr [[TMP30]], align 1
+; CHECK-NEXT:    [[TMP59:%.*]] = load i1, ptr [[TMP31]], align 1
 ; CHECK-NEXT:    [[TMP60:%.*]] = insertelement <4 x i1> poison, i1 [[TMP56]], i32 0
 ; CHECK-NEXT:    [[TMP61:%.*]] = insertelement <4 x i1> [[TMP60]], i1 [[TMP57]], i32 1
 ; CHECK-NEXT:    [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 2
 ; CHECK-NEXT:    [[TMP63:%.*]] = insertelement <4 x i1> [[TMP62]], i1 [[TMP59]], i32 3
-; CHECK-NEXT:    [[TMP64:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP65:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP66:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP8]]
-; CHECK-NEXT:    [[TMP67:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP12]]
-; CHECK-NEXT:    [[TMP68:%.*]] = getelementptr i32, i32* [[TMP64]], i32 0
-; CHECK-NEXT:    [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP69]], i32 4, <4 x i1> [[TMP39]], <4 x i32> poison)
-; CHECK-NEXT:    [[TMP70:%.*]] = getelementptr i32, i32* [[TMP64]], i32 4
-; CHECK-NEXT:    [[TMP71:%.*]] = bitcast i32* [[TMP70]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP71]], i32 4, <4 x i1> [[TMP47]], <4 x i32> poison)
-; CHECK-NEXT:    [[TMP72:%.*]] = getelementptr i32, i32* [[TMP64]], i32 8
-; CHECK-NEXT:    [[TMP73:%.*]] = bitcast i32* [[TMP72]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP73]], i32 4, <4 x i1> [[TMP55]], <4 x i32> poison)
-; CHECK-NEXT:    [[TMP74:%.*]] = getelementptr i32, i32* [[TMP64]], i32 12
-; CHECK-NEXT:    [[TMP75:%.*]] = bitcast i32* [[TMP74]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP75]], i32 4, <4 x i1> [[TMP63]], <4 x i32> poison)
-; CHECK-NEXT:    [[TMP76:%.*]] = xor <4 x i1> [[TMP39]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT:    [[TMP77:%.*]] = xor <4 x i1> [[TMP47]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT:    [[TMP78:%.*]] = xor <4 x i1> [[TMP55]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT:    [[TMP79:%.*]] = xor <4 x i1> [[TMP63]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT:    [[TMP64:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP65:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP66:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP8]]
+; CHECK-NEXT:    [[TMP67:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP12]]
+; CHECK-NEXT:    [[TMP68:%.*]] = getelementptr i32, ptr [[TMP64]], i32 0
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP68]], i32 4, <4 x i1> [[TMP39]], <4 x i32> poison)
+; CHECK-NEXT:    [[TMP69:%.*]] = getelementptr i32, ptr [[TMP64]], i32 4
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP69]], i32 4, <4 x i1> [[TMP47]], <4 x i32> poison)
+; CHECK-NEXT:    [[TMP70:%.*]] = getelementptr i32, ptr [[TMP64]], i32 8
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP70]], i32 4, <4 x i1> [[TMP55]], <4 x i32> poison)
+; CHECK-NEXT:    [[TMP71:%.*]] = getelementptr i32, ptr [[TMP64]], i32 12
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP71]], i32 4, <4 x i1> [[TMP63]], <4 x i32> poison)
+; CHECK-NEXT:    [[TMP72:%.*]] = xor <4 x i1> [[TMP39]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT:    [[TMP73:%.*]] = xor <4 x i1> [[TMP47]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT:    [[TMP74:%.*]] = xor <4 x i1> [[TMP55]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT:    [[TMP75:%.*]] = xor <4 x i1> [[TMP63]], <i1 true, i1 true, i1 true, i1 true>
 ; CHECK-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[TMP39]], <4 x i32> [[WIDE_MASKED_LOAD]], <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[PREDPHI7:%.*]] = select <4 x i1> [[TMP47]], <4 x i32> [[WIDE_MASKED_LOAD4]], <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[PREDPHI8:%.*]] = select <4 x i1> [[TMP55]], <4 x i32> [[WIDE_MASKED_LOAD5]], <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[PREDPHI9:%.*]] = select <4 x i1> [[TMP63]], <4 x i32> [[WIDE_MASKED_LOAD6]], <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP80]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]]
-; CHECK-NEXT:    [[TMP81]] = add <4 x i32> [[VEC_PHI1]], [[PREDPHI7]]
-; CHECK-NEXT:    [[TMP82]] = add <4 x i32> [[VEC_PHI2]], [[PREDPHI8]]
-; CHECK-NEXT:    [[TMP83]] = add <4 x i32> [[VEC_PHI3]], [[PREDPHI9]]
+; CHECK-NEXT:    [[TMP76]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]]
+; CHECK-NEXT:    [[TMP77]] = add <4 x i32> [[VEC_PHI1]], [[PREDPHI7]]
+; CHECK-NEXT:    [[TMP78]] = add <4 x i32> [[VEC_PHI2]], [[PREDPHI8]]
+; CHECK-NEXT:    [[TMP79]] = add <4 x i32> [[VEC_PHI3]], [[PREDPHI9]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; CHECK-NEXT:    [[TMP84:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
-; CHECK-NEXT:    br i1 [[TMP84]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; CHECK-NEXT:    [[TMP80:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
+; CHECK-NEXT:    br i1 [[TMP80]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[BIN_RDX:%.*]] = add <4 x i32> [[TMP81]], [[TMP80]]
-; CHECK-NEXT:    [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP82]], [[BIN_RDX]]
-; CHECK-NEXT:    [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP83]], [[BIN_RDX10]]
-; CHECK-NEXT:    [[TMP85:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]])
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = add <4 x i32> [[TMP77]], [[TMP76]]
+; CHECK-NEXT:    [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP78]], [[BIN_RDX]]
+; CHECK-NEXT:    [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP79]], [[BIN_RDX10]]
+; CHECK-NEXT:    [[TMP81:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]])
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 4096, 4096
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP85]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP81]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ]
 ; CHECK-NEXT:    [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[LATCH]] ]
 ; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT:    [[TEST_ADDR:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[IV]]
-; CHECK-NEXT:    [[EARLYCND:%.*]] = load i1, i1* [[TEST_ADDR]], align 1
+; CHECK-NEXT:    [[TEST_ADDR:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[IV]]
+; CHECK-NEXT:    [[EARLYCND:%.*]] = load i1, ptr [[TEST_ADDR]], align 1
 ; CHECK-NEXT:    br i1 [[EARLYCND]], label [[PRED:%.*]], label [[LATCH]]
 ; CHECK:       pred:
-; CHECK-NEXT:    [[ADDR:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[IV]]
-; CHECK-NEXT:    [[VAL:%.*]] = load i32, i32* [[ADDR]], align 4
+; CHECK-NEXT:    [[ADDR:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[IV]]
+; CHECK-NEXT:    [[VAL:%.*]] = load i32, ptr [[ADDR]], align 4
 ; CHECK-NEXT:    br label [[LATCH]]
 ; CHECK:       latch:
 ; CHECK-NEXT:    [[VAL_PHI:%.*]] = phi i32 [ 0, [[LOOP]] ], [ [[VAL]], [[PRED]] ]
@@ -1656,24 +1583,23 @@ define i32 @neg_off_by_many(i64 %len, i1* %test_base) {
 ; CHECK-NEXT:    [[EXIT:%.*]] = icmp ugt i64 [[IV]], 4094
 ; CHECK-NEXT:    br i1 [[EXIT]], label [[LOOP_EXIT]], label [[LOOP]], !llvm.loop [[LOOP17:![0-9]+]]
 ; CHECK:       loop_exit:
-; CHECK-NEXT:    [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP85]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP81]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    ret i32 [[ACCUM_NEXT_LCSSA]]
 ;
 entry:
   %alloca = alloca [1024 x i32]
-  %base = bitcast [1024 x i32]* %alloca to i32*
-  call void @init(i32* %base)
+  call void @init(ptr %alloca)
   br label %loop
 loop:
   %iv = phi i64 [ 0, %entry ], [ %iv.next, %latch ]
   %accum = phi i32 [ 0, %entry ], [ %accum.next, %latch ]
   %iv.next = add i64 %iv, 1
-  %test_addr = getelementptr inbounds i1, i1* %test_base, i64 %iv
-  %earlycnd = load i1, i1* %test_addr
+  %test_addr = getelementptr inbounds i1, ptr %test_base, i64 %iv
+  %earlycnd = load i1, ptr %test_addr
   br i1 %earlycnd, label %pred, label %latch
 pred:
-  %addr = getelementptr inbounds i32, i32* %base, i64 %iv
-  %val = load i32, i32* %addr
+  %addr = getelementptr inbounds i32, ptr %alloca, i64 %iv
+  %val = load i32, ptr %addr
   br label %latch
 latch:
   %val.phi = phi i32 [0, %loop], [%val, %pred]
@@ -1685,21 +1611,20 @@ loop_exit:
   ret i32 %accum.next
 }
 
-define i32 @neg_off_by_one_iteration(i64 %len, i1* %test_base) {
+define i32 @neg_off_by_one_iteration(i64 %len, ptr %test_base) {
 ; CHECK-LABEL: @neg_off_by_one_iteration(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[ALLOCA:%.*]] = alloca [4095 x i32], align 4
-; CHECK-NEXT:    [[BASE:%.*]] = bitcast [4095 x i32]* [[ALLOCA]] to i32*
-; CHECK-NEXT:    call void @init(i32* [[BASE]])
+; CHECK-NEXT:    call void @init(ptr [[ALLOCA]])
 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP80:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP81:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP82:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP83:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP76:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP77:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP78:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP79:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 2
@@ -1716,106 +1641,102 @@ define i32 @neg_off_by_one_iteration(i64 %len, i1* %test_base) {
 ; CHECK-NEXT:    [[TMP13:%.*]] = add i64 [[INDEX]], 13
 ; CHECK-NEXT:    [[TMP14:%.*]] = add i64 [[INDEX]], 14
 ; CHECK-NEXT:    [[TMP15:%.*]] = add i64 [[INDEX]], 15
-; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP2]]
-; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP3]]
-; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP5]]
-; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP6]]
-; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP7]]
-; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP8]]
-; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP9]]
-; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP10]]
-; CHECK-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP11]]
-; CHECK-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP12]]
-; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP13]]
-; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP14]]
-; CHECK-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP15]]
-; CHECK-NEXT:    [[TMP32:%.*]] = load i1, i1* [[TMP16]], align 1
-; CHECK-NEXT:    [[TMP33:%.*]] = load i1, i1* [[TMP17]], align 1
-; CHECK-NEXT:    [[TMP34:%.*]] = load i1, i1* [[TMP18]], align 1
-; CHECK-NEXT:    [[TMP35:%.*]] = load i1, i1* [[TMP19]], align 1
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP2]]
+; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP5]]
+; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP6]]
+; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP7]]
+; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP8]]
+; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP9]]
+; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP10]]
+; CHECK-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP11]]
+; CHECK-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP12]]
+; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP13]]
+; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP14]]
+; CHECK-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP15]]
+; CHECK-NEXT:    [[TMP32:%.*]] = load i1, ptr [[TMP16]], align 1
+; CHECK-NEXT:    [[TMP33:%.*]] = load i1, ptr [[TMP17]], align 1
+; CHECK-NEXT:    [[TMP34:%.*]] = load i1, ptr [[TMP18]], align 1
+; CHECK-NEXT:    [[TMP35:%.*]] = load i1, ptr [[TMP19]], align 1
 ; CHECK-NEXT:    [[TMP36:%.*]] = insertelement <4 x i1> poison, i1 [[TMP32]], i32 0
 ; CHECK-NEXT:    [[TMP37:%.*]] = insertelement <4 x i1> [[TMP36]], i1 [[TMP33]], i32 1
 ; CHECK-NEXT:    [[TMP38:%.*]] = insertelement <4 x i1> [[TMP37]], i1 [[TMP34]], i32 2
 ; CHECK-NEXT:    [[TMP39:%.*]] = insertelement <4 x i1> [[TMP38]], i1 [[TMP35]], i32 3
-; CHECK-NEXT:    [[TMP40:%.*]] = load i1, i1* [[TMP20]], align 1
-; CHECK-NEXT:    [[TMP41:%.*]] = load i1, i1* [[TMP21]], align 1
-; CHECK-NEXT:    [[TMP42:%.*]] = load i1, i1* [[TMP22]], align 1
-; CHECK-NEXT:    [[TMP43:%.*]] = load i1, i1* [[TMP23]], align 1
+; CHECK-NEXT:    [[TMP40:%.*]] = load i1, ptr [[TMP20]], align 1
+; CHECK-NEXT:    [[TMP41:%.*]] = load i1, ptr [[TMP21]], align 1
+; CHECK-NEXT:    [[TMP42:%.*]] = load i1, ptr [[TMP22]], align 1
+; CHECK-NEXT:    [[TMP43:%.*]] = load i1, ptr [[TMP23]], align 1
 ; CHECK-NEXT:    [[TMP44:%.*]] = insertelement <4 x i1> poison, i1 [[TMP40]], i32 0
 ; CHECK-NEXT:    [[TMP45:%.*]] = insertelement <4 x i1> [[TMP44]], i1 [[TMP41]], i32 1
 ; CHECK-NEXT:    [[TMP46:%.*]] = insertelement <4 x i1> [[TMP45]], i1 [[TMP42]], i32 2
 ; CHECK-NEXT:    [[TMP47:%.*]] = insertelement <4 x i1> [[TMP46]], i1 [[TMP43]], i32 3
-; CHECK-NEXT:    [[TMP48:%.*]] = load i1, i1* [[TMP24]], align 1
-; CHECK-NEXT:    [[TMP49:%.*]] = load i1, i1* [[TMP25]], align 1
-; CHECK-NEXT:    [[TMP50:%.*]] = load i1, i1* [[TMP26]], align 1
-; CHECK-NEXT:    [[TMP51:%.*]] = load i1, i1* [[TMP27]], align 1
+; CHECK-NEXT:    [[TMP48:%.*]] = load i1, ptr [[TMP24]], align 1
+; CHECK-NEXT:    [[TMP49:%.*]] = load i1, ptr [[TMP25]], align 1
+; CHECK-NEXT:    [[TMP50:%.*]] = load i1, ptr [[TMP26]], align 1
+; CHECK-NEXT:    [[TMP51:%.*]] = load i1, ptr [[TMP27]], align 1
 ; CHECK-NEXT:    [[TMP52:%.*]] = insertelement <4 x i1> poison, i1 [[TMP48]], i32 0
 ; CHECK-NEXT:    [[TMP53:%.*]] = insertelement <4 x i1> [[TMP52]], i1 [[TMP49]], i32 1
 ; CHECK-NEXT:    [[TMP54:%.*]] = insertelement <4 x i1> [[TMP53]], i1 [[TMP50]], i32 2
 ; CHECK-NEXT:    [[TMP55:%.*]] = insertelement <4 x i1> [[TMP54]], i1 [[TMP51]], i32 3
-; CHECK-NEXT:    [[TMP56:%.*]] = load i1, i1* [[TMP28]], align 1
-; CHECK-NEXT:    [[TMP57:%.*]] = load i1, i1* [[TMP29]], align 1
-; CHECK-NEXT:    [[TMP58:%.*]] = load i1, i1* [[TMP30]], align 1
-; CHECK-NEXT:    [[TMP59:%.*]] = load i1, i1* [[TMP31]], align 1
+; CHECK-NEXT:    [[TMP56:%.*]] = load i1, ptr [[TMP28]], align 1
+; CHECK-NEXT:    [[TMP57:%.*]] = load i1, ptr [[TMP29]], align 1
+; CHECK-NEXT:    [[TMP58:%.*]] = load i1, ptr [[TMP30]], align 1
+; CHECK-NEXT:    [[TMP59:%.*]] = load i1, ptr [[TMP31]], align 1
 ; CHECK-NEXT:    [[TMP60:%.*]] = insertelement <4 x i1> poison, i1 [[TMP56]], i32 0
 ; CHECK-NEXT:    [[TMP61:%.*]] = insertelement <4 x i1> [[TMP60]], i1 [[TMP57]], i32 1
 ; CHECK-NEXT:    [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 2
 ; CHECK-NEXT:    [[TMP63:%.*]] = insertelement <4 x i1> [[TMP62]], i1 [[TMP59]], i32 3
-; CHECK-NEXT:    [[TMP64:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP65:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP66:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP8]]
-; CHECK-NEXT:    [[TMP67:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP12]]
-; CHECK-NEXT:    [[TMP68:%.*]] = getelementptr i32, i32* [[TMP64]], i32 0
-; CHECK-NEXT:    [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP69]], i32 4, <4 x i1> [[TMP39]], <4 x i32> poison)
-; CHECK-NEXT:    [[TMP70:%.*]] = getelementptr i32, i32* [[TMP64]], i32 4
-; CHECK-NEXT:    [[TMP71:%.*]] = bitcast i32* [[TMP70]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP71]], i32 4, <4 x i1> [[TMP47]], <4 x i32> poison)
-; CHECK-NEXT:    [[TMP72:%.*]] = getelementptr i32, i32* [[TMP64]], i32 8
-; CHECK-NEXT:    [[TMP73:%.*]] = bitcast i32* [[TMP72]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP73]], i32 4, <4 x i1> [[TMP55]], <4 x i32> poison)
-; CHECK-NEXT:    [[TMP74:%.*]] = getelementptr i32, i32* [[TMP64]], i32 12
-; CHECK-NEXT:    [[TMP75:%.*]] = bitcast i32* [[TMP74]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP75]], i32 4, <4 x i1> [[TMP63]], <4 x i32> poison)
-; CHECK-NEXT:    [[TMP76:%.*]] = xor <4 x i1> [[TMP39]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT:    [[TMP77:%.*]] = xor <4 x i1> [[TMP47]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT:    [[TMP78:%.*]] = xor <4 x i1> [[TMP55]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT:    [[TMP79:%.*]] = xor <4 x i1> [[TMP63]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT:    [[TMP64:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP65:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP66:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP8]]
+; CHECK-NEXT:    [[TMP67:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP12]]
+; CHECK-NEXT:    [[TMP68:%.*]] = getelementptr i32, ptr [[TMP64]], i32 0
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP68]], i32 4, <4 x i1> [[TMP39]], <4 x i32> poison)
+; CHECK-NEXT:    [[TMP69:%.*]] = getelementptr i32, ptr [[TMP64]], i32 4
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP69]], i32 4, <4 x i1> [[TMP47]], <4 x i32> poison)
+; CHECK-NEXT:    [[TMP70:%.*]] = getelementptr i32, ptr [[TMP64]], i32 8
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP70]], i32 4, <4 x i1> [[TMP55]], <4 x i32> poison)
+; CHECK-NEXT:    [[TMP71:%.*]] = getelementptr i32, ptr [[TMP64]], i32 12
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP71]], i32 4, <4 x i1> [[TMP63]], <4 x i32> poison)
+; CHECK-NEXT:    [[TMP72:%.*]] = xor <4 x i1> [[TMP39]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT:    [[TMP73:%.*]] = xor <4 x i1> [[TMP47]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT:    [[TMP74:%.*]] = xor <4 x i1> [[TMP55]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT:    [[TMP75:%.*]] = xor <4 x i1> [[TMP63]], <i1 true, i1 true, i1 true, i1 true>
 ; CHECK-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[TMP39]], <4 x i32> [[WIDE_MASKED_LOAD]], <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[PREDPHI7:%.*]] = select <4 x i1> [[TMP47]], <4 x i32> [[WIDE_MASKED_LOAD4]], <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[PREDPHI8:%.*]] = select <4 x i1> [[TMP55]], <4 x i32> [[WIDE_MASKED_LOAD5]], <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[PREDPHI9:%.*]] = select <4 x i1> [[TMP63]], <4 x i32> [[WIDE_MASKED_LOAD6]], <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP80]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]]
-; CHECK-NEXT:    [[TMP81]] = add <4 x i32> [[VEC_PHI1]], [[PREDPHI7]]
-; CHECK-NEXT:    [[TMP82]] = add <4 x i32> [[VEC_PHI2]], [[PREDPHI8]]
-; CHECK-NEXT:    [[TMP83]] = add <4 x i32> [[VEC_PHI3]], [[PREDPHI9]]
+; CHECK-NEXT:    [[TMP76]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]]
+; CHECK-NEXT:    [[TMP77]] = add <4 x i32> [[VEC_PHI1]], [[PREDPHI7]]
+; CHECK-NEXT:    [[TMP78]] = add <4 x i32> [[VEC_PHI2]], [[PREDPHI8]]
+; CHECK-NEXT:    [[TMP79]] = add <4 x i32> [[VEC_PHI3]], [[PREDPHI9]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; CHECK-NEXT:    [[TMP84:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
-; CHECK-NEXT:    br i1 [[TMP84]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
+; CHECK-NEXT:    [[TMP80:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
+; CHECK-NEXT:    br i1 [[TMP80]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[BIN_RDX:%.*]] = add <4 x i32> [[TMP81]], [[TMP80]]
-; CHECK-NEXT:    [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP82]], [[BIN_RDX]]
-; CHECK-NEXT:    [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP83]], [[BIN_RDX10]]
-; CHECK-NEXT:    [[TMP85:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]])
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = add <4 x i32> [[TMP77]], [[TMP76]]
+; CHECK-NEXT:    [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP78]], [[BIN_RDX]]
+; CHECK-NEXT:    [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP79]], [[BIN_RDX10]]
+; CHECK-NEXT:    [[TMP81:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]])
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 4096, 4096
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP85]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP81]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ]
 ; CHECK-NEXT:    [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[LATCH]] ]
 ; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT:    [[TEST_ADDR:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[IV]]
-; CHECK-NEXT:    [[EARLYCND:%.*]] = load i1, i1* [[TEST_ADDR]], align 1
+; CHECK-NEXT:    [[TEST_ADDR:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[IV]]
+; CHECK-NEXT:    [[EARLYCND:%.*]] = load i1, ptr [[TEST_ADDR]], align 1
 ; CHECK-NEXT:    br i1 [[EARLYCND]], label [[PRED:%.*]], label [[LATCH]]
 ; CHECK:       pred:
-; CHECK-NEXT:    [[ADDR:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[IV]]
-; CHECK-NEXT:    [[VAL:%.*]] = load i32, i32* [[ADDR]], align 4
+; CHECK-NEXT:    [[ADDR:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[IV]]
+; CHECK-NEXT:    [[VAL:%.*]] = load i32, ptr [[ADDR]], align 4
 ; CHECK-NEXT:    br label [[LATCH]]
 ; CHECK:       latch:
 ; CHECK-NEXT:    [[VAL_PHI:%.*]] = phi i32 [ 0, [[LOOP]] ], [ [[VAL]], [[PRED]] ]
@@ -1823,24 +1744,23 @@ define i32 @neg_off_by_one_iteration(i64 %len, i1* %test_base) {
 ; CHECK-NEXT:    [[EXIT:%.*]] = icmp ugt i64 [[IV]], 4094
 ; CHECK-NEXT:    br i1 [[EXIT]], label [[LOOP_EXIT]], label [[LOOP]], !llvm.loop [[LOOP19:![0-9]+]]
 ; CHECK:       loop_exit:
-; CHECK-NEXT:    [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP85]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP81]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    ret i32 [[ACCUM_NEXT_LCSSA]]
 ;
 entry:
   %alloca = alloca [4095 x i32]
-  %base = bitcast [4095 x i32]* %alloca to i32*
-  call void @init(i32* %base)
+  call void @init(ptr %alloca)
   br label %loop
 loop:
   %iv = phi i64 [ 0, %entry ], [ %iv.next, %latch ]
   %accum = phi i32 [ 0, %entry ], [ %accum.next, %latch ]
   %iv.next = add i64 %iv, 1
-  %test_addr = getelementptr inbounds i1, i1* %test_base, i64 %iv
-  %earlycnd = load i1, i1* %test_addr
+  %test_addr = getelementptr inbounds i1, ptr %test_base, i64 %iv
+  %earlycnd = load i1, ptr %test_addr
   br i1 %earlycnd, label %pred, label %latch
 pred:
-  %addr = getelementptr inbounds i32, i32* %base, i64 %iv
-  %val = load i32, i32* %addr
+  %addr = getelementptr inbounds i32, ptr %alloca, i64 %iv
+  %val = load i32, ptr %addr
   br label %latch
 latch:
   %val.phi = phi i32 [0, %loop], [%val, %pred]
@@ -1852,21 +1772,20 @@ loop_exit:
   ret i32 %accum.next
 }
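
For orientation, a rough C analogue of @neg_off_by_one_iteration (a hedged
sketch; the C-level function name and the init() prototype are assumptions
mirroring the test's @init, not part of the commit): the latch only exits
once iv exceeds 4094, so the last iteration can touch index 4095 of a
4095-element buffer. Since that access cannot be proven in bounds, the
vectorizer has to fall back to the masked loads seen in the CHECK lines
above.

    #include <stdint.h>

    extern void init(int32_t *p);      /* assumed initializer, like @init */

    int32_t neg_off_by_one_iteration(const _Bool *test_base) {
      int32_t a[4095];
      init(a);
      int32_t accum = 0;
      for (int64_t iv = 0; ; iv++) {
        int32_t val = 0;
        if (test_base[iv])
          val = a[iv];                 /* iv can reach 4095: one past the end */
        accum += val;
        if (iv > 4094)
          break;
      }
      return accum;
    }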
 
-define i32 @neg_off_by_one_byte(i64 %len, i1* %test_base) {
+define i32 @neg_off_by_one_byte(i64 %len, ptr %test_base) {
 ; CHECK-LABEL: @neg_off_by_one_byte(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[ALLOCA:%.*]] = alloca [16383 x i8], align 1
-; CHECK-NEXT:    [[BASE:%.*]] = bitcast [16383 x i8]* [[ALLOCA]] to i32*
-; CHECK-NEXT:    call void @init(i32* [[BASE]])
+; CHECK-NEXT:    call void @init(ptr [[ALLOCA]])
 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP80:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP81:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP82:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP83:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP76:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP77:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP78:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP79:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 2
@@ -1883,106 +1802,102 @@ define i32 @neg_off_by_one_byte(i64 %len, i1* %test_base) {
 ; CHECK-NEXT:    [[TMP13:%.*]] = add i64 [[INDEX]], 13
 ; CHECK-NEXT:    [[TMP14:%.*]] = add i64 [[INDEX]], 14
 ; CHECK-NEXT:    [[TMP15:%.*]] = add i64 [[INDEX]], 15
-; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP2]]
-; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP3]]
-; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP5]]
-; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP6]]
-; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP7]]
-; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP8]]
-; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP9]]
-; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP10]]
-; CHECK-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP11]]
-; CHECK-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP12]]
-; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP13]]
-; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP14]]
-; CHECK-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP15]]
-; CHECK-NEXT:    [[TMP32:%.*]] = load i1, i1* [[TMP16]], align 1
-; CHECK-NEXT:    [[TMP33:%.*]] = load i1, i1* [[TMP17]], align 1
-; CHECK-NEXT:    [[TMP34:%.*]] = load i1, i1* [[TMP18]], align 1
-; CHECK-NEXT:    [[TMP35:%.*]] = load i1, i1* [[TMP19]], align 1
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP2]]
+; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP5]]
+; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP6]]
+; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP7]]
+; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP8]]
+; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP9]]
+; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP10]]
+; CHECK-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP11]]
+; CHECK-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP12]]
+; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP13]]
+; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP14]]
+; CHECK-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP15]]
+; CHECK-NEXT:    [[TMP32:%.*]] = load i1, ptr [[TMP16]], align 1
+; CHECK-NEXT:    [[TMP33:%.*]] = load i1, ptr [[TMP17]], align 1
+; CHECK-NEXT:    [[TMP34:%.*]] = load i1, ptr [[TMP18]], align 1
+; CHECK-NEXT:    [[TMP35:%.*]] = load i1, ptr [[TMP19]], align 1
 ; CHECK-NEXT:    [[TMP36:%.*]] = insertelement <4 x i1> poison, i1 [[TMP32]], i32 0
 ; CHECK-NEXT:    [[TMP37:%.*]] = insertelement <4 x i1> [[TMP36]], i1 [[TMP33]], i32 1
 ; CHECK-NEXT:    [[TMP38:%.*]] = insertelement <4 x i1> [[TMP37]], i1 [[TMP34]], i32 2
 ; CHECK-NEXT:    [[TMP39:%.*]] = insertelement <4 x i1> [[TMP38]], i1 [[TMP35]], i32 3
-; CHECK-NEXT:    [[TMP40:%.*]] = load i1, i1* [[TMP20]], align 1
-; CHECK-NEXT:    [[TMP41:%.*]] = load i1, i1* [[TMP21]], align 1
-; CHECK-NEXT:    [[TMP42:%.*]] = load i1, i1* [[TMP22]], align 1
-; CHECK-NEXT:    [[TMP43:%.*]] = load i1, i1* [[TMP23]], align 1
+; CHECK-NEXT:    [[TMP40:%.*]] = load i1, ptr [[TMP20]], align 1
+; CHECK-NEXT:    [[TMP41:%.*]] = load i1, ptr [[TMP21]], align 1
+; CHECK-NEXT:    [[TMP42:%.*]] = load i1, ptr [[TMP22]], align 1
+; CHECK-NEXT:    [[TMP43:%.*]] = load i1, ptr [[TMP23]], align 1
 ; CHECK-NEXT:    [[TMP44:%.*]] = insertelement <4 x i1> poison, i1 [[TMP40]], i32 0
 ; CHECK-NEXT:    [[TMP45:%.*]] = insertelement <4 x i1> [[TMP44]], i1 [[TMP41]], i32 1
 ; CHECK-NEXT:    [[TMP46:%.*]] = insertelement <4 x i1> [[TMP45]], i1 [[TMP42]], i32 2
 ; CHECK-NEXT:    [[TMP47:%.*]] = insertelement <4 x i1> [[TMP46]], i1 [[TMP43]], i32 3
-; CHECK-NEXT:    [[TMP48:%.*]] = load i1, i1* [[TMP24]], align 1
-; CHECK-NEXT:    [[TMP49:%.*]] = load i1, i1* [[TMP25]], align 1
-; CHECK-NEXT:    [[TMP50:%.*]] = load i1, i1* [[TMP26]], align 1
-; CHECK-NEXT:    [[TMP51:%.*]] = load i1, i1* [[TMP27]], align 1
+; CHECK-NEXT:    [[TMP48:%.*]] = load i1, ptr [[TMP24]], align 1
+; CHECK-NEXT:    [[TMP49:%.*]] = load i1, ptr [[TMP25]], align 1
+; CHECK-NEXT:    [[TMP50:%.*]] = load i1, ptr [[TMP26]], align 1
+; CHECK-NEXT:    [[TMP51:%.*]] = load i1, ptr [[TMP27]], align 1
 ; CHECK-NEXT:    [[TMP52:%.*]] = insertelement <4 x i1> poison, i1 [[TMP48]], i32 0
 ; CHECK-NEXT:    [[TMP53:%.*]] = insertelement <4 x i1> [[TMP52]], i1 [[TMP49]], i32 1
 ; CHECK-NEXT:    [[TMP54:%.*]] = insertelement <4 x i1> [[TMP53]], i1 [[TMP50]], i32 2
 ; CHECK-NEXT:    [[TMP55:%.*]] = insertelement <4 x i1> [[TMP54]], i1 [[TMP51]], i32 3
-; CHECK-NEXT:    [[TMP56:%.*]] = load i1, i1* [[TMP28]], align 1
-; CHECK-NEXT:    [[TMP57:%.*]] = load i1, i1* [[TMP29]], align 1
-; CHECK-NEXT:    [[TMP58:%.*]] = load i1, i1* [[TMP30]], align 1
-; CHECK-NEXT:    [[TMP59:%.*]] = load i1, i1* [[TMP31]], align 1
+; CHECK-NEXT:    [[TMP56:%.*]] = load i1, ptr [[TMP28]], align 1
+; CHECK-NEXT:    [[TMP57:%.*]] = load i1, ptr [[TMP29]], align 1
+; CHECK-NEXT:    [[TMP58:%.*]] = load i1, ptr [[TMP30]], align 1
+; CHECK-NEXT:    [[TMP59:%.*]] = load i1, ptr [[TMP31]], align 1
 ; CHECK-NEXT:    [[TMP60:%.*]] = insertelement <4 x i1> poison, i1 [[TMP56]], i32 0
 ; CHECK-NEXT:    [[TMP61:%.*]] = insertelement <4 x i1> [[TMP60]], i1 [[TMP57]], i32 1
 ; CHECK-NEXT:    [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 2
 ; CHECK-NEXT:    [[TMP63:%.*]] = insertelement <4 x i1> [[TMP62]], i1 [[TMP59]], i32 3
-; CHECK-NEXT:    [[TMP64:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP65:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP66:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP8]]
-; CHECK-NEXT:    [[TMP67:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP12]]
-; CHECK-NEXT:    [[TMP68:%.*]] = getelementptr i32, i32* [[TMP64]], i32 0
-; CHECK-NEXT:    [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP69]], i32 4, <4 x i1> [[TMP39]], <4 x i32> poison)
-; CHECK-NEXT:    [[TMP70:%.*]] = getelementptr i32, i32* [[TMP64]], i32 4
-; CHECK-NEXT:    [[TMP71:%.*]] = bitcast i32* [[TMP70]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP71]], i32 4, <4 x i1> [[TMP47]], <4 x i32> poison)
-; CHECK-NEXT:    [[TMP72:%.*]] = getelementptr i32, i32* [[TMP64]], i32 8
-; CHECK-NEXT:    [[TMP73:%.*]] = bitcast i32* [[TMP72]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP73]], i32 4, <4 x i1> [[TMP55]], <4 x i32> poison)
-; CHECK-NEXT:    [[TMP74:%.*]] = getelementptr i32, i32* [[TMP64]], i32 12
-; CHECK-NEXT:    [[TMP75:%.*]] = bitcast i32* [[TMP74]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP75]], i32 4, <4 x i1> [[TMP63]], <4 x i32> poison)
-; CHECK-NEXT:    [[TMP76:%.*]] = xor <4 x i1> [[TMP39]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT:    [[TMP77:%.*]] = xor <4 x i1> [[TMP47]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT:    [[TMP78:%.*]] = xor <4 x i1> [[TMP55]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT:    [[TMP79:%.*]] = xor <4 x i1> [[TMP63]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT:    [[TMP64:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP65:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP66:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP8]]
+; CHECK-NEXT:    [[TMP67:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP12]]
+; CHECK-NEXT:    [[TMP68:%.*]] = getelementptr i32, ptr [[TMP64]], i32 0
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP68]], i32 4, <4 x i1> [[TMP39]], <4 x i32> poison)
+; CHECK-NEXT:    [[TMP69:%.*]] = getelementptr i32, ptr [[TMP64]], i32 4
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP69]], i32 4, <4 x i1> [[TMP47]], <4 x i32> poison)
+; CHECK-NEXT:    [[TMP70:%.*]] = getelementptr i32, ptr [[TMP64]], i32 8
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP70]], i32 4, <4 x i1> [[TMP55]], <4 x i32> poison)
+; CHECK-NEXT:    [[TMP71:%.*]] = getelementptr i32, ptr [[TMP64]], i32 12
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP71]], i32 4, <4 x i1> [[TMP63]], <4 x i32> poison)
+; CHECK-NEXT:    [[TMP72:%.*]] = xor <4 x i1> [[TMP39]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT:    [[TMP73:%.*]] = xor <4 x i1> [[TMP47]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT:    [[TMP74:%.*]] = xor <4 x i1> [[TMP55]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT:    [[TMP75:%.*]] = xor <4 x i1> [[TMP63]], <i1 true, i1 true, i1 true, i1 true>
 ; CHECK-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[TMP39]], <4 x i32> [[WIDE_MASKED_LOAD]], <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[PREDPHI7:%.*]] = select <4 x i1> [[TMP47]], <4 x i32> [[WIDE_MASKED_LOAD4]], <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[PREDPHI8:%.*]] = select <4 x i1> [[TMP55]], <4 x i32> [[WIDE_MASKED_LOAD5]], <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[PREDPHI9:%.*]] = select <4 x i1> [[TMP63]], <4 x i32> [[WIDE_MASKED_LOAD6]], <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP80]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]]
-; CHECK-NEXT:    [[TMP81]] = add <4 x i32> [[VEC_PHI1]], [[PREDPHI7]]
-; CHECK-NEXT:    [[TMP82]] = add <4 x i32> [[VEC_PHI2]], [[PREDPHI8]]
-; CHECK-NEXT:    [[TMP83]] = add <4 x i32> [[VEC_PHI3]], [[PREDPHI9]]
+; CHECK-NEXT:    [[TMP76]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]]
+; CHECK-NEXT:    [[TMP77]] = add <4 x i32> [[VEC_PHI1]], [[PREDPHI7]]
+; CHECK-NEXT:    [[TMP78]] = add <4 x i32> [[VEC_PHI2]], [[PREDPHI8]]
+; CHECK-NEXT:    [[TMP79]] = add <4 x i32> [[VEC_PHI3]], [[PREDPHI9]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; CHECK-NEXT:    [[TMP84:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
-; CHECK-NEXT:    br i1 [[TMP84]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; CHECK-NEXT:    [[TMP80:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
+; CHECK-NEXT:    br i1 [[TMP80]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[BIN_RDX:%.*]] = add <4 x i32> [[TMP81]], [[TMP80]]
-; CHECK-NEXT:    [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP82]], [[BIN_RDX]]
-; CHECK-NEXT:    [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP83]], [[BIN_RDX10]]
-; CHECK-NEXT:    [[TMP85:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]])
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = add <4 x i32> [[TMP77]], [[TMP76]]
+; CHECK-NEXT:    [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP78]], [[BIN_RDX]]
+; CHECK-NEXT:    [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP79]], [[BIN_RDX10]]
+; CHECK-NEXT:    [[TMP81:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]])
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 4096, 4096
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP85]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP81]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ]
 ; CHECK-NEXT:    [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[LATCH]] ]
 ; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT:    [[TEST_ADDR:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[IV]]
-; CHECK-NEXT:    [[EARLYCND:%.*]] = load i1, i1* [[TEST_ADDR]], align 1
+; CHECK-NEXT:    [[TEST_ADDR:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[IV]]
+; CHECK-NEXT:    [[EARLYCND:%.*]] = load i1, ptr [[TEST_ADDR]], align 1
 ; CHECK-NEXT:    br i1 [[EARLYCND]], label [[PRED:%.*]], label [[LATCH]]
 ; CHECK:       pred:
-; CHECK-NEXT:    [[ADDR:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[IV]]
-; CHECK-NEXT:    [[VAL:%.*]] = load i32, i32* [[ADDR]], align 4
+; CHECK-NEXT:    [[ADDR:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[IV]]
+; CHECK-NEXT:    [[VAL:%.*]] = load i32, ptr [[ADDR]], align 4
 ; CHECK-NEXT:    br label [[LATCH]]
 ; CHECK:       latch:
 ; CHECK-NEXT:    [[VAL_PHI:%.*]] = phi i32 [ 0, [[LOOP]] ], [ [[VAL]], [[PRED]] ]
@@ -1990,24 +1905,23 @@ define i32 @neg_off_by_one_byte(i64 %len, i1* %test_base) {
 ; CHECK-NEXT:    [[EXIT:%.*]] = icmp ugt i64 [[IV]], 4094
 ; CHECK-NEXT:    br i1 [[EXIT]], label [[LOOP_EXIT]], label [[LOOP]], !llvm.loop [[LOOP21:![0-9]+]]
 ; CHECK:       loop_exit:
-; CHECK-NEXT:    [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP85]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP81]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    ret i32 [[ACCUM_NEXT_LCSSA]]
 ;
 entry:
   %alloca = alloca [16383 x i8]
-  %base = bitcast [16383 x i8]* %alloca to i32*
-  call void @init(i32* %base)
+  call void @init(ptr %alloca)
   br label %loop
 loop:
   %iv = phi i64 [ 0, %entry ], [ %iv.next, %latch ]
   %accum = phi i32 [ 0, %entry ], [ %accum.next, %latch ]
   %iv.next = add i64 %iv, 1
-  %test_addr = getelementptr inbounds i1, i1* %test_base, i64 %iv
-  %earlycnd = load i1, i1* %test_addr
+  %test_addr = getelementptr inbounds i1, ptr %test_base, i64 %iv
+  %earlycnd = load i1, ptr %test_addr
   br i1 %earlycnd, label %pred, label %latch
 pred:
-  %addr = getelementptr inbounds i32, i32* %base, i64 %iv
-  %val = load i32, i32* %addr
+  %addr = getelementptr inbounds i32, ptr %alloca, i64 %iv
+  %val = load i32, ptr %addr
   br label %latch
 latch:
   %val.phi = phi i32 [0, %loop], [%val, %pred]
@@ -2022,12 +1936,11 @@ loop_exit:
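
The same off-by-one shows up at byte granularity in @neg_off_by_one_byte; a
standalone arithmetic sketch (nothing below comes from the commit itself) of
why the 16383-byte alloca is one byte short for 4096 contiguous i32 loads,
again forcing masked loads:

    #include <assert.h>

    int main(void) {
      long bytes_available = 16383;             /* alloca [16383 x i8] */
      long last_index = 4095;                   /* iv reaches 4095 */
      long bytes_needed = (last_index + 1) * 4; /* contiguous i32 loads */
      assert(bytes_needed - bytes_available == 1); /* short by one byte */
      return 0;
    }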
 
 ; Show that we handle the case where the exit count is non-constant, but that
 ; we have a constant bound on it which is sufficient to show dereferenceability.
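
In C terms the clamp is what makes this provable (a sketch under assumed
names, using a signed min where the IR compares unsigned): whatever len is,
min is at most 4094, the highest index touched is min + 1 <= 4095, and every
access stays inside the 4096-element buffer, so the loads below can be
widened without masks.

    #include <stdint.h>

    extern void init(int32_t *p);            /* assumed initializer */

    int32_t test_constant_max(int64_t len, const _Bool *test_base) {
      int32_t a[4096];
      init(a);
      int64_t min = len > 4094 ? 4094 : len; /* constant upper bound */
      int32_t accum = 0;
      for (int64_t iv = 0; ; iv++) {
        int32_t val = 0;
        if (test_base[iv])
          val = a[iv];                       /* iv <= min + 1 <= 4095 */
        accum += val;
        if (iv > min)
          break;
      }
      return accum;
    }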
-define i32 @test_constant_max(i64 %len, i1* %test_base) {
+define i32 @test_constant_max(i64 %len, ptr %test_base) {
 ; CHECK-LABEL: @test_constant_max(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[ALLOCA:%.*]] = alloca [4096 x i32], align 4
-; CHECK-NEXT:    [[BASE:%.*]] = bitcast [4096 x i32]* [[ALLOCA]] to i32*
-; CHECK-NEXT:    call void @init(i32* [[BASE]])
+; CHECK-NEXT:    call void @init(ptr [[ALLOCA]])
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp ugt i64 [[LEN:%.*]], 4094
 ; CHECK-NEXT:    [[MIN:%.*]] = select i1 [[CMP]], i64 4094, i64 [[LEN]]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add nuw nsw i64 [[MIN]], 2
@@ -2039,10 +1952,10 @@ define i32 @test_constant_max(i64 %len, i1* %test_base) {
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP81:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP82:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP83:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP84:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP77:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP78:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP79:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP80:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 1
 ; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 2
@@ -2059,106 +1972,102 @@ define i32 @test_constant_max(i64 %len, i1* %test_base) {
 ; CHECK-NEXT:    [[TMP14:%.*]] = add i64 [[INDEX]], 13
 ; CHECK-NEXT:    [[TMP15:%.*]] = add i64 [[INDEX]], 14
 ; CHECK-NEXT:    [[TMP16:%.*]] = add i64 [[INDEX]], 15
-; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE:%.*]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP2]]
-; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP3]]
-; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP5]]
-; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP6]]
-; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP7]]
-; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP8]]
-; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP9]]
-; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP10]]
-; CHECK-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP11]]
-; CHECK-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP12]]
-; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP13]]
-; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP14]]
-; CHECK-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP15]]
-; CHECK-NEXT:    [[TMP32:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP16]]
-; CHECK-NEXT:    [[TMP33:%.*]] = load i1, i1* [[TMP17]], align 1
-; CHECK-NEXT:    [[TMP34:%.*]] = load i1, i1* [[TMP18]], align 1
-; CHECK-NEXT:    [[TMP35:%.*]] = load i1, i1* [[TMP19]], align 1
-; CHECK-NEXT:    [[TMP36:%.*]] = load i1, i1* [[TMP20]], align 1
+; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE:%.*]], i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP2]]
+; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP5]]
+; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP6]]
+; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP7]]
+; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP8]]
+; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP9]]
+; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP10]]
+; CHECK-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP11]]
+; CHECK-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP12]]
+; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP13]]
+; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP14]]
+; CHECK-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP15]]
+; CHECK-NEXT:    [[TMP32:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP16]]
+; CHECK-NEXT:    [[TMP33:%.*]] = load i1, ptr [[TMP17]], align 1
+; CHECK-NEXT:    [[TMP34:%.*]] = load i1, ptr [[TMP18]], align 1
+; CHECK-NEXT:    [[TMP35:%.*]] = load i1, ptr [[TMP19]], align 1
+; CHECK-NEXT:    [[TMP36:%.*]] = load i1, ptr [[TMP20]], align 1
 ; CHECK-NEXT:    [[TMP37:%.*]] = insertelement <4 x i1> poison, i1 [[TMP33]], i32 0
 ; CHECK-NEXT:    [[TMP38:%.*]] = insertelement <4 x i1> [[TMP37]], i1 [[TMP34]], i32 1
 ; CHECK-NEXT:    [[TMP39:%.*]] = insertelement <4 x i1> [[TMP38]], i1 [[TMP35]], i32 2
 ; CHECK-NEXT:    [[TMP40:%.*]] = insertelement <4 x i1> [[TMP39]], i1 [[TMP36]], i32 3
-; CHECK-NEXT:    [[TMP41:%.*]] = load i1, i1* [[TMP21]], align 1
-; CHECK-NEXT:    [[TMP42:%.*]] = load i1, i1* [[TMP22]], align 1
-; CHECK-NEXT:    [[TMP43:%.*]] = load i1, i1* [[TMP23]], align 1
-; CHECK-NEXT:    [[TMP44:%.*]] = load i1, i1* [[TMP24]], align 1
+; CHECK-NEXT:    [[TMP41:%.*]] = load i1, ptr [[TMP21]], align 1
+; CHECK-NEXT:    [[TMP42:%.*]] = load i1, ptr [[TMP22]], align 1
+; CHECK-NEXT:    [[TMP43:%.*]] = load i1, ptr [[TMP23]], align 1
+; CHECK-NEXT:    [[TMP44:%.*]] = load i1, ptr [[TMP24]], align 1
 ; CHECK-NEXT:    [[TMP45:%.*]] = insertelement <4 x i1> poison, i1 [[TMP41]], i32 0
 ; CHECK-NEXT:    [[TMP46:%.*]] = insertelement <4 x i1> [[TMP45]], i1 [[TMP42]], i32 1
 ; CHECK-NEXT:    [[TMP47:%.*]] = insertelement <4 x i1> [[TMP46]], i1 [[TMP43]], i32 2
 ; CHECK-NEXT:    [[TMP48:%.*]] = insertelement <4 x i1> [[TMP47]], i1 [[TMP44]], i32 3
-; CHECK-NEXT:    [[TMP49:%.*]] = load i1, i1* [[TMP25]], align 1
-; CHECK-NEXT:    [[TMP50:%.*]] = load i1, i1* [[TMP26]], align 1
-; CHECK-NEXT:    [[TMP51:%.*]] = load i1, i1* [[TMP27]], align 1
-; CHECK-NEXT:    [[TMP52:%.*]] = load i1, i1* [[TMP28]], align 1
+; CHECK-NEXT:    [[TMP49:%.*]] = load i1, ptr [[TMP25]], align 1
+; CHECK-NEXT:    [[TMP50:%.*]] = load i1, ptr [[TMP26]], align 1
+; CHECK-NEXT:    [[TMP51:%.*]] = load i1, ptr [[TMP27]], align 1
+; CHECK-NEXT:    [[TMP52:%.*]] = load i1, ptr [[TMP28]], align 1
 ; CHECK-NEXT:    [[TMP53:%.*]] = insertelement <4 x i1> poison, i1 [[TMP49]], i32 0
 ; CHECK-NEXT:    [[TMP54:%.*]] = insertelement <4 x i1> [[TMP53]], i1 [[TMP50]], i32 1
 ; CHECK-NEXT:    [[TMP55:%.*]] = insertelement <4 x i1> [[TMP54]], i1 [[TMP51]], i32 2
 ; CHECK-NEXT:    [[TMP56:%.*]] = insertelement <4 x i1> [[TMP55]], i1 [[TMP52]], i32 3
-; CHECK-NEXT:    [[TMP57:%.*]] = load i1, i1* [[TMP29]], align 1
-; CHECK-NEXT:    [[TMP58:%.*]] = load i1, i1* [[TMP30]], align 1
-; CHECK-NEXT:    [[TMP59:%.*]] = load i1, i1* [[TMP31]], align 1
-; CHECK-NEXT:    [[TMP60:%.*]] = load i1, i1* [[TMP32]], align 1
+; CHECK-NEXT:    [[TMP57:%.*]] = load i1, ptr [[TMP29]], align 1
+; CHECK-NEXT:    [[TMP58:%.*]] = load i1, ptr [[TMP30]], align 1
+; CHECK-NEXT:    [[TMP59:%.*]] = load i1, ptr [[TMP31]], align 1
+; CHECK-NEXT:    [[TMP60:%.*]] = load i1, ptr [[TMP32]], align 1
 ; CHECK-NEXT:    [[TMP61:%.*]] = insertelement <4 x i1> poison, i1 [[TMP57]], i32 0
 ; CHECK-NEXT:    [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 1
 ; CHECK-NEXT:    [[TMP63:%.*]] = insertelement <4 x i1> [[TMP62]], i1 [[TMP59]], i32 2
 ; CHECK-NEXT:    [[TMP64:%.*]] = insertelement <4 x i1> [[TMP63]], i1 [[TMP60]], i32 3
-; CHECK-NEXT:    [[TMP65:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP66:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP5]]
-; CHECK-NEXT:    [[TMP67:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP9]]
-; CHECK-NEXT:    [[TMP68:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP13]]
-; CHECK-NEXT:    [[TMP69:%.*]] = getelementptr i32, i32* [[TMP65]], i32 0
-; CHECK-NEXT:    [[TMP70:%.*]] = bitcast i32* [[TMP69]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP70]], align 4
-; CHECK-NEXT:    [[TMP71:%.*]] = getelementptr i32, i32* [[TMP65]], i32 4
-; CHECK-NEXT:    [[TMP72:%.*]] = bitcast i32* [[TMP71]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP72]], align 4
-; CHECK-NEXT:    [[TMP73:%.*]] = getelementptr i32, i32* [[TMP65]], i32 8
-; CHECK-NEXT:    [[TMP74:%.*]] = bitcast i32* [[TMP73]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP74]], align 4
-; CHECK-NEXT:    [[TMP75:%.*]] = getelementptr i32, i32* [[TMP65]], i32 12
-; CHECK-NEXT:    [[TMP76:%.*]] = bitcast i32* [[TMP75]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP76]], align 4
-; CHECK-NEXT:    [[TMP77:%.*]] = xor <4 x i1> [[TMP40]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT:    [[TMP78:%.*]] = xor <4 x i1> [[TMP48]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT:    [[TMP79:%.*]] = xor <4 x i1> [[TMP56]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT:    [[TMP80:%.*]] = xor <4 x i1> [[TMP64]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT:    [[TMP65:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP66:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP5]]
+; CHECK-NEXT:    [[TMP67:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP9]]
+; CHECK-NEXT:    [[TMP68:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP13]]
+; CHECK-NEXT:    [[TMP69:%.*]] = getelementptr i32, ptr [[TMP65]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP69]], align 4
+; CHECK-NEXT:    [[TMP70:%.*]] = getelementptr i32, ptr [[TMP65]], i32 4
+; CHECK-NEXT:    [[WIDE_LOAD4:%.*]] = load <4 x i32>, ptr [[TMP70]], align 4
+; CHECK-NEXT:    [[TMP71:%.*]] = getelementptr i32, ptr [[TMP65]], i32 8
+; CHECK-NEXT:    [[WIDE_LOAD5:%.*]] = load <4 x i32>, ptr [[TMP71]], align 4
+; CHECK-NEXT:    [[TMP72:%.*]] = getelementptr i32, ptr [[TMP65]], i32 12
+; CHECK-NEXT:    [[WIDE_LOAD6:%.*]] = load <4 x i32>, ptr [[TMP72]], align 4
+; CHECK-NEXT:    [[TMP73:%.*]] = xor <4 x i1> [[TMP40]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT:    [[TMP74:%.*]] = xor <4 x i1> [[TMP48]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT:    [[TMP75:%.*]] = xor <4 x i1> [[TMP56]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT:    [[TMP76:%.*]] = xor <4 x i1> [[TMP64]], <i1 true, i1 true, i1 true, i1 true>
 ; CHECK-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[TMP40]], <4 x i32> [[WIDE_LOAD]], <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[PREDPHI7:%.*]] = select <4 x i1> [[TMP48]], <4 x i32> [[WIDE_LOAD4]], <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[PREDPHI8:%.*]] = select <4 x i1> [[TMP56]], <4 x i32> [[WIDE_LOAD5]], <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[PREDPHI9:%.*]] = select <4 x i1> [[TMP64]], <4 x i32> [[WIDE_LOAD6]], <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP81]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]]
-; CHECK-NEXT:    [[TMP82]] = add <4 x i32> [[VEC_PHI1]], [[PREDPHI7]]
-; CHECK-NEXT:    [[TMP83]] = add <4 x i32> [[VEC_PHI2]], [[PREDPHI8]]
-; CHECK-NEXT:    [[TMP84]] = add <4 x i32> [[VEC_PHI3]], [[PREDPHI9]]
+; CHECK-NEXT:    [[TMP77]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]]
+; CHECK-NEXT:    [[TMP78]] = add <4 x i32> [[VEC_PHI1]], [[PREDPHI7]]
+; CHECK-NEXT:    [[TMP79]] = add <4 x i32> [[VEC_PHI2]], [[PREDPHI8]]
+; CHECK-NEXT:    [[TMP80]] = add <4 x i32> [[VEC_PHI3]], [[PREDPHI9]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; CHECK-NEXT:    [[TMP85:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP85]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
+; CHECK-NEXT:    [[TMP81:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP81]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[BIN_RDX:%.*]] = add <4 x i32> [[TMP82]], [[TMP81]]
-; CHECK-NEXT:    [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP83]], [[BIN_RDX]]
-; CHECK-NEXT:    [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP84]], [[BIN_RDX10]]
-; CHECK-NEXT:    [[TMP86:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]])
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = add <4 x i32> [[TMP78]], [[TMP77]]
+; CHECK-NEXT:    [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP79]], [[BIN_RDX]]
+; CHECK-NEXT:    [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP80]], [[BIN_RDX10]]
+; CHECK-NEXT:    [[TMP82:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]])
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP86]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP82]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ]
 ; CHECK-NEXT:    [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[LATCH]] ]
 ; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT:    [[TEST_ADDR:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[IV]]
-; CHECK-NEXT:    [[EARLYCND:%.*]] = load i1, i1* [[TEST_ADDR]], align 1
+; CHECK-NEXT:    [[TEST_ADDR:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[IV]]
+; CHECK-NEXT:    [[EARLYCND:%.*]] = load i1, ptr [[TEST_ADDR]], align 1
 ; CHECK-NEXT:    br i1 [[EARLYCND]], label [[PRED:%.*]], label [[LATCH]]
 ; CHECK:       pred:
-; CHECK-NEXT:    [[ADDR:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[IV]]
-; CHECK-NEXT:    [[VAL:%.*]] = load i32, i32* [[ADDR]], align 4
+; CHECK-NEXT:    [[ADDR:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[IV]]
+; CHECK-NEXT:    [[VAL:%.*]] = load i32, ptr [[ADDR]], align 4
 ; CHECK-NEXT:    br label [[LATCH]]
 ; CHECK:       latch:
 ; CHECK-NEXT:    [[VAL_PHI:%.*]] = phi i32 [ 0, [[LOOP]] ], [ [[VAL]], [[PRED]] ]
@@ -2166,13 +2075,12 @@ define i32 @test_constant_max(i64 %len, i1* %test_base) {
 ; CHECK-NEXT:    [[EXIT:%.*]] = icmp ugt i64 [[IV]], [[MIN]]
 ; CHECK-NEXT:    br i1 [[EXIT]], label [[LOOP_EXIT]], label [[LOOP]], !llvm.loop [[LOOP23:![0-9]+]]
 ; CHECK:       loop_exit:
-; CHECK-NEXT:    [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP86]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP82]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    ret i32 [[ACCUM_NEXT_LCSSA]]
 ;
 entry:
   %alloca = alloca [4096 x i32]
-  %base = bitcast [4096 x i32]* %alloca to i32*
-  call void @init(i32* %base)
+  call void @init(ptr %alloca)
   %cmp = icmp ugt i64 %len, 4094
   %min = select i1 %cmp, i64 4094, i64 %len
   br label %loop
@@ -2180,12 +2088,12 @@ loop:
   %iv = phi i64 [ 0, %entry ], [ %iv.next, %latch ]
   %accum = phi i32 [ 0, %entry ], [ %accum.next, %latch ]
   %iv.next = add i64 %iv, 1
-  %test_addr = getelementptr inbounds i1, i1* %test_base, i64 %iv
-  %earlycnd = load i1, i1* %test_addr
+  %test_addr = getelementptr inbounds i1, ptr %test_base, i64 %iv
+  %earlycnd = load i1, ptr %test_addr
   br i1 %earlycnd, label %pred, label %latch
 pred:
-  %addr = getelementptr inbounds i32, i32* %base, i64 %iv
-  %val = load i32, i32* %addr
+  %addr = getelementptr inbounds i32, ptr %alloca, i64 %iv
+  %val = load i32, ptr %addr
   br label %latch
 latch:
   %val.phi = phi i32 [0, %loop], [%val, %pred]
@@ -2199,24 +2107,23 @@ loop_exit:
 
 
 ;; Model a custom allocator which allocates in chunks of 8 bytes
-declare align 8 dereferenceable_or_null(8) i8* @my_alloc(i32) allocsize(0)
-declare align 8 dereferenceable_or_null(8) i8* @my_array_alloc(i32, i32) allocsize(0, 1)
+declare align 8 dereferenceable_or_null(8) ptr @my_alloc(i32) allocsize(0)
+declare align 8 dereferenceable_or_null(8) ptr @my_array_alloc(i32, i32) allocsize(0, 1)
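
The IR-level allocsize attribute corresponds to Clang/GCC's alloc_size,
which uses 1-based parameter indices where the IR attribute is 0-based. A
minimal C sketch (the malloc/calloc bodies are placeholders, not part of the
commit):

    #include <stdint.h>
    #include <stdlib.h>

    /* alloc_size(1): argument 1 is the byte size of the returned object,
       matching allocsize(0) on @my_alloc. */
    __attribute__((alloc_size(1)))
    void *my_alloc(int32_t size) { return malloc((size_t)size); }

    /* alloc_size(1, 2): element count times element size, matching
       allocsize(0, 1) on @my_array_alloc. */
    __attribute__((alloc_size(1, 2)))
    void *my_array_alloc(int32_t count, int32_t size) {
      return calloc((size_t)count, (size_t)size);
    }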
 
-define i32 @test_allocsize(i64 %len, i1* %test_base) nofree nosync {
+define i32 @test_allocsize(i64 %len, ptr %test_base) nofree nosync {
 ; CHECK-LABEL: @test_allocsize(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[ALLOCATION:%.*]] = call nonnull i8* @my_alloc(i32 16384)
-; CHECK-NEXT:    [[BASE:%.*]] = bitcast i8* [[ALLOCATION]] to i32*
-; CHECK-NEXT:    call void @init(i32* [[BASE]])
+; CHECK-NEXT:    [[ALLOCATION:%.*]] = call nonnull ptr @my_alloc(i32 16384)
+; CHECK-NEXT:    call void @init(ptr [[ALLOCATION]])
 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP80:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP81:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP82:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP83:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP76:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP77:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP78:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP79:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 2
@@ -2233,106 +2140,102 @@ define i32 @test_allocsize(i64 %len, i1* %test_base) nofree nosync {
 ; CHECK-NEXT:    [[TMP13:%.*]] = add i64 [[INDEX]], 13
 ; CHECK-NEXT:    [[TMP14:%.*]] = add i64 [[INDEX]], 14
 ; CHECK-NEXT:    [[TMP15:%.*]] = add i64 [[INDEX]], 15
-; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP2]]
-; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP3]]
-; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP5]]
-; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP6]]
-; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP7]]
-; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP8]]
-; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP9]]
-; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP10]]
-; CHECK-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP11]]
-; CHECK-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP12]]
-; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP13]]
-; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP14]]
-; CHECK-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP15]]
-; CHECK-NEXT:    [[TMP32:%.*]] = load i1, i1* [[TMP16]], align 1
-; CHECK-NEXT:    [[TMP33:%.*]] = load i1, i1* [[TMP17]], align 1
-; CHECK-NEXT:    [[TMP34:%.*]] = load i1, i1* [[TMP18]], align 1
-; CHECK-NEXT:    [[TMP35:%.*]] = load i1, i1* [[TMP19]], align 1
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP2]]
+; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP5]]
+; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP6]]
+; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP7]]
+; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP8]]
+; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP9]]
+; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP10]]
+; CHECK-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP11]]
+; CHECK-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP12]]
+; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP13]]
+; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP14]]
+; CHECK-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP15]]
+; CHECK-NEXT:    [[TMP32:%.*]] = load i1, ptr [[TMP16]], align 1
+; CHECK-NEXT:    [[TMP33:%.*]] = load i1, ptr [[TMP17]], align 1
+; CHECK-NEXT:    [[TMP34:%.*]] = load i1, ptr [[TMP18]], align 1
+; CHECK-NEXT:    [[TMP35:%.*]] = load i1, ptr [[TMP19]], align 1
 ; CHECK-NEXT:    [[TMP36:%.*]] = insertelement <4 x i1> poison, i1 [[TMP32]], i32 0
 ; CHECK-NEXT:    [[TMP37:%.*]] = insertelement <4 x i1> [[TMP36]], i1 [[TMP33]], i32 1
 ; CHECK-NEXT:    [[TMP38:%.*]] = insertelement <4 x i1> [[TMP37]], i1 [[TMP34]], i32 2
 ; CHECK-NEXT:    [[TMP39:%.*]] = insertelement <4 x i1> [[TMP38]], i1 [[TMP35]], i32 3
-; CHECK-NEXT:    [[TMP40:%.*]] = load i1, i1* [[TMP20]], align 1
-; CHECK-NEXT:    [[TMP41:%.*]] = load i1, i1* [[TMP21]], align 1
-; CHECK-NEXT:    [[TMP42:%.*]] = load i1, i1* [[TMP22]], align 1
-; CHECK-NEXT:    [[TMP43:%.*]] = load i1, i1* [[TMP23]], align 1
+; CHECK-NEXT:    [[TMP40:%.*]] = load i1, ptr [[TMP20]], align 1
+; CHECK-NEXT:    [[TMP41:%.*]] = load i1, ptr [[TMP21]], align 1
+; CHECK-NEXT:    [[TMP42:%.*]] = load i1, ptr [[TMP22]], align 1
+; CHECK-NEXT:    [[TMP43:%.*]] = load i1, ptr [[TMP23]], align 1
 ; CHECK-NEXT:    [[TMP44:%.*]] = insertelement <4 x i1> poison, i1 [[TMP40]], i32 0
 ; CHECK-NEXT:    [[TMP45:%.*]] = insertelement <4 x i1> [[TMP44]], i1 [[TMP41]], i32 1
 ; CHECK-NEXT:    [[TMP46:%.*]] = insertelement <4 x i1> [[TMP45]], i1 [[TMP42]], i32 2
 ; CHECK-NEXT:    [[TMP47:%.*]] = insertelement <4 x i1> [[TMP46]], i1 [[TMP43]], i32 3
-; CHECK-NEXT:    [[TMP48:%.*]] = load i1, i1* [[TMP24]], align 1
-; CHECK-NEXT:    [[TMP49:%.*]] = load i1, i1* [[TMP25]], align 1
-; CHECK-NEXT:    [[TMP50:%.*]] = load i1, i1* [[TMP26]], align 1
-; CHECK-NEXT:    [[TMP51:%.*]] = load i1, i1* [[TMP27]], align 1
+; CHECK-NEXT:    [[TMP48:%.*]] = load i1, ptr [[TMP24]], align 1
+; CHECK-NEXT:    [[TMP49:%.*]] = load i1, ptr [[TMP25]], align 1
+; CHECK-NEXT:    [[TMP50:%.*]] = load i1, ptr [[TMP26]], align 1
+; CHECK-NEXT:    [[TMP51:%.*]] = load i1, ptr [[TMP27]], align 1
 ; CHECK-NEXT:    [[TMP52:%.*]] = insertelement <4 x i1> poison, i1 [[TMP48]], i32 0
 ; CHECK-NEXT:    [[TMP53:%.*]] = insertelement <4 x i1> [[TMP52]], i1 [[TMP49]], i32 1
 ; CHECK-NEXT:    [[TMP54:%.*]] = insertelement <4 x i1> [[TMP53]], i1 [[TMP50]], i32 2
 ; CHECK-NEXT:    [[TMP55:%.*]] = insertelement <4 x i1> [[TMP54]], i1 [[TMP51]], i32 3
-; CHECK-NEXT:    [[TMP56:%.*]] = load i1, i1* [[TMP28]], align 1
-; CHECK-NEXT:    [[TMP57:%.*]] = load i1, i1* [[TMP29]], align 1
-; CHECK-NEXT:    [[TMP58:%.*]] = load i1, i1* [[TMP30]], align 1
-; CHECK-NEXT:    [[TMP59:%.*]] = load i1, i1* [[TMP31]], align 1
+; CHECK-NEXT:    [[TMP56:%.*]] = load i1, ptr [[TMP28]], align 1
+; CHECK-NEXT:    [[TMP57:%.*]] = load i1, ptr [[TMP29]], align 1
+; CHECK-NEXT:    [[TMP58:%.*]] = load i1, ptr [[TMP30]], align 1
+; CHECK-NEXT:    [[TMP59:%.*]] = load i1, ptr [[TMP31]], align 1
 ; CHECK-NEXT:    [[TMP60:%.*]] = insertelement <4 x i1> poison, i1 [[TMP56]], i32 0
 ; CHECK-NEXT:    [[TMP61:%.*]] = insertelement <4 x i1> [[TMP60]], i1 [[TMP57]], i32 1
 ; CHECK-NEXT:    [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 2
 ; CHECK-NEXT:    [[TMP63:%.*]] = insertelement <4 x i1> [[TMP62]], i1 [[TMP59]], i32 3
-; CHECK-NEXT:    [[TMP64:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP65:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP66:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP8]]
-; CHECK-NEXT:    [[TMP67:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP12]]
-; CHECK-NEXT:    [[TMP68:%.*]] = getelementptr i32, i32* [[TMP64]], i32 0
-; CHECK-NEXT:    [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP69]], i32 4, <4 x i1> [[TMP39]], <4 x i32> poison)
-; CHECK-NEXT:    [[TMP70:%.*]] = getelementptr i32, i32* [[TMP64]], i32 4
-; CHECK-NEXT:    [[TMP71:%.*]] = bitcast i32* [[TMP70]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP71]], i32 4, <4 x i1> [[TMP47]], <4 x i32> poison)
-; CHECK-NEXT:    [[TMP72:%.*]] = getelementptr i32, i32* [[TMP64]], i32 8
-; CHECK-NEXT:    [[TMP73:%.*]] = bitcast i32* [[TMP72]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP73]], i32 4, <4 x i1> [[TMP55]], <4 x i32> poison)
-; CHECK-NEXT:    [[TMP74:%.*]] = getelementptr i32, i32* [[TMP64]], i32 12
-; CHECK-NEXT:    [[TMP75:%.*]] = bitcast i32* [[TMP74]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP75]], i32 4, <4 x i1> [[TMP63]], <4 x i32> poison)
-; CHECK-NEXT:    [[TMP76:%.*]] = xor <4 x i1> [[TMP39]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT:    [[TMP77:%.*]] = xor <4 x i1> [[TMP47]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT:    [[TMP78:%.*]] = xor <4 x i1> [[TMP55]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT:    [[TMP79:%.*]] = xor <4 x i1> [[TMP63]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT:    [[TMP64:%.*]] = getelementptr i32, ptr [[ALLOCATION]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP65:%.*]] = getelementptr i32, ptr [[ALLOCATION]], i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP66:%.*]] = getelementptr i32, ptr [[ALLOCATION]], i64 [[TMP8]]
+; CHECK-NEXT:    [[TMP67:%.*]] = getelementptr i32, ptr [[ALLOCATION]], i64 [[TMP12]]
+; CHECK-NEXT:    [[TMP68:%.*]] = getelementptr i32, ptr [[TMP64]], i32 0
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP68]], i32 4, <4 x i1> [[TMP39]], <4 x i32> poison)
+; CHECK-NEXT:    [[TMP69:%.*]] = getelementptr i32, ptr [[TMP64]], i32 4
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP69]], i32 4, <4 x i1> [[TMP47]], <4 x i32> poison)
+; CHECK-NEXT:    [[TMP70:%.*]] = getelementptr i32, ptr [[TMP64]], i32 8
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP70]], i32 4, <4 x i1> [[TMP55]], <4 x i32> poison)
+; CHECK-NEXT:    [[TMP71:%.*]] = getelementptr i32, ptr [[TMP64]], i32 12
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP71]], i32 4, <4 x i1> [[TMP63]], <4 x i32> poison)
+; CHECK-NEXT:    [[TMP72:%.*]] = xor <4 x i1> [[TMP39]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT:    [[TMP73:%.*]] = xor <4 x i1> [[TMP47]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT:    [[TMP74:%.*]] = xor <4 x i1> [[TMP55]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT:    [[TMP75:%.*]] = xor <4 x i1> [[TMP63]], <i1 true, i1 true, i1 true, i1 true>
 ; CHECK-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[TMP39]], <4 x i32> [[WIDE_MASKED_LOAD]], <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[PREDPHI7:%.*]] = select <4 x i1> [[TMP47]], <4 x i32> [[WIDE_MASKED_LOAD4]], <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[PREDPHI8:%.*]] = select <4 x i1> [[TMP55]], <4 x i32> [[WIDE_MASKED_LOAD5]], <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[PREDPHI9:%.*]] = select <4 x i1> [[TMP63]], <4 x i32> [[WIDE_MASKED_LOAD6]], <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP80]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]]
-; CHECK-NEXT:    [[TMP81]] = add <4 x i32> [[VEC_PHI1]], [[PREDPHI7]]
-; CHECK-NEXT:    [[TMP82]] = add <4 x i32> [[VEC_PHI2]], [[PREDPHI8]]
-; CHECK-NEXT:    [[TMP83]] = add <4 x i32> [[VEC_PHI3]], [[PREDPHI9]]
+; CHECK-NEXT:    [[TMP76]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]]
+; CHECK-NEXT:    [[TMP77]] = add <4 x i32> [[VEC_PHI1]], [[PREDPHI7]]
+; CHECK-NEXT:    [[TMP78]] = add <4 x i32> [[VEC_PHI2]], [[PREDPHI8]]
+; CHECK-NEXT:    [[TMP79]] = add <4 x i32> [[VEC_PHI3]], [[PREDPHI9]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; CHECK-NEXT:    [[TMP84:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
-; CHECK-NEXT:    br i1 [[TMP84]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
+; CHECK-NEXT:    [[TMP80:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
+; CHECK-NEXT:    br i1 [[TMP80]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[BIN_RDX:%.*]] = add <4 x i32> [[TMP81]], [[TMP80]]
-; CHECK-NEXT:    [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP82]], [[BIN_RDX]]
-; CHECK-NEXT:    [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP83]], [[BIN_RDX10]]
-; CHECK-NEXT:    [[TMP85:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]])
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = add <4 x i32> [[TMP77]], [[TMP76]]
+; CHECK-NEXT:    [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP78]], [[BIN_RDX]]
+; CHECK-NEXT:    [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP79]], [[BIN_RDX10]]
+; CHECK-NEXT:    [[TMP81:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]])
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 4096, 4096
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP85]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP81]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ]
 ; CHECK-NEXT:    [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[LATCH]] ]
 ; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT:    [[TEST_ADDR:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[IV]]
-; CHECK-NEXT:    [[EARLYCND:%.*]] = load i1, i1* [[TEST_ADDR]], align 1
+; CHECK-NEXT:    [[TEST_ADDR:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[IV]]
+; CHECK-NEXT:    [[EARLYCND:%.*]] = load i1, ptr [[TEST_ADDR]], align 1
 ; CHECK-NEXT:    br i1 [[EARLYCND]], label [[PRED:%.*]], label [[LATCH]]
 ; CHECK:       pred:
-; CHECK-NEXT:    [[ADDR:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[IV]]
-; CHECK-NEXT:    [[VAL:%.*]] = load i32, i32* [[ADDR]], align 4
+; CHECK-NEXT:    [[ADDR:%.*]] = getelementptr inbounds i32, ptr [[ALLOCATION]], i64 [[IV]]
+; CHECK-NEXT:    [[VAL:%.*]] = load i32, ptr [[ADDR]], align 4
 ; CHECK-NEXT:    br label [[LATCH]]
 ; CHECK:       latch:
 ; CHECK-NEXT:    [[VAL_PHI:%.*]] = phi i32 [ 0, [[LOOP]] ], [ [[VAL]], [[PRED]] ]
@@ -2340,24 +2243,23 @@ define i32 @test_allocsize(i64 %len, i1* %test_base) nofree nosync {
 ; CHECK-NEXT:    [[EXIT:%.*]] = icmp ugt i64 [[IV]], 4094
 ; CHECK-NEXT:    br i1 [[EXIT]], label [[LOOP_EXIT]], label [[LOOP]], !llvm.loop [[LOOP25:![0-9]+]]
 ; CHECK:       loop_exit:
-; CHECK-NEXT:    [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP85]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP81]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    ret i32 [[ACCUM_NEXT_LCSSA]]
 ;
 entry:
-  %allocation = call nonnull i8* @my_alloc(i32 16384)
-  %base = bitcast i8* %allocation to i32*
-  call void @init(i32* %base)
+  %allocation = call nonnull ptr @my_alloc(i32 16384)
+  call void @init(ptr %allocation)
   br label %loop
 loop:
   %iv = phi i64 [ 0, %entry ], [ %iv.next, %latch ]
   %accum = phi i32 [ 0, %entry ], [ %accum.next, %latch ]
   %iv.next = add i64 %iv, 1
-  %test_addr = getelementptr inbounds i1, i1* %test_base, i64 %iv
-  %earlycnd = load i1, i1* %test_addr
+  %test_addr = getelementptr inbounds i1, ptr %test_base, i64 %iv
+  %earlycnd = load i1, ptr %test_addr
   br i1 %earlycnd, label %pred, label %latch
 pred:
-  %addr = getelementptr inbounds i32, i32* %base, i64 %iv
-  %val = load i32, i32* %addr
+  %addr = getelementptr inbounds i32, ptr %allocation, i64 %iv
+  %val = load i32, ptr %addr
   br label %latch
 latch:
   %val.phi = phi i32 [0, %loop], [%val, %pred]
@@ -2370,21 +2272,20 @@ loop_exit:
 }
 
 
-define i32 @test_allocsize_array(i64 %len, i1* %test_base) nofree nosync {
+define i32 @test_allocsize_array(i64 %len, ptr %test_base) nofree nosync {
 ; CHECK-LABEL: @test_allocsize_array(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[ALLOCATION:%.*]] = call nonnull i8* @my_array_alloc(i32 4096, i32 4)
-; CHECK-NEXT:    [[BASE:%.*]] = bitcast i8* [[ALLOCATION]] to i32*
-; CHECK-NEXT:    call void @init(i32* [[BASE]])
+; CHECK-NEXT:    [[ALLOCATION:%.*]] = call nonnull ptr @my_array_alloc(i32 4096, i32 4)
+; CHECK-NEXT:    call void @init(ptr [[ALLOCATION]])
 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP80:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP81:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP82:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP83:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP76:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP77:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP78:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP79:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 2
@@ -2401,106 +2302,102 @@ define i32 @test_allocsize_array(i64 %len, i1* %test_base) nofree nosync {
 ; CHECK-NEXT:    [[TMP13:%.*]] = add i64 [[INDEX]], 13
 ; CHECK-NEXT:    [[TMP14:%.*]] = add i64 [[INDEX]], 14
 ; CHECK-NEXT:    [[TMP15:%.*]] = add i64 [[INDEX]], 15
-; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP2]]
-; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP3]]
-; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP5]]
-; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP6]]
-; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP7]]
-; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP8]]
-; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP9]]
-; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP10]]
-; CHECK-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP11]]
-; CHECK-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP12]]
-; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP13]]
-; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP14]]
-; CHECK-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP15]]
-; CHECK-NEXT:    [[TMP32:%.*]] = load i1, i1* [[TMP16]], align 1
-; CHECK-NEXT:    [[TMP33:%.*]] = load i1, i1* [[TMP17]], align 1
-; CHECK-NEXT:    [[TMP34:%.*]] = load i1, i1* [[TMP18]], align 1
-; CHECK-NEXT:    [[TMP35:%.*]] = load i1, i1* [[TMP19]], align 1
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP2]]
+; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP5]]
+; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP6]]
+; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP7]]
+; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP8]]
+; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP9]]
+; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP10]]
+; CHECK-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP11]]
+; CHECK-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP12]]
+; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP13]]
+; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP14]]
+; CHECK-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP15]]
+; CHECK-NEXT:    [[TMP32:%.*]] = load i1, ptr [[TMP16]], align 1
+; CHECK-NEXT:    [[TMP33:%.*]] = load i1, ptr [[TMP17]], align 1
+; CHECK-NEXT:    [[TMP34:%.*]] = load i1, ptr [[TMP18]], align 1
+; CHECK-NEXT:    [[TMP35:%.*]] = load i1, ptr [[TMP19]], align 1
 ; CHECK-NEXT:    [[TMP36:%.*]] = insertelement <4 x i1> poison, i1 [[TMP32]], i32 0
 ; CHECK-NEXT:    [[TMP37:%.*]] = insertelement <4 x i1> [[TMP36]], i1 [[TMP33]], i32 1
 ; CHECK-NEXT:    [[TMP38:%.*]] = insertelement <4 x i1> [[TMP37]], i1 [[TMP34]], i32 2
 ; CHECK-NEXT:    [[TMP39:%.*]] = insertelement <4 x i1> [[TMP38]], i1 [[TMP35]], i32 3
-; CHECK-NEXT:    [[TMP40:%.*]] = load i1, i1* [[TMP20]], align 1
-; CHECK-NEXT:    [[TMP41:%.*]] = load i1, i1* [[TMP21]], align 1
-; CHECK-NEXT:    [[TMP42:%.*]] = load i1, i1* [[TMP22]], align 1
-; CHECK-NEXT:    [[TMP43:%.*]] = load i1, i1* [[TMP23]], align 1
+; CHECK-NEXT:    [[TMP40:%.*]] = load i1, ptr [[TMP20]], align 1
+; CHECK-NEXT:    [[TMP41:%.*]] = load i1, ptr [[TMP21]], align 1
+; CHECK-NEXT:    [[TMP42:%.*]] = load i1, ptr [[TMP22]], align 1
+; CHECK-NEXT:    [[TMP43:%.*]] = load i1, ptr [[TMP23]], align 1
 ; CHECK-NEXT:    [[TMP44:%.*]] = insertelement <4 x i1> poison, i1 [[TMP40]], i32 0
 ; CHECK-NEXT:    [[TMP45:%.*]] = insertelement <4 x i1> [[TMP44]], i1 [[TMP41]], i32 1
 ; CHECK-NEXT:    [[TMP46:%.*]] = insertelement <4 x i1> [[TMP45]], i1 [[TMP42]], i32 2
 ; CHECK-NEXT:    [[TMP47:%.*]] = insertelement <4 x i1> [[TMP46]], i1 [[TMP43]], i32 3
-; CHECK-NEXT:    [[TMP48:%.*]] = load i1, i1* [[TMP24]], align 1
-; CHECK-NEXT:    [[TMP49:%.*]] = load i1, i1* [[TMP25]], align 1
-; CHECK-NEXT:    [[TMP50:%.*]] = load i1, i1* [[TMP26]], align 1
-; CHECK-NEXT:    [[TMP51:%.*]] = load i1, i1* [[TMP27]], align 1
+; CHECK-NEXT:    [[TMP48:%.*]] = load i1, ptr [[TMP24]], align 1
+; CHECK-NEXT:    [[TMP49:%.*]] = load i1, ptr [[TMP25]], align 1
+; CHECK-NEXT:    [[TMP50:%.*]] = load i1, ptr [[TMP26]], align 1
+; CHECK-NEXT:    [[TMP51:%.*]] = load i1, ptr [[TMP27]], align 1
 ; CHECK-NEXT:    [[TMP52:%.*]] = insertelement <4 x i1> poison, i1 [[TMP48]], i32 0
 ; CHECK-NEXT:    [[TMP53:%.*]] = insertelement <4 x i1> [[TMP52]], i1 [[TMP49]], i32 1
 ; CHECK-NEXT:    [[TMP54:%.*]] = insertelement <4 x i1> [[TMP53]], i1 [[TMP50]], i32 2
 ; CHECK-NEXT:    [[TMP55:%.*]] = insertelement <4 x i1> [[TMP54]], i1 [[TMP51]], i32 3
-; CHECK-NEXT:    [[TMP56:%.*]] = load i1, i1* [[TMP28]], align 1
-; CHECK-NEXT:    [[TMP57:%.*]] = load i1, i1* [[TMP29]], align 1
-; CHECK-NEXT:    [[TMP58:%.*]] = load i1, i1* [[TMP30]], align 1
-; CHECK-NEXT:    [[TMP59:%.*]] = load i1, i1* [[TMP31]], align 1
+; CHECK-NEXT:    [[TMP56:%.*]] = load i1, ptr [[TMP28]], align 1
+; CHECK-NEXT:    [[TMP57:%.*]] = load i1, ptr [[TMP29]], align 1
+; CHECK-NEXT:    [[TMP58:%.*]] = load i1, ptr [[TMP30]], align 1
+; CHECK-NEXT:    [[TMP59:%.*]] = load i1, ptr [[TMP31]], align 1
 ; CHECK-NEXT:    [[TMP60:%.*]] = insertelement <4 x i1> poison, i1 [[TMP56]], i32 0
 ; CHECK-NEXT:    [[TMP61:%.*]] = insertelement <4 x i1> [[TMP60]], i1 [[TMP57]], i32 1
 ; CHECK-NEXT:    [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 2
 ; CHECK-NEXT:    [[TMP63:%.*]] = insertelement <4 x i1> [[TMP62]], i1 [[TMP59]], i32 3
-; CHECK-NEXT:    [[TMP64:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP65:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP66:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP8]]
-; CHECK-NEXT:    [[TMP67:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP12]]
-; CHECK-NEXT:    [[TMP68:%.*]] = getelementptr i32, i32* [[TMP64]], i32 0
-; CHECK-NEXT:    [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP69]], i32 4, <4 x i1> [[TMP39]], <4 x i32> poison)
-; CHECK-NEXT:    [[TMP70:%.*]] = getelementptr i32, i32* [[TMP64]], i32 4
-; CHECK-NEXT:    [[TMP71:%.*]] = bitcast i32* [[TMP70]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP71]], i32 4, <4 x i1> [[TMP47]], <4 x i32> poison)
-; CHECK-NEXT:    [[TMP72:%.*]] = getelementptr i32, i32* [[TMP64]], i32 8
-; CHECK-NEXT:    [[TMP73:%.*]] = bitcast i32* [[TMP72]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP73]], i32 4, <4 x i1> [[TMP55]], <4 x i32> poison)
-; CHECK-NEXT:    [[TMP74:%.*]] = getelementptr i32, i32* [[TMP64]], i32 12
-; CHECK-NEXT:    [[TMP75:%.*]] = bitcast i32* [[TMP74]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP75]], i32 4, <4 x i1> [[TMP63]], <4 x i32> poison)
-; CHECK-NEXT:    [[TMP76:%.*]] = xor <4 x i1> [[TMP39]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT:    [[TMP77:%.*]] = xor <4 x i1> [[TMP47]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT:    [[TMP78:%.*]] = xor <4 x i1> [[TMP55]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT:    [[TMP79:%.*]] = xor <4 x i1> [[TMP63]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT:    [[TMP64:%.*]] = getelementptr i32, ptr [[ALLOCATION]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP65:%.*]] = getelementptr i32, ptr [[ALLOCATION]], i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP66:%.*]] = getelementptr i32, ptr [[ALLOCATION]], i64 [[TMP8]]
+; CHECK-NEXT:    [[TMP67:%.*]] = getelementptr i32, ptr [[ALLOCATION]], i64 [[TMP12]]
+; CHECK-NEXT:    [[TMP68:%.*]] = getelementptr i32, ptr [[TMP64]], i32 0
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP68]], i32 4, <4 x i1> [[TMP39]], <4 x i32> poison)
+; CHECK-NEXT:    [[TMP69:%.*]] = getelementptr i32, ptr [[TMP64]], i32 4
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP69]], i32 4, <4 x i1> [[TMP47]], <4 x i32> poison)
+; CHECK-NEXT:    [[TMP70:%.*]] = getelementptr i32, ptr [[TMP64]], i32 8
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP70]], i32 4, <4 x i1> [[TMP55]], <4 x i32> poison)
+; CHECK-NEXT:    [[TMP71:%.*]] = getelementptr i32, ptr [[TMP64]], i32 12
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP71]], i32 4, <4 x i1> [[TMP63]], <4 x i32> poison)
+; CHECK-NEXT:    [[TMP72:%.*]] = xor <4 x i1> [[TMP39]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT:    [[TMP73:%.*]] = xor <4 x i1> [[TMP47]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT:    [[TMP74:%.*]] = xor <4 x i1> [[TMP55]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT:    [[TMP75:%.*]] = xor <4 x i1> [[TMP63]], <i1 true, i1 true, i1 true, i1 true>
 ; CHECK-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[TMP39]], <4 x i32> [[WIDE_MASKED_LOAD]], <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[PREDPHI7:%.*]] = select <4 x i1> [[TMP47]], <4 x i32> [[WIDE_MASKED_LOAD4]], <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[PREDPHI8:%.*]] = select <4 x i1> [[TMP55]], <4 x i32> [[WIDE_MASKED_LOAD5]], <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[PREDPHI9:%.*]] = select <4 x i1> [[TMP63]], <4 x i32> [[WIDE_MASKED_LOAD6]], <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP80]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]]
-; CHECK-NEXT:    [[TMP81]] = add <4 x i32> [[VEC_PHI1]], [[PREDPHI7]]
-; CHECK-NEXT:    [[TMP82]] = add <4 x i32> [[VEC_PHI2]], [[PREDPHI8]]
-; CHECK-NEXT:    [[TMP83]] = add <4 x i32> [[VEC_PHI3]], [[PREDPHI9]]
+; CHECK-NEXT:    [[TMP76]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]]
+; CHECK-NEXT:    [[TMP77]] = add <4 x i32> [[VEC_PHI1]], [[PREDPHI7]]
+; CHECK-NEXT:    [[TMP78]] = add <4 x i32> [[VEC_PHI2]], [[PREDPHI8]]
+; CHECK-NEXT:    [[TMP79]] = add <4 x i32> [[VEC_PHI3]], [[PREDPHI9]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; CHECK-NEXT:    [[TMP84:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
-; CHECK-NEXT:    br i1 [[TMP84]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
+; CHECK-NEXT:    [[TMP80:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
+; CHECK-NEXT:    br i1 [[TMP80]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[BIN_RDX:%.*]] = add <4 x i32> [[TMP81]], [[TMP80]]
-; CHECK-NEXT:    [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP82]], [[BIN_RDX]]
-; CHECK-NEXT:    [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP83]], [[BIN_RDX10]]
-; CHECK-NEXT:    [[TMP85:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]])
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = add <4 x i32> [[TMP77]], [[TMP76]]
+; CHECK-NEXT:    [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP78]], [[BIN_RDX]]
+; CHECK-NEXT:    [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP79]], [[BIN_RDX10]]
+; CHECK-NEXT:    [[TMP81:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]])
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 4096, 4096
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP85]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP81]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ]
 ; CHECK-NEXT:    [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[LATCH]] ]
 ; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT:    [[TEST_ADDR:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[IV]]
-; CHECK-NEXT:    [[EARLYCND:%.*]] = load i1, i1* [[TEST_ADDR]], align 1
+; CHECK-NEXT:    [[TEST_ADDR:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[IV]]
+; CHECK-NEXT:    [[EARLYCND:%.*]] = load i1, ptr [[TEST_ADDR]], align 1
 ; CHECK-NEXT:    br i1 [[EARLYCND]], label [[PRED:%.*]], label [[LATCH]]
 ; CHECK:       pred:
-; CHECK-NEXT:    [[ADDR:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[IV]]
-; CHECK-NEXT:    [[VAL:%.*]] = load i32, i32* [[ADDR]], align 4
+; CHECK-NEXT:    [[ADDR:%.*]] = getelementptr inbounds i32, ptr [[ALLOCATION]], i64 [[IV]]
+; CHECK-NEXT:    [[VAL:%.*]] = load i32, ptr [[ADDR]], align 4
 ; CHECK-NEXT:    br label [[LATCH]]
 ; CHECK:       latch:
 ; CHECK-NEXT:    [[VAL_PHI:%.*]] = phi i32 [ 0, [[LOOP]] ], [ [[VAL]], [[PRED]] ]
@@ -2508,24 +2405,23 @@ define i32 @test_allocsize_array(i64 %len, i1* %test_base) nofree nosync {
 ; CHECK-NEXT:    [[EXIT:%.*]] = icmp ugt i64 [[IV]], 4094
 ; CHECK-NEXT:    br i1 [[EXIT]], label [[LOOP_EXIT]], label [[LOOP]], !llvm.loop [[LOOP27:![0-9]+]]
 ; CHECK:       loop_exit:
-; CHECK-NEXT:    [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP85]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP81]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    ret i32 [[ACCUM_NEXT_LCSSA]]
 ;
 entry:
-  %allocation = call nonnull i8* @my_array_alloc(i32 4096, i32 4)
-  %base = bitcast i8* %allocation to i32*
-  call void @init(i32* %base)
+  %allocation = call nonnull ptr @my_array_alloc(i32 4096, i32 4)
+  call void @init(ptr %allocation)
   br label %loop
 loop:
   %iv = phi i64 [ 0, %entry ], [ %iv.next, %latch ]
   %accum = phi i32 [ 0, %entry ], [ %accum.next, %latch ]
   %iv.next = add i64 %iv, 1
-  %test_addr = getelementptr inbounds i1, i1* %test_base, i64 %iv
-  %earlycnd = load i1, i1* %test_addr
+  %test_addr = getelementptr inbounds i1, ptr %test_base, i64 %iv
+  %earlycnd = load i1, ptr %test_addr
   br i1 %earlycnd, label %pred, label %latch
 pred:
-  %addr = getelementptr inbounds i32, i32* %base, i64 %iv
-  %val = load i32, i32* %addr
+  %addr = getelementptr inbounds i32, ptr %allocation, i64 %iv
+  %val = load i32, ptr %addr
   br label %latch
 latch:
   %val.phi = phi i32 [0, %loop], [%val, %pred]
@@ -2537,21 +2433,20 @@ loop_exit:
   ret i32 %accum.next
 }
 
-declare void @my_free(i8*)
+declare void @my_free(ptr)
 
 ; For the point-in-time variant of deref(N) semantics, show a negative
 ; example where hoisting without explicit predication might introduce a
 ; dynamic use-after-free (e.g. %allzero is true when all elements of the
 ; test vector are false, so the allocation is never accessed).
-define i32 @test_allocsize_cond_deref(i1 %allzero, i1* %test_base) {
+define i32 @test_allocsize_cond_deref(i1 %allzero, ptr %test_base) {
 ; CHECK-LABEL: @test_allocsize_cond_deref(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[ALLOCATION:%.*]] = call nonnull i8* @my_alloc(i32 16384)
-; CHECK-NEXT:    [[BASE:%.*]] = bitcast i8* [[ALLOCATION]] to i32*
-; CHECK-NEXT:    call void @init(i32* [[BASE]])
+; CHECK-NEXT:    [[ALLOCATION:%.*]] = call nonnull ptr @my_alloc(i32 16384)
+; CHECK-NEXT:    call void @init(ptr [[ALLOCATION]])
 ; CHECK-NEXT:    br i1 [[ALLZERO:%.*]], label [[FREEIT:%.*]], label [[PREHEADER:%.*]]
 ; CHECK:       freeit:
-; CHECK-NEXT:    call void @my_free(i8* [[ALLOCATION]])
+; CHECK-NEXT:    call void @my_free(ptr [[ALLOCATION]])
 ; CHECK-NEXT:    br label [[PREHEADER]]
 ; CHECK:       preheader:
 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
@@ -2559,10 +2454,10 @@ define i32 @test_allocsize_cond_deref(i1 %allzero, i1* %test_base) {
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP80:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP81:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP82:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP83:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP76:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP77:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP78:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP79:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 2
@@ -2579,106 +2474,102 @@ define i32 @test_allocsize_cond_deref(i1 %allzero, i1* %test_base) {
 ; CHECK-NEXT:    [[TMP13:%.*]] = add i64 [[INDEX]], 13
 ; CHECK-NEXT:    [[TMP14:%.*]] = add i64 [[INDEX]], 14
 ; CHECK-NEXT:    [[TMP15:%.*]] = add i64 [[INDEX]], 15
-; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP2]]
-; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP3]]
-; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP5]]
-; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP6]]
-; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP7]]
-; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP8]]
-; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP9]]
-; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP10]]
-; CHECK-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP11]]
-; CHECK-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP12]]
-; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP13]]
-; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP14]]
-; CHECK-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP15]]
-; CHECK-NEXT:    [[TMP32:%.*]] = load i1, i1* [[TMP16]], align 1
-; CHECK-NEXT:    [[TMP33:%.*]] = load i1, i1* [[TMP17]], align 1
-; CHECK-NEXT:    [[TMP34:%.*]] = load i1, i1* [[TMP18]], align 1
-; CHECK-NEXT:    [[TMP35:%.*]] = load i1, i1* [[TMP19]], align 1
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP2]]
+; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP5]]
+; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP6]]
+; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP7]]
+; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP8]]
+; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP9]]
+; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP10]]
+; CHECK-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP11]]
+; CHECK-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP12]]
+; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP13]]
+; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP14]]
+; CHECK-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[TMP15]]
+; CHECK-NEXT:    [[TMP32:%.*]] = load i1, ptr [[TMP16]], align 1
+; CHECK-NEXT:    [[TMP33:%.*]] = load i1, ptr [[TMP17]], align 1
+; CHECK-NEXT:    [[TMP34:%.*]] = load i1, ptr [[TMP18]], align 1
+; CHECK-NEXT:    [[TMP35:%.*]] = load i1, ptr [[TMP19]], align 1
 ; CHECK-NEXT:    [[TMP36:%.*]] = insertelement <4 x i1> poison, i1 [[TMP32]], i32 0
 ; CHECK-NEXT:    [[TMP37:%.*]] = insertelement <4 x i1> [[TMP36]], i1 [[TMP33]], i32 1
 ; CHECK-NEXT:    [[TMP38:%.*]] = insertelement <4 x i1> [[TMP37]], i1 [[TMP34]], i32 2
 ; CHECK-NEXT:    [[TMP39:%.*]] = insertelement <4 x i1> [[TMP38]], i1 [[TMP35]], i32 3
-; CHECK-NEXT:    [[TMP40:%.*]] = load i1, i1* [[TMP20]], align 1
-; CHECK-NEXT:    [[TMP41:%.*]] = load i1, i1* [[TMP21]], align 1
-; CHECK-NEXT:    [[TMP42:%.*]] = load i1, i1* [[TMP22]], align 1
-; CHECK-NEXT:    [[TMP43:%.*]] = load i1, i1* [[TMP23]], align 1
+; CHECK-NEXT:    [[TMP40:%.*]] = load i1, ptr [[TMP20]], align 1
+; CHECK-NEXT:    [[TMP41:%.*]] = load i1, ptr [[TMP21]], align 1
+; CHECK-NEXT:    [[TMP42:%.*]] = load i1, ptr [[TMP22]], align 1
+; CHECK-NEXT:    [[TMP43:%.*]] = load i1, ptr [[TMP23]], align 1
 ; CHECK-NEXT:    [[TMP44:%.*]] = insertelement <4 x i1> poison, i1 [[TMP40]], i32 0
 ; CHECK-NEXT:    [[TMP45:%.*]] = insertelement <4 x i1> [[TMP44]], i1 [[TMP41]], i32 1
 ; CHECK-NEXT:    [[TMP46:%.*]] = insertelement <4 x i1> [[TMP45]], i1 [[TMP42]], i32 2
 ; CHECK-NEXT:    [[TMP47:%.*]] = insertelement <4 x i1> [[TMP46]], i1 [[TMP43]], i32 3
-; CHECK-NEXT:    [[TMP48:%.*]] = load i1, i1* [[TMP24]], align 1
-; CHECK-NEXT:    [[TMP49:%.*]] = load i1, i1* [[TMP25]], align 1
-; CHECK-NEXT:    [[TMP50:%.*]] = load i1, i1* [[TMP26]], align 1
-; CHECK-NEXT:    [[TMP51:%.*]] = load i1, i1* [[TMP27]], align 1
+; CHECK-NEXT:    [[TMP48:%.*]] = load i1, ptr [[TMP24]], align 1
+; CHECK-NEXT:    [[TMP49:%.*]] = load i1, ptr [[TMP25]], align 1
+; CHECK-NEXT:    [[TMP50:%.*]] = load i1, ptr [[TMP26]], align 1
+; CHECK-NEXT:    [[TMP51:%.*]] = load i1, ptr [[TMP27]], align 1
 ; CHECK-NEXT:    [[TMP52:%.*]] = insertelement <4 x i1> poison, i1 [[TMP48]], i32 0
 ; CHECK-NEXT:    [[TMP53:%.*]] = insertelement <4 x i1> [[TMP52]], i1 [[TMP49]], i32 1
 ; CHECK-NEXT:    [[TMP54:%.*]] = insertelement <4 x i1> [[TMP53]], i1 [[TMP50]], i32 2
 ; CHECK-NEXT:    [[TMP55:%.*]] = insertelement <4 x i1> [[TMP54]], i1 [[TMP51]], i32 3
-; CHECK-NEXT:    [[TMP56:%.*]] = load i1, i1* [[TMP28]], align 1
-; CHECK-NEXT:    [[TMP57:%.*]] = load i1, i1* [[TMP29]], align 1
-; CHECK-NEXT:    [[TMP58:%.*]] = load i1, i1* [[TMP30]], align 1
-; CHECK-NEXT:    [[TMP59:%.*]] = load i1, i1* [[TMP31]], align 1
+; CHECK-NEXT:    [[TMP56:%.*]] = load i1, ptr [[TMP28]], align 1
+; CHECK-NEXT:    [[TMP57:%.*]] = load i1, ptr [[TMP29]], align 1
+; CHECK-NEXT:    [[TMP58:%.*]] = load i1, ptr [[TMP30]], align 1
+; CHECK-NEXT:    [[TMP59:%.*]] = load i1, ptr [[TMP31]], align 1
 ; CHECK-NEXT:    [[TMP60:%.*]] = insertelement <4 x i1> poison, i1 [[TMP56]], i32 0
 ; CHECK-NEXT:    [[TMP61:%.*]] = insertelement <4 x i1> [[TMP60]], i1 [[TMP57]], i32 1
 ; CHECK-NEXT:    [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 2
 ; CHECK-NEXT:    [[TMP63:%.*]] = insertelement <4 x i1> [[TMP62]], i1 [[TMP59]], i32 3
-; CHECK-NEXT:    [[TMP64:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP65:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP66:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP8]]
-; CHECK-NEXT:    [[TMP67:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP12]]
-; CHECK-NEXT:    [[TMP68:%.*]] = getelementptr i32, i32* [[TMP64]], i32 0
-; CHECK-NEXT:    [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP69]], i32 4, <4 x i1> [[TMP39]], <4 x i32> poison)
-; CHECK-NEXT:    [[TMP70:%.*]] = getelementptr i32, i32* [[TMP64]], i32 4
-; CHECK-NEXT:    [[TMP71:%.*]] = bitcast i32* [[TMP70]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP71]], i32 4, <4 x i1> [[TMP47]], <4 x i32> poison)
-; CHECK-NEXT:    [[TMP72:%.*]] = getelementptr i32, i32* [[TMP64]], i32 8
-; CHECK-NEXT:    [[TMP73:%.*]] = bitcast i32* [[TMP72]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP73]], i32 4, <4 x i1> [[TMP55]], <4 x i32> poison)
-; CHECK-NEXT:    [[TMP74:%.*]] = getelementptr i32, i32* [[TMP64]], i32 12
-; CHECK-NEXT:    [[TMP75:%.*]] = bitcast i32* [[TMP74]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP75]], i32 4, <4 x i1> [[TMP63]], <4 x i32> poison)
-; CHECK-NEXT:    [[TMP76:%.*]] = xor <4 x i1> [[TMP39]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT:    [[TMP77:%.*]] = xor <4 x i1> [[TMP47]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT:    [[TMP78:%.*]] = xor <4 x i1> [[TMP55]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT:    [[TMP79:%.*]] = xor <4 x i1> [[TMP63]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT:    [[TMP64:%.*]] = getelementptr i32, ptr [[ALLOCATION]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP65:%.*]] = getelementptr i32, ptr [[ALLOCATION]], i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP66:%.*]] = getelementptr i32, ptr [[ALLOCATION]], i64 [[TMP8]]
+; CHECK-NEXT:    [[TMP67:%.*]] = getelementptr i32, ptr [[ALLOCATION]], i64 [[TMP12]]
+; CHECK-NEXT:    [[TMP68:%.*]] = getelementptr i32, ptr [[TMP64]], i32 0
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP68]], i32 4, <4 x i1> [[TMP39]], <4 x i32> poison)
+; CHECK-NEXT:    [[TMP69:%.*]] = getelementptr i32, ptr [[TMP64]], i32 4
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP69]], i32 4, <4 x i1> [[TMP47]], <4 x i32> poison)
+; CHECK-NEXT:    [[TMP70:%.*]] = getelementptr i32, ptr [[TMP64]], i32 8
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP70]], i32 4, <4 x i1> [[TMP55]], <4 x i32> poison)
+; CHECK-NEXT:    [[TMP71:%.*]] = getelementptr i32, ptr [[TMP64]], i32 12
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP71]], i32 4, <4 x i1> [[TMP63]], <4 x i32> poison)
+; CHECK-NEXT:    [[TMP72:%.*]] = xor <4 x i1> [[TMP39]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT:    [[TMP73:%.*]] = xor <4 x i1> [[TMP47]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT:    [[TMP74:%.*]] = xor <4 x i1> [[TMP55]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT:    [[TMP75:%.*]] = xor <4 x i1> [[TMP63]], <i1 true, i1 true, i1 true, i1 true>
 ; CHECK-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[TMP39]], <4 x i32> [[WIDE_MASKED_LOAD]], <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[PREDPHI7:%.*]] = select <4 x i1> [[TMP47]], <4 x i32> [[WIDE_MASKED_LOAD4]], <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[PREDPHI8:%.*]] = select <4 x i1> [[TMP55]], <4 x i32> [[WIDE_MASKED_LOAD5]], <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[PREDPHI9:%.*]] = select <4 x i1> [[TMP63]], <4 x i32> [[WIDE_MASKED_LOAD6]], <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP80]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]]
-; CHECK-NEXT:    [[TMP81]] = add <4 x i32> [[VEC_PHI1]], [[PREDPHI7]]
-; CHECK-NEXT:    [[TMP82]] = add <4 x i32> [[VEC_PHI2]], [[PREDPHI8]]
-; CHECK-NEXT:    [[TMP83]] = add <4 x i32> [[VEC_PHI3]], [[PREDPHI9]]
+; CHECK-NEXT:    [[TMP76]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]]
+; CHECK-NEXT:    [[TMP77]] = add <4 x i32> [[VEC_PHI1]], [[PREDPHI7]]
+; CHECK-NEXT:    [[TMP78]] = add <4 x i32> [[VEC_PHI2]], [[PREDPHI8]]
+; CHECK-NEXT:    [[TMP79]] = add <4 x i32> [[VEC_PHI3]], [[PREDPHI9]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; CHECK-NEXT:    [[TMP84:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
-; CHECK-NEXT:    br i1 [[TMP84]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]]
+; CHECK-NEXT:    [[TMP80:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
+; CHECK-NEXT:    br i1 [[TMP80]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[BIN_RDX:%.*]] = add <4 x i32> [[TMP81]], [[TMP80]]
-; CHECK-NEXT:    [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP82]], [[BIN_RDX]]
-; CHECK-NEXT:    [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP83]], [[BIN_RDX10]]
-; CHECK-NEXT:    [[TMP85:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]])
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = add <4 x i32> [[TMP77]], [[TMP76]]
+; CHECK-NEXT:    [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP78]], [[BIN_RDX]]
+; CHECK-NEXT:    [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP79]], [[BIN_RDX10]]
+; CHECK-NEXT:    [[TMP81:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]])
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 4096, 4096
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[PREHEADER]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[PREHEADER]] ], [ [[TMP85]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[PREHEADER]] ], [ [[TMP81]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ]
 ; CHECK-NEXT:    [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[LATCH]] ]
 ; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT:    [[TEST_ADDR:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[IV]]
-; CHECK-NEXT:    [[EARLYCND:%.*]] = load i1, i1* [[TEST_ADDR]], align 1
+; CHECK-NEXT:    [[TEST_ADDR:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[IV]]
+; CHECK-NEXT:    [[EARLYCND:%.*]] = load i1, ptr [[TEST_ADDR]], align 1
 ; CHECK-NEXT:    br i1 [[EARLYCND]], label [[PRED:%.*]], label [[LATCH]]
 ; CHECK:       pred:
-; CHECK-NEXT:    [[ADDR:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[IV]]
-; CHECK-NEXT:    [[VAL:%.*]] = load i32, i32* [[ADDR]], align 4
+; CHECK-NEXT:    [[ADDR:%.*]] = getelementptr inbounds i32, ptr [[ALLOCATION]], i64 [[IV]]
+; CHECK-NEXT:    [[VAL:%.*]] = load i32, ptr [[ADDR]], align 4
 ; CHECK-NEXT:    br label [[LATCH]]
 ; CHECK:       latch:
 ; CHECK-NEXT:    [[VAL_PHI:%.*]] = phi i32 [ 0, [[LOOP]] ], [ [[VAL]], [[PRED]] ]
@@ -2686,16 +2577,15 @@ define i32 @test_allocsize_cond_deref(i1 %allzero, i1* %test_base) {
 ; CHECK-NEXT:    [[EXIT:%.*]] = icmp ugt i64 [[IV]], 4094
 ; CHECK-NEXT:    br i1 [[EXIT]], label [[LOOP_EXIT]], label [[LOOP]], !llvm.loop [[LOOP29:![0-9]+]]
 ; CHECK:       loop_exit:
-; CHECK-NEXT:    [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP85]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP81]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    ret i32 [[ACCUM_NEXT_LCSSA]]
 ;
 entry:
-  %allocation = call nonnull i8* @my_alloc(i32 16384)
-  %base = bitcast i8* %allocation to i32*
-  call void @init(i32* %base)
+  %allocation = call nonnull ptr @my_alloc(i32 16384)
+  call void @init(ptr %allocation)
   br i1 %allzero, label %freeit, label %preheader
 freeit:
-  call void @my_free(i8* %allocation)
+  call void @my_free(ptr %allocation)
   br label %preheader
 preheader:
   br label %loop
@@ -2703,12 +2593,12 @@ loop:
   %iv = phi i64 [ 0, %preheader ], [ %iv.next, %latch ]
   %accum = phi i32 [ 0, %preheader ], [ %accum.next, %latch ]
   %iv.next = add i64 %iv, 1
-  %test_addr = getelementptr inbounds i1, i1* %test_base, i64 %iv
-  %earlycnd = load i1, i1* %test_addr
+  %test_addr = getelementptr inbounds i1, ptr %test_base, i64 %iv
+  %earlycnd = load i1, ptr %test_addr
   br i1 %earlycnd, label %pred, label %latch
 pred:
-  %addr = getelementptr inbounds i32, i32* %base, i64 %iv
-  %val = load i32, i32* %addr
+  %addr = getelementptr inbounds i32, ptr %allocation, i64 %iv
+  %val = load i32, ptr %addr
   br label %latch
 latch:
   %val.phi = phi i32 [0, %loop], [%val, %pred]

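The conversion applied above is mechanical, and the same pattern repeats in
the next file: typed pointers such as i32* and i8* become ptr, bitcasts
between pointer types become no-ops and are dropped (uses are rewritten to
the underlying value, e.g. %base folding away in favor of %allocation), and
overloaded memory intrinsics lose the pointee type from their name mangling
(llvm.masked.load.v4i32.p0v4i32 becomes llvm.masked.load.v4i32.p0). A
minimal before/after sketch, with hypothetical function names that are not
taken from the patch (the two fragments belong to separate modules, since
typed and opaque pointers cannot be mixed within one module):

  ; Before, with typed pointers: a bitcast reinterprets the raw buffer.
  define i32 @sketch_typed(i8* %alloc) {
    %base = bitcast i8* %alloc to i32*
    %val = load i32, i32* %base
    ret i32 %val
  }

  ; After, with opaque pointers: the bitcast is a no-op and disappears;
  ; the loaded type comes solely from the load instruction itself.
  define i32 @sketch_opaque(ptr %alloc) {
    %val = load i32, ptr %alloc
    ret i32 %val
  }
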
diff  --git a/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll b/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll
index 7308afab0137..c03b0295f31d 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll
@@ -17,12 +17,12 @@ target triple = "x86_64-pc_linux"
 ;  }
 ;}
 
-define void @foo1(i32* nocapture %A, i32* nocapture readonly %B, i32* nocapture readonly %trigger) local_unnamed_addr #0 {
+define void @foo1(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture readonly %trigger) local_unnamed_addr #0 {
 ; AVX1-LABEL: @foo1(
 ; AVX1-NEXT:  entry:
-; AVX1-NEXT:    [[B3:%.*]] = ptrtoint i32* [[B:%.*]] to i64
-; AVX1-NEXT:    [[TRIGGER2:%.*]] = ptrtoint i32* [[TRIGGER:%.*]] to i64
-; AVX1-NEXT:    [[A1:%.*]] = ptrtoint i32* [[A:%.*]] to i64
+; AVX1-NEXT:    [[B3:%.*]] = ptrtoint ptr [[B:%.*]] to i64
+; AVX1-NEXT:    [[TRIGGER2:%.*]] = ptrtoint ptr [[TRIGGER:%.*]] to i64
+; AVX1-NEXT:    [[A1:%.*]] = ptrtoint ptr [[A:%.*]] to i64
 ; AVX1-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
 ; AVX1:       vector.memcheck:
 ; AVX1-NEXT:    [[TMP0:%.*]] = sub i64 [[A1]], [[TRIGGER2]]
@@ -36,23 +36,20 @@ define void @foo1(i32* nocapture %A, i32* nocapture readonly %B, i32* nocapture
 ; AVX1:       vector.body:
 ; AVX1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; AVX1-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 0
-; AVX1-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP2]]
-; AVX1-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP3]], i32 0
-; AVX1-NEXT:    [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <8 x i32>*
-; AVX1-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP5]], align 4
-; AVX1-NEXT:    [[TMP6:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
-; AVX1-NEXT:    [[TMP7:%.*]] = getelementptr i32, i32* [[B]], i64 [[TMP2]]
-; AVX1-NEXT:    [[TMP8:%.*]] = getelementptr i32, i32* [[TMP7]], i32 0
-; AVX1-NEXT:    [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <8 x i32>*
-; AVX1-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP9]], i32 4, <8 x i1> [[TMP6]], <8 x i32> poison)
-; AVX1-NEXT:    [[TMP10:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]]
-; AVX1-NEXT:    [[TMP11:%.*]] = getelementptr i32, i32* [[A]], i64 [[TMP2]]
-; AVX1-NEXT:    [[TMP12:%.*]] = getelementptr i32, i32* [[TMP11]], i32 0
-; AVX1-NEXT:    [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <8 x i32>*
-; AVX1-NEXT:    call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP10]], <8 x i32>* [[TMP13]], i32 4, <8 x i1> [[TMP6]])
+; AVX1-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP2]]
+; AVX1-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 0
+; AVX1-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[TMP4]], align 4
+; AVX1-NEXT:    [[TMP5:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX1-NEXT:    [[TMP6:%.*]] = getelementptr i32, ptr [[B]], i64 [[TMP2]]
+; AVX1-NEXT:    [[TMP7:%.*]] = getelementptr i32, ptr [[TMP6]], i32 0
+; AVX1-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[TMP7]], i32 4, <8 x i1> [[TMP5]], <8 x i32> poison)
+; AVX1-NEXT:    [[TMP8:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]]
+; AVX1-NEXT:    [[TMP9:%.*]] = getelementptr i32, ptr [[A]], i64 [[TMP2]]
+; AVX1-NEXT:    [[TMP10:%.*]] = getelementptr i32, ptr [[TMP9]], i32 0
+; AVX1-NEXT:    call void @llvm.masked.store.v8i32.p0(<8 x i32> [[TMP8]], ptr [[TMP10]], i32 4, <8 x i1> [[TMP5]])
 ; AVX1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; AVX1-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000
-; AVX1-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; AVX1-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000
+; AVX1-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; AVX1:       middle.block:
 ; AVX1-NEXT:    [[CMP_N:%.*]] = icmp eq i64 10000, 10000
 ; AVX1-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -61,16 +58,16 @@ define void @foo1(i32* nocapture %A, i32* nocapture readonly %B, i32* nocapture
 ; AVX1-NEXT:    br label [[FOR_BODY:%.*]]
 ; AVX1:       for.body:
 ; AVX1-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
-; AVX1-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]]
-; AVX1-NEXT:    [[TMP15:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; AVX1-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP15]], 100
+; AVX1-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[INDVARS_IV]]
+; AVX1-NEXT:    [[TMP12:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; AVX1-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP12]], 100
 ; AVX1-NEXT:    br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
 ; AVX1:       if.then:
-; AVX1-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]]
-; AVX1-NEXT:    [[TMP16:%.*]] = load i32, i32* [[ARRAYIDX3]], align 4
-; AVX1-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP16]], [[TMP15]]
-; AVX1-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
-; AVX1-NEXT:    store i32 [[ADD]], i32* [[ARRAYIDX7]], align 4
+; AVX1-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]]
+; AVX1-NEXT:    [[TMP13:%.*]] = load i32, ptr [[ARRAYIDX3]], align 4
+; AVX1-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP13]], [[TMP12]]
+; AVX1-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]]
+; AVX1-NEXT:    store i32 [[ADD]], ptr [[ARRAYIDX7]], align 4
 ; AVX1-NEXT:    br label [[FOR_INC]]
 ; AVX1:       for.inc:
 ; AVX1-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
@@ -81,9 +78,9 @@ define void @foo1(i32* nocapture %A, i32* nocapture readonly %B, i32* nocapture
 ;
 ; AVX2-LABEL: @foo1(
 ; AVX2-NEXT:  entry:
-; AVX2-NEXT:    [[B3:%.*]] = ptrtoint i32* [[B:%.*]] to i64
-; AVX2-NEXT:    [[TRIGGER2:%.*]] = ptrtoint i32* [[TRIGGER:%.*]] to i64
-; AVX2-NEXT:    [[A1:%.*]] = ptrtoint i32* [[A:%.*]] to i64
+; AVX2-NEXT:    [[B3:%.*]] = ptrtoint ptr [[B:%.*]] to i64
+; AVX2-NEXT:    [[TRIGGER2:%.*]] = ptrtoint ptr [[TRIGGER:%.*]] to i64
+; AVX2-NEXT:    [[A1:%.*]] = ptrtoint ptr [[A:%.*]] to i64
 ; AVX2-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
 ; AVX2:       vector.memcheck:
 ; AVX2-NEXT:    [[TMP0:%.*]] = sub i64 [[A1]], [[TRIGGER2]]
@@ -100,65 +97,53 @@ define void @foo1(i32* nocapture %A, i32* nocapture readonly %B, i32* nocapture
 ; AVX2-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 8
 ; AVX2-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 16
 ; AVX2-NEXT:    [[TMP5:%.*]] = add i64 [[INDEX]], 24
-; AVX2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP2]]
-; AVX2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP3]]
-; AVX2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP4]]
-; AVX2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP5]]
-; AVX2-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 0
-; AVX2-NEXT:    [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <8 x i32>*
-; AVX2-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP11]], align 4
-; AVX2-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 8
-; AVX2-NEXT:    [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <8 x i32>*
-; AVX2-NEXT:    [[WIDE_LOAD5:%.*]] = load <8 x i32>, <8 x i32>* [[TMP13]], align 4
-; AVX2-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 16
-; AVX2-NEXT:    [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <8 x i32>*
-; AVX2-NEXT:    [[WIDE_LOAD6:%.*]] = load <8 x i32>, <8 x i32>* [[TMP15]], align 4
-; AVX2-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 24
-; AVX2-NEXT:    [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <8 x i32>*
-; AVX2-NEXT:    [[WIDE_LOAD7:%.*]] = load <8 x i32>, <8 x i32>* [[TMP17]], align 4
-; AVX2-NEXT:    [[TMP18:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
-; AVX2-NEXT:    [[TMP19:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD5]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
-; AVX2-NEXT:    [[TMP20:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD6]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
-; AVX2-NEXT:    [[TMP21:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD7]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
-; AVX2-NEXT:    [[TMP22:%.*]] = getelementptr i32, i32* [[B]], i64 [[TMP2]]
-; AVX2-NEXT:    [[TMP23:%.*]] = getelementptr i32, i32* [[B]], i64 [[TMP3]]
-; AVX2-NEXT:    [[TMP24:%.*]] = getelementptr i32, i32* [[B]], i64 [[TMP4]]
-; AVX2-NEXT:    [[TMP25:%.*]] = getelementptr i32, i32* [[B]], i64 [[TMP5]]
-; AVX2-NEXT:    [[TMP26:%.*]] = getelementptr i32, i32* [[TMP22]], i32 0
-; AVX2-NEXT:    [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <8 x i32>*
-; AVX2-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP27]], i32 4, <8 x i1> [[TMP18]], <8 x i32> poison)
-; AVX2-NEXT:    [[TMP28:%.*]] = getelementptr i32, i32* [[TMP22]], i32 8
-; AVX2-NEXT:    [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <8 x i32>*
-; AVX2-NEXT:    [[WIDE_MASKED_LOAD8:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP29]], i32 4, <8 x i1> [[TMP19]], <8 x i32> poison)
-; AVX2-NEXT:    [[TMP30:%.*]] = getelementptr i32, i32* [[TMP22]], i32 16
-; AVX2-NEXT:    [[TMP31:%.*]] = bitcast i32* [[TMP30]] to <8 x i32>*
-; AVX2-NEXT:    [[WIDE_MASKED_LOAD9:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP31]], i32 4, <8 x i1> [[TMP20]], <8 x i32> poison)
-; AVX2-NEXT:    [[TMP32:%.*]] = getelementptr i32, i32* [[TMP22]], i32 24
-; AVX2-NEXT:    [[TMP33:%.*]] = bitcast i32* [[TMP32]] to <8 x i32>*
-; AVX2-NEXT:    [[WIDE_MASKED_LOAD10:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP33]], i32 4, <8 x i1> [[TMP21]], <8 x i32> poison)
-; AVX2-NEXT:    [[TMP34:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]]
-; AVX2-NEXT:    [[TMP35:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD8]], [[WIDE_LOAD5]]
-; AVX2-NEXT:    [[TMP36:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD9]], [[WIDE_LOAD6]]
-; AVX2-NEXT:    [[TMP37:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD10]], [[WIDE_LOAD7]]
-; AVX2-NEXT:    [[TMP38:%.*]] = getelementptr i32, i32* [[A]], i64 [[TMP2]]
-; AVX2-NEXT:    [[TMP39:%.*]] = getelementptr i32, i32* [[A]], i64 [[TMP3]]
-; AVX2-NEXT:    [[TMP40:%.*]] = getelementptr i32, i32* [[A]], i64 [[TMP4]]
-; AVX2-NEXT:    [[TMP41:%.*]] = getelementptr i32, i32* [[A]], i64 [[TMP5]]
-; AVX2-NEXT:    [[TMP42:%.*]] = getelementptr i32, i32* [[TMP38]], i32 0
-; AVX2-NEXT:    [[TMP43:%.*]] = bitcast i32* [[TMP42]] to <8 x i32>*
-; AVX2-NEXT:    call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP34]], <8 x i32>* [[TMP43]], i32 4, <8 x i1> [[TMP18]])
-; AVX2-NEXT:    [[TMP44:%.*]] = getelementptr i32, i32* [[TMP38]], i32 8
-; AVX2-NEXT:    [[TMP45:%.*]] = bitcast i32* [[TMP44]] to <8 x i32>*
-; AVX2-NEXT:    call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP35]], <8 x i32>* [[TMP45]], i32 4, <8 x i1> [[TMP19]])
-; AVX2-NEXT:    [[TMP46:%.*]] = getelementptr i32, i32* [[TMP38]], i32 16
-; AVX2-NEXT:    [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <8 x i32>*
-; AVX2-NEXT:    call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP36]], <8 x i32>* [[TMP47]], i32 4, <8 x i1> [[TMP20]])
-; AVX2-NEXT:    [[TMP48:%.*]] = getelementptr i32, i32* [[TMP38]], i32 24
-; AVX2-NEXT:    [[TMP49:%.*]] = bitcast i32* [[TMP48]] to <8 x i32>*
-; AVX2-NEXT:    call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP37]], <8 x i32>* [[TMP49]], i32 4, <8 x i1> [[TMP21]])
+; AVX2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP2]]
+; AVX2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP3]]
+; AVX2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP4]]
+; AVX2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP5]]
+; AVX2-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0
+; AVX2-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[TMP10]], align 4
+; AVX2-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 8
+; AVX2-NEXT:    [[WIDE_LOAD5:%.*]] = load <8 x i32>, ptr [[TMP11]], align 4
+; AVX2-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 16
+; AVX2-NEXT:    [[WIDE_LOAD6:%.*]] = load <8 x i32>, ptr [[TMP12]], align 4
+; AVX2-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 24
+; AVX2-NEXT:    [[WIDE_LOAD7:%.*]] = load <8 x i32>, ptr [[TMP13]], align 4
+; AVX2-NEXT:    [[TMP14:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX2-NEXT:    [[TMP15:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD5]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX2-NEXT:    [[TMP16:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD6]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX2-NEXT:    [[TMP17:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD7]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX2-NEXT:    [[TMP18:%.*]] = getelementptr i32, ptr [[B]], i64 [[TMP2]]
+; AVX2-NEXT:    [[TMP19:%.*]] = getelementptr i32, ptr [[B]], i64 [[TMP3]]
+; AVX2-NEXT:    [[TMP20:%.*]] = getelementptr i32, ptr [[B]], i64 [[TMP4]]
+; AVX2-NEXT:    [[TMP21:%.*]] = getelementptr i32, ptr [[B]], i64 [[TMP5]]
+; AVX2-NEXT:    [[TMP22:%.*]] = getelementptr i32, ptr [[TMP18]], i32 0
+; AVX2-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[TMP22]], i32 4, <8 x i1> [[TMP14]], <8 x i32> poison)
+; AVX2-NEXT:    [[TMP23:%.*]] = getelementptr i32, ptr [[TMP18]], i32 8
+; AVX2-NEXT:    [[WIDE_MASKED_LOAD8:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[TMP23]], i32 4, <8 x i1> [[TMP15]], <8 x i32> poison)
+; AVX2-NEXT:    [[TMP24:%.*]] = getelementptr i32, ptr [[TMP18]], i32 16
+; AVX2-NEXT:    [[WIDE_MASKED_LOAD9:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[TMP24]], i32 4, <8 x i1> [[TMP16]], <8 x i32> poison)
+; AVX2-NEXT:    [[TMP25:%.*]] = getelementptr i32, ptr [[TMP18]], i32 24
+; AVX2-NEXT:    [[WIDE_MASKED_LOAD10:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[TMP25]], i32 4, <8 x i1> [[TMP17]], <8 x i32> poison)
+; AVX2-NEXT:    [[TMP26:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]]
+; AVX2-NEXT:    [[TMP27:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD8]], [[WIDE_LOAD5]]
+; AVX2-NEXT:    [[TMP28:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD9]], [[WIDE_LOAD6]]
+; AVX2-NEXT:    [[TMP29:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD10]], [[WIDE_LOAD7]]
+; AVX2-NEXT:    [[TMP30:%.*]] = getelementptr i32, ptr [[A]], i64 [[TMP2]]
+; AVX2-NEXT:    [[TMP31:%.*]] = getelementptr i32, ptr [[A]], i64 [[TMP3]]
+; AVX2-NEXT:    [[TMP32:%.*]] = getelementptr i32, ptr [[A]], i64 [[TMP4]]
+; AVX2-NEXT:    [[TMP33:%.*]] = getelementptr i32, ptr [[A]], i64 [[TMP5]]
+; AVX2-NEXT:    [[TMP34:%.*]] = getelementptr i32, ptr [[TMP30]], i32 0
+; AVX2-NEXT:    call void @llvm.masked.store.v8i32.p0(<8 x i32> [[TMP26]], ptr [[TMP34]], i32 4, <8 x i1> [[TMP14]])
+; AVX2-NEXT:    [[TMP35:%.*]] = getelementptr i32, ptr [[TMP30]], i32 8
+; AVX2-NEXT:    call void @llvm.masked.store.v8i32.p0(<8 x i32> [[TMP27]], ptr [[TMP35]], i32 4, <8 x i1> [[TMP15]])
+; AVX2-NEXT:    [[TMP36:%.*]] = getelementptr i32, ptr [[TMP30]], i32 16
+; AVX2-NEXT:    call void @llvm.masked.store.v8i32.p0(<8 x i32> [[TMP28]], ptr [[TMP36]], i32 4, <8 x i1> [[TMP16]])
+; AVX2-NEXT:    [[TMP37:%.*]] = getelementptr i32, ptr [[TMP30]], i32 24
+; AVX2-NEXT:    call void @llvm.masked.store.v8i32.p0(<8 x i32> [[TMP29]], ptr [[TMP37]], i32 4, <8 x i1> [[TMP17]])
 ; AVX2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
-; AVX2-NEXT:    [[TMP50:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984
-; AVX2-NEXT:    br i1 [[TMP50]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; AVX2-NEXT:    [[TMP38:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984
+; AVX2-NEXT:    br i1 [[TMP38]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; AVX2:       middle.block:
 ; AVX2-NEXT:    [[CMP_N:%.*]] = icmp eq i64 10000, 9984
 ; AVX2-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -167,16 +152,16 @@ define void @foo1(i32* nocapture %A, i32* nocapture readonly %B, i32* nocapture
 ; AVX2-NEXT:    br label [[FOR_BODY:%.*]]
 ; AVX2:       for.body:
 ; AVX2-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
-; AVX2-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]]
-; AVX2-NEXT:    [[TMP51:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; AVX2-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP51]], 100
+; AVX2-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[INDVARS_IV]]
+; AVX2-NEXT:    [[TMP39:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; AVX2-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP39]], 100
 ; AVX2-NEXT:    br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
 ; AVX2:       if.then:
-; AVX2-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]]
-; AVX2-NEXT:    [[TMP52:%.*]] = load i32, i32* [[ARRAYIDX3]], align 4
-; AVX2-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP52]], [[TMP51]]
-; AVX2-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
-; AVX2-NEXT:    store i32 [[ADD]], i32* [[ARRAYIDX7]], align 4
+; AVX2-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]]
+; AVX2-NEXT:    [[TMP40:%.*]] = load i32, ptr [[ARRAYIDX3]], align 4
+; AVX2-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP40]], [[TMP39]]
+; AVX2-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]]
+; AVX2-NEXT:    store i32 [[ADD]], ptr [[ARRAYIDX7]], align 4
 ; AVX2-NEXT:    br label [[FOR_INC]]
 ; AVX2:       for.inc:
 ; AVX2-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
@@ -187,9 +172,9 @@ define void @foo1(i32* nocapture %A, i32* nocapture readonly %B, i32* nocapture
 ;
 ; AVX512-LABEL: @foo1(
 ; AVX512-NEXT:  iter.check:
-; AVX512-NEXT:    [[B3:%.*]] = ptrtoint i32* [[B:%.*]] to i64
-; AVX512-NEXT:    [[TRIGGER2:%.*]] = ptrtoint i32* [[TRIGGER:%.*]] to i64
-; AVX512-NEXT:    [[A1:%.*]] = ptrtoint i32* [[A:%.*]] to i64
+; AVX512-NEXT:    [[B3:%.*]] = ptrtoint ptr [[B:%.*]] to i64
+; AVX512-NEXT:    [[TRIGGER2:%.*]] = ptrtoint ptr [[TRIGGER:%.*]] to i64
+; AVX512-NEXT:    [[A1:%.*]] = ptrtoint ptr [[A:%.*]] to i64
 ; AVX512-NEXT:    br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
 ; AVX512:       vector.memcheck:
 ; AVX512-NEXT:    [[TMP0:%.*]] = sub i64 [[A1]], [[TRIGGER2]]
@@ -208,65 +193,53 @@ define void @foo1(i32* nocapture %A, i32* nocapture readonly %B, i32* nocapture
 ; AVX512-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 16
 ; AVX512-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 32
 ; AVX512-NEXT:    [[TMP5:%.*]] = add i64 [[INDEX]], 48
-; AVX512-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP2]]
-; AVX512-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP3]]
-; AVX512-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP4]]
-; AVX512-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP5]]
-; AVX512-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 0
-; AVX512-NEXT:    [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <16 x i32>*
-; AVX512-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i32>, <16 x i32>* [[TMP11]], align 4
-; AVX512-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 16
-; AVX512-NEXT:    [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <16 x i32>*
-; AVX512-NEXT:    [[WIDE_LOAD5:%.*]] = load <16 x i32>, <16 x i32>* [[TMP13]], align 4
-; AVX512-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 32
-; AVX512-NEXT:    [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <16 x i32>*
-; AVX512-NEXT:    [[WIDE_LOAD6:%.*]] = load <16 x i32>, <16 x i32>* [[TMP15]], align 4
-; AVX512-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 48
-; AVX512-NEXT:    [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <16 x i32>*
-; AVX512-NEXT:    [[WIDE_LOAD7:%.*]] = load <16 x i32>, <16 x i32>* [[TMP17]], align 4
-; AVX512-NEXT:    [[TMP18:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
-; AVX512-NEXT:    [[TMP19:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD5]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
-; AVX512-NEXT:    [[TMP20:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD6]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
-; AVX512-NEXT:    [[TMP21:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD7]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
-; AVX512-NEXT:    [[TMP22:%.*]] = getelementptr i32, i32* [[B]], i64 [[TMP2]]
-; AVX512-NEXT:    [[TMP23:%.*]] = getelementptr i32, i32* [[B]], i64 [[TMP3]]
-; AVX512-NEXT:    [[TMP24:%.*]] = getelementptr i32, i32* [[B]], i64 [[TMP4]]
-; AVX512-NEXT:    [[TMP25:%.*]] = getelementptr i32, i32* [[B]], i64 [[TMP5]]
-; AVX512-NEXT:    [[TMP26:%.*]] = getelementptr i32, i32* [[TMP22]], i32 0
-; AVX512-NEXT:    [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <16 x i32>*
-; AVX512-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP27]], i32 4, <16 x i1> [[TMP18]], <16 x i32> poison)
-; AVX512-NEXT:    [[TMP28:%.*]] = getelementptr i32, i32* [[TMP22]], i32 16
-; AVX512-NEXT:    [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <16 x i32>*
-; AVX512-NEXT:    [[WIDE_MASKED_LOAD8:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP29]], i32 4, <16 x i1> [[TMP19]], <16 x i32> poison)
-; AVX512-NEXT:    [[TMP30:%.*]] = getelementptr i32, i32* [[TMP22]], i32 32
-; AVX512-NEXT:    [[TMP31:%.*]] = bitcast i32* [[TMP30]] to <16 x i32>*
-; AVX512-NEXT:    [[WIDE_MASKED_LOAD9:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP31]], i32 4, <16 x i1> [[TMP20]], <16 x i32> poison)
-; AVX512-NEXT:    [[TMP32:%.*]] = getelementptr i32, i32* [[TMP22]], i32 48
-; AVX512-NEXT:    [[TMP33:%.*]] = bitcast i32* [[TMP32]] to <16 x i32>*
-; AVX512-NEXT:    [[WIDE_MASKED_LOAD10:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP33]], i32 4, <16 x i1> [[TMP21]], <16 x i32> poison)
-; AVX512-NEXT:    [[TMP34:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]]
-; AVX512-NEXT:    [[TMP35:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD8]], [[WIDE_LOAD5]]
-; AVX512-NEXT:    [[TMP36:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD9]], [[WIDE_LOAD6]]
-; AVX512-NEXT:    [[TMP37:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD10]], [[WIDE_LOAD7]]
-; AVX512-NEXT:    [[TMP38:%.*]] = getelementptr i32, i32* [[A]], i64 [[TMP2]]
-; AVX512-NEXT:    [[TMP39:%.*]] = getelementptr i32, i32* [[A]], i64 [[TMP3]]
-; AVX512-NEXT:    [[TMP40:%.*]] = getelementptr i32, i32* [[A]], i64 [[TMP4]]
-; AVX512-NEXT:    [[TMP41:%.*]] = getelementptr i32, i32* [[A]], i64 [[TMP5]]
-; AVX512-NEXT:    [[TMP42:%.*]] = getelementptr i32, i32* [[TMP38]], i32 0
-; AVX512-NEXT:    [[TMP43:%.*]] = bitcast i32* [[TMP42]] to <16 x i32>*
-; AVX512-NEXT:    call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> [[TMP34]], <16 x i32>* [[TMP43]], i32 4, <16 x i1> [[TMP18]])
-; AVX512-NEXT:    [[TMP44:%.*]] = getelementptr i32, i32* [[TMP38]], i32 16
-; AVX512-NEXT:    [[TMP45:%.*]] = bitcast i32* [[TMP44]] to <16 x i32>*
-; AVX512-NEXT:    call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> [[TMP35]], <16 x i32>* [[TMP45]], i32 4, <16 x i1> [[TMP19]])
-; AVX512-NEXT:    [[TMP46:%.*]] = getelementptr i32, i32* [[TMP38]], i32 32
-; AVX512-NEXT:    [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <16 x i32>*
-; AVX512-NEXT:    call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> [[TMP36]], <16 x i32>* [[TMP47]], i32 4, <16 x i1> [[TMP20]])
-; AVX512-NEXT:    [[TMP48:%.*]] = getelementptr i32, i32* [[TMP38]], i32 48
-; AVX512-NEXT:    [[TMP49:%.*]] = bitcast i32* [[TMP48]] to <16 x i32>*
-; AVX512-NEXT:    call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> [[TMP37]], <16 x i32>* [[TMP49]], i32 4, <16 x i1> [[TMP21]])
+; AVX512-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP2]]
+; AVX512-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP3]]
+; AVX512-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP4]]
+; AVX512-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP5]]
+; AVX512-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0
+; AVX512-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i32>, ptr [[TMP10]], align 4
+; AVX512-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 16
+; AVX512-NEXT:    [[WIDE_LOAD5:%.*]] = load <16 x i32>, ptr [[TMP11]], align 4
+; AVX512-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 32
+; AVX512-NEXT:    [[WIDE_LOAD6:%.*]] = load <16 x i32>, ptr [[TMP12]], align 4
+; AVX512-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 48
+; AVX512-NEXT:    [[WIDE_LOAD7:%.*]] = load <16 x i32>, ptr [[TMP13]], align 4
+; AVX512-NEXT:    [[TMP14:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX512-NEXT:    [[TMP15:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD5]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX512-NEXT:    [[TMP16:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD6]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX512-NEXT:    [[TMP17:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD7]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX512-NEXT:    [[TMP18:%.*]] = getelementptr i32, ptr [[B]], i64 [[TMP2]]
+; AVX512-NEXT:    [[TMP19:%.*]] = getelementptr i32, ptr [[B]], i64 [[TMP3]]
+; AVX512-NEXT:    [[TMP20:%.*]] = getelementptr i32, ptr [[B]], i64 [[TMP4]]
+; AVX512-NEXT:    [[TMP21:%.*]] = getelementptr i32, ptr [[B]], i64 [[TMP5]]
+; AVX512-NEXT:    [[TMP22:%.*]] = getelementptr i32, ptr [[TMP18]], i32 0
+; AVX512-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP22]], i32 4, <16 x i1> [[TMP14]], <16 x i32> poison)
+; AVX512-NEXT:    [[TMP23:%.*]] = getelementptr i32, ptr [[TMP18]], i32 16
+; AVX512-NEXT:    [[WIDE_MASKED_LOAD8:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP23]], i32 4, <16 x i1> [[TMP15]], <16 x i32> poison)
+; AVX512-NEXT:    [[TMP24:%.*]] = getelementptr i32, ptr [[TMP18]], i32 32
+; AVX512-NEXT:    [[WIDE_MASKED_LOAD9:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP24]], i32 4, <16 x i1> [[TMP16]], <16 x i32> poison)
+; AVX512-NEXT:    [[TMP25:%.*]] = getelementptr i32, ptr [[TMP18]], i32 48
+; AVX512-NEXT:    [[WIDE_MASKED_LOAD10:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP25]], i32 4, <16 x i1> [[TMP17]], <16 x i32> poison)
+; AVX512-NEXT:    [[TMP26:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]]
+; AVX512-NEXT:    [[TMP27:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD8]], [[WIDE_LOAD5]]
+; AVX512-NEXT:    [[TMP28:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD9]], [[WIDE_LOAD6]]
+; AVX512-NEXT:    [[TMP29:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD10]], [[WIDE_LOAD7]]
+; AVX512-NEXT:    [[TMP30:%.*]] = getelementptr i32, ptr [[A]], i64 [[TMP2]]
+; AVX512-NEXT:    [[TMP31:%.*]] = getelementptr i32, ptr [[A]], i64 [[TMP3]]
+; AVX512-NEXT:    [[TMP32:%.*]] = getelementptr i32, ptr [[A]], i64 [[TMP4]]
+; AVX512-NEXT:    [[TMP33:%.*]] = getelementptr i32, ptr [[A]], i64 [[TMP5]]
+; AVX512-NEXT:    [[TMP34:%.*]] = getelementptr i32, ptr [[TMP30]], i32 0
+; AVX512-NEXT:    call void @llvm.masked.store.v16i32.p0(<16 x i32> [[TMP26]], ptr [[TMP34]], i32 4, <16 x i1> [[TMP14]])
+; AVX512-NEXT:    [[TMP35:%.*]] = getelementptr i32, ptr [[TMP30]], i32 16
+; AVX512-NEXT:    call void @llvm.masked.store.v16i32.p0(<16 x i32> [[TMP27]], ptr [[TMP35]], i32 4, <16 x i1> [[TMP15]])
+; AVX512-NEXT:    [[TMP36:%.*]] = getelementptr i32, ptr [[TMP30]], i32 32
+; AVX512-NEXT:    call void @llvm.masked.store.v16i32.p0(<16 x i32> [[TMP28]], ptr [[TMP36]], i32 4, <16 x i1> [[TMP16]])
+; AVX512-NEXT:    [[TMP37:%.*]] = getelementptr i32, ptr [[TMP30]], i32 48
+; AVX512-NEXT:    call void @llvm.masked.store.v16i32.p0(<16 x i32> [[TMP29]], ptr [[TMP37]], i32 4, <16 x i1> [[TMP17]])
 ; AVX512-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 64
-; AVX512-NEXT:    [[TMP50:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984
-; AVX512-NEXT:    br i1 [[TMP50]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; AVX512-NEXT:    [[TMP38:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984
+; AVX512-NEXT:    br i1 [[TMP38]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; AVX512:       middle.block:
 ; AVX512-NEXT:    [[CMP_N:%.*]] = icmp eq i64 10000, 9984
 ; AVX512-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
@@ -276,25 +249,22 @@ define void @foo1(i32* nocapture %A, i32* nocapture readonly %B, i32* nocapture
 ; AVX512-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 9984, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; AVX512-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
 ; AVX512:       vec.epilog.vector.body:
-; AVX512-NEXT:    [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT15:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; AVX512-NEXT:    [[TMP51:%.*]] = add i64 [[OFFSET_IDX]], 0
-; AVX512-NEXT:    [[TMP52:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP51]]
-; AVX512-NEXT:    [[TMP53:%.*]] = getelementptr inbounds i32, i32* [[TMP52]], i32 0
-; AVX512-NEXT:    [[TMP54:%.*]] = bitcast i32* [[TMP53]] to <8 x i32>*
-; AVX512-NEXT:    [[WIDE_LOAD13:%.*]] = load <8 x i32>, <8 x i32>* [[TMP54]], align 4
-; AVX512-NEXT:    [[TMP55:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD13]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
-; AVX512-NEXT:    [[TMP56:%.*]] = getelementptr i32, i32* [[B]], i64 [[TMP51]]
-; AVX512-NEXT:    [[TMP57:%.*]] = getelementptr i32, i32* [[TMP56]], i32 0
-; AVX512-NEXT:    [[TMP58:%.*]] = bitcast i32* [[TMP57]] to <8 x i32>*
-; AVX512-NEXT:    [[WIDE_MASKED_LOAD14:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP58]], i32 4, <8 x i1> [[TMP55]], <8 x i32> poison)
-; AVX512-NEXT:    [[TMP59:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD14]], [[WIDE_LOAD13]]
-; AVX512-NEXT:    [[TMP60:%.*]] = getelementptr i32, i32* [[A]], i64 [[TMP51]]
-; AVX512-NEXT:    [[TMP61:%.*]] = getelementptr i32, i32* [[TMP60]], i32 0
-; AVX512-NEXT:    [[TMP62:%.*]] = bitcast i32* [[TMP61]] to <8 x i32>*
-; AVX512-NEXT:    call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP59]], <8 x i32>* [[TMP62]], i32 4, <8 x i1> [[TMP55]])
-; AVX512-NEXT:    [[INDEX_NEXT15]] = add nuw i64 [[OFFSET_IDX]], 8
-; AVX512-NEXT:    [[TMP63:%.*]] = icmp eq i64 [[INDEX_NEXT15]], 10000
-; AVX512-NEXT:    br i1 [[TMP63]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
+; AVX512-NEXT:    [[INDEX12:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT15:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; AVX512-NEXT:    [[TMP39:%.*]] = add i64 [[INDEX12]], 0
+; AVX512-NEXT:    [[TMP40:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP39]]
+; AVX512-NEXT:    [[TMP41:%.*]] = getelementptr inbounds i32, ptr [[TMP40]], i32 0
+; AVX512-NEXT:    [[WIDE_LOAD13:%.*]] = load <8 x i32>, ptr [[TMP41]], align 4
+; AVX512-NEXT:    [[TMP42:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD13]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX512-NEXT:    [[TMP43:%.*]] = getelementptr i32, ptr [[B]], i64 [[TMP39]]
+; AVX512-NEXT:    [[TMP44:%.*]] = getelementptr i32, ptr [[TMP43]], i32 0
+; AVX512-NEXT:    [[WIDE_MASKED_LOAD14:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[TMP44]], i32 4, <8 x i1> [[TMP42]], <8 x i32> poison)
+; AVX512-NEXT:    [[TMP45:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD14]], [[WIDE_LOAD13]]
+; AVX512-NEXT:    [[TMP46:%.*]] = getelementptr i32, ptr [[A]], i64 [[TMP39]]
+; AVX512-NEXT:    [[TMP47:%.*]] = getelementptr i32, ptr [[TMP46]], i32 0
+; AVX512-NEXT:    call void @llvm.masked.store.v8i32.p0(<8 x i32> [[TMP45]], ptr [[TMP47]], i32 4, <8 x i1> [[TMP42]])
+; AVX512-NEXT:    [[INDEX_NEXT15]] = add nuw i64 [[INDEX12]], 8
+; AVX512-NEXT:    [[TMP48:%.*]] = icmp eq i64 [[INDEX_NEXT15]], 10000
+; AVX512-NEXT:    br i1 [[TMP48]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
 ; AVX512:       vec.epilog.middle.block:
 ; AVX512-NEXT:    [[CMP_N11:%.*]] = icmp eq i64 10000, 10000
 ; AVX512-NEXT:    br i1 [[CMP_N11]], label [[FOR_END]], label [[VEC_EPILOG_SCALAR_PH]]
@@ -303,16 +273,16 @@ define void @foo1(i32* nocapture %A, i32* nocapture readonly %B, i32* nocapture
 ; AVX512-NEXT:    br label [[FOR_BODY:%.*]]
 ; AVX512:       for.body:
 ; AVX512-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
-; AVX512-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]]
-; AVX512-NEXT:    [[TMP64:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; AVX512-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP64]], 100
+; AVX512-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[INDVARS_IV]]
+; AVX512-NEXT:    [[TMP49:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; AVX512-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP49]], 100
 ; AVX512-NEXT:    br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
 ; AVX512:       if.then:
-; AVX512-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]]
-; AVX512-NEXT:    [[TMP65:%.*]] = load i32, i32* [[ARRAYIDX3]], align 4
-; AVX512-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP65]], [[TMP64]]
-; AVX512-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
-; AVX512-NEXT:    store i32 [[ADD]], i32* [[ARRAYIDX7]], align 4
+; AVX512-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]]
+; AVX512-NEXT:    [[TMP50:%.*]] = load i32, ptr [[ARRAYIDX3]], align 4
+; AVX512-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP50]], [[TMP49]]
+; AVX512-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]]
+; AVX512-NEXT:    store i32 [[ADD]], ptr [[ARRAYIDX7]], align 4
 ; AVX512-NEXT:    br label [[FOR_INC]]
 ; AVX512:       for.inc:
 ; AVX512-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
@@ -326,17 +296,17 @@ entry:
 
 for.body:                                         ; preds = %for.inc, %entry
   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ]
-  %arrayidx = getelementptr inbounds i32, i32* %trigger, i64 %indvars.iv
-  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx = getelementptr inbounds i32, ptr %trigger, i64 %indvars.iv
+  %0 = load i32, ptr %arrayidx, align 4
   %cmp1 = icmp slt i32 %0, 100
   br i1 %cmp1, label %if.then, label %for.inc
 
 if.then:                                          ; preds = %for.body
-  %arrayidx3 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
-  %1 = load i32, i32* %arrayidx3, align 4
+  %arrayidx3 = getelementptr inbounds i32, ptr %B, i64 %indvars.iv
+  %1 = load i32, ptr %arrayidx3, align 4
   %add = add nsw i32 %1, %0
-  %arrayidx7 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
-  store i32 %add, i32* %arrayidx7, align 4
+  %arrayidx7 = getelementptr inbounds i32, ptr %A, i64 %indvars.iv
+  store i32 %add, ptr %arrayidx7, align 4
   br label %for.inc
 
 for.inc:                                          ; preds = %for.body, %if.then
@@ -350,12 +320,12 @@ for.end:                                          ; preds = %for.inc
 
 ; The same as @foo1 but all the pointers are address space 1 pointers.
 
-define void @foo1_addrspace1(i32 addrspace(1)* nocapture %A, i32 addrspace(1)* nocapture readonly %B, i32 addrspace(1)* nocapture readonly %trigger) local_unnamed_addr #0 {
+define void @foo1_addrspace1(ptr addrspace(1) nocapture %A, ptr addrspace(1) nocapture readonly %B, ptr addrspace(1) nocapture readonly %trigger) local_unnamed_addr #0 {
 ; AVX1-LABEL: @foo1_addrspace1(
 ; AVX1-NEXT:  entry:
-; AVX1-NEXT:    [[B3:%.*]] = ptrtoint i32 addrspace(1)* [[B:%.*]] to i64
-; AVX1-NEXT:    [[TRIGGER2:%.*]] = ptrtoint i32 addrspace(1)* [[TRIGGER:%.*]] to i64
-; AVX1-NEXT:    [[A1:%.*]] = ptrtoint i32 addrspace(1)* [[A:%.*]] to i64
+; AVX1-NEXT:    [[B3:%.*]] = ptrtoint ptr addrspace(1) [[B:%.*]] to i64
+; AVX1-NEXT:    [[TRIGGER2:%.*]] = ptrtoint ptr addrspace(1) [[TRIGGER:%.*]] to i64
+; AVX1-NEXT:    [[A1:%.*]] = ptrtoint ptr addrspace(1) [[A:%.*]] to i64
 ; AVX1-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
 ; AVX1:       vector.memcheck:
 ; AVX1-NEXT:    [[TMP0:%.*]] = sub i64 [[A1]], [[TRIGGER2]]
@@ -369,23 +339,20 @@ define void @foo1_addrspace1(i32 addrspace(1)* nocapture %A, i32 addrspace(1)* n
 ; AVX1:       vector.body:
 ; AVX1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; AVX1-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 0
-; AVX1-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[TMP2]]
-; AVX1-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP3]], i32 0
-; AVX1-NEXT:    [[TMP5:%.*]] = bitcast i32 addrspace(1)* [[TMP4]] to <8 x i32> addrspace(1)*
-; AVX1-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP5]], align 4
-; AVX1-NEXT:    [[TMP6:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
-; AVX1-NEXT:    [[TMP7:%.*]] = getelementptr i32, i32 addrspace(1)* [[B]], i64 [[TMP2]]
-; AVX1-NEXT:    [[TMP8:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP7]], i32 0
-; AVX1-NEXT:    [[TMP9:%.*]] = bitcast i32 addrspace(1)* [[TMP8]] to <8 x i32> addrspace(1)*
-; AVX1-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP9]], i32 4, <8 x i1> [[TMP6]], <8 x i32> poison)
-; AVX1-NEXT:    [[TMP10:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]]
-; AVX1-NEXT:    [[TMP11:%.*]] = getelementptr i32, i32 addrspace(1)* [[A]], i64 [[TMP2]]
-; AVX1-NEXT:    [[TMP12:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP11]], i32 0
-; AVX1-NEXT:    [[TMP13:%.*]] = bitcast i32 addrspace(1)* [[TMP12]] to <8 x i32> addrspace(1)*
-; AVX1-NEXT:    call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP10]], <8 x i32> addrspace(1)* [[TMP13]], i32 4, <8 x i1> [[TMP6]])
+; AVX1-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TRIGGER]], i64 [[TMP2]]
+; AVX1-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TMP3]], i32 0
+; AVX1-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr addrspace(1) [[TMP4]], align 4
+; AVX1-NEXT:    [[TMP5:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX1-NEXT:    [[TMP6:%.*]] = getelementptr i32, ptr addrspace(1) [[B]], i64 [[TMP2]]
+; AVX1-NEXT:    [[TMP7:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP6]], i32 0
+; AVX1-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1(ptr addrspace(1) [[TMP7]], i32 4, <8 x i1> [[TMP5]], <8 x i32> poison)
+; AVX1-NEXT:    [[TMP8:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]]
+; AVX1-NEXT:    [[TMP9:%.*]] = getelementptr i32, ptr addrspace(1) [[A]], i64 [[TMP2]]
+; AVX1-NEXT:    [[TMP10:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP9]], i32 0
+; AVX1-NEXT:    call void @llvm.masked.store.v8i32.p1(<8 x i32> [[TMP8]], ptr addrspace(1) [[TMP10]], i32 4, <8 x i1> [[TMP5]])
 ; AVX1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; AVX1-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000
-; AVX1-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; AVX1-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000
+; AVX1-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
 ; AVX1:       middle.block:
 ; AVX1-NEXT:    [[CMP_N:%.*]] = icmp eq i64 10000, 10000
 ; AVX1-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -394,16 +361,16 @@ define void @foo1_addrspace1(i32 addrspace(1)* nocapture %A, i32 addrspace(1)* n
 ; AVX1-NEXT:    br label [[FOR_BODY:%.*]]
 ; AVX1:       for.body:
 ; AVX1-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
-; AVX1-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[INDVARS_IV]]
-; AVX1-NEXT:    [[TMP15:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX]], align 4
-; AVX1-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP15]], 100
+; AVX1-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TRIGGER]], i64 [[INDVARS_IV]]
+; AVX1-NEXT:    [[TMP12:%.*]] = load i32, ptr addrspace(1) [[ARRAYIDX]], align 4
+; AVX1-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP12]], 100
 ; AVX1-NEXT:    br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
 ; AVX1:       if.then:
-; AVX1-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[INDVARS_IV]]
-; AVX1-NEXT:    [[TMP16:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX3]], align 4
-; AVX1-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP16]], [[TMP15]]
-; AVX1-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[INDVARS_IV]]
-; AVX1-NEXT:    store i32 [[ADD]], i32 addrspace(1)* [[ARRAYIDX7]], align 4
+; AVX1-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[B]], i64 [[INDVARS_IV]]
+; AVX1-NEXT:    [[TMP13:%.*]] = load i32, ptr addrspace(1) [[ARRAYIDX3]], align 4
+; AVX1-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP13]], [[TMP12]]
+; AVX1-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[A]], i64 [[INDVARS_IV]]
+; AVX1-NEXT:    store i32 [[ADD]], ptr addrspace(1) [[ARRAYIDX7]], align 4
 ; AVX1-NEXT:    br label [[FOR_INC]]
 ; AVX1:       for.inc:
 ; AVX1-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
@@ -414,9 +381,9 @@ define void @foo1_addrspace1(i32 addrspace(1)* nocapture %A, i32 addrspace(1)* n
 ;
 ; AVX2-LABEL: @foo1_addrspace1(
 ; AVX2-NEXT:  entry:
-; AVX2-NEXT:    [[B3:%.*]] = ptrtoint i32 addrspace(1)* [[B:%.*]] to i64
-; AVX2-NEXT:    [[TRIGGER2:%.*]] = ptrtoint i32 addrspace(1)* [[TRIGGER:%.*]] to i64
-; AVX2-NEXT:    [[A1:%.*]] = ptrtoint i32 addrspace(1)* [[A:%.*]] to i64
+; AVX2-NEXT:    [[B3:%.*]] = ptrtoint ptr addrspace(1) [[B:%.*]] to i64
+; AVX2-NEXT:    [[TRIGGER2:%.*]] = ptrtoint ptr addrspace(1) [[TRIGGER:%.*]] to i64
+; AVX2-NEXT:    [[A1:%.*]] = ptrtoint ptr addrspace(1) [[A:%.*]] to i64
 ; AVX2-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
 ; AVX2:       vector.memcheck:
 ; AVX2-NEXT:    [[TMP0:%.*]] = sub i64 [[A1]], [[TRIGGER2]]
@@ -433,65 +400,53 @@ define void @foo1_addrspace1(i32 addrspace(1)* nocapture %A, i32 addrspace(1)* n
 ; AVX2-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 8
 ; AVX2-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 16
 ; AVX2-NEXT:    [[TMP5:%.*]] = add i64 [[INDEX]], 24
-; AVX2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[TMP2]]
-; AVX2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[TMP3]]
-; AVX2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[TMP4]]
-; AVX2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[TMP5]]
-; AVX2-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP6]], i32 0
-; AVX2-NEXT:    [[TMP11:%.*]] = bitcast i32 addrspace(1)* [[TMP10]] to <8 x i32> addrspace(1)*
-; AVX2-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP11]], align 4
-; AVX2-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP6]], i32 8
-; AVX2-NEXT:    [[TMP13:%.*]] = bitcast i32 addrspace(1)* [[TMP12]] to <8 x i32> addrspace(1)*
-; AVX2-NEXT:    [[WIDE_LOAD5:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP13]], align 4
-; AVX2-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP6]], i32 16
-; AVX2-NEXT:    [[TMP15:%.*]] = bitcast i32 addrspace(1)* [[TMP14]] to <8 x i32> addrspace(1)*
-; AVX2-NEXT:    [[WIDE_LOAD6:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP15]], align 4
-; AVX2-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP6]], i32 24
-; AVX2-NEXT:    [[TMP17:%.*]] = bitcast i32 addrspace(1)* [[TMP16]] to <8 x i32> addrspace(1)*
-; AVX2-NEXT:    [[WIDE_LOAD7:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP17]], align 4
-; AVX2-NEXT:    [[TMP18:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
-; AVX2-NEXT:    [[TMP19:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD5]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
-; AVX2-NEXT:    [[TMP20:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD6]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
-; AVX2-NEXT:    [[TMP21:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD7]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
-; AVX2-NEXT:    [[TMP22:%.*]] = getelementptr i32, i32 addrspace(1)* [[B]], i64 [[TMP2]]
-; AVX2-NEXT:    [[TMP23:%.*]] = getelementptr i32, i32 addrspace(1)* [[B]], i64 [[TMP3]]
-; AVX2-NEXT:    [[TMP24:%.*]] = getelementptr i32, i32 addrspace(1)* [[B]], i64 [[TMP4]]
-; AVX2-NEXT:    [[TMP25:%.*]] = getelementptr i32, i32 addrspace(1)* [[B]], i64 [[TMP5]]
-; AVX2-NEXT:    [[TMP26:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP22]], i32 0
-; AVX2-NEXT:    [[TMP27:%.*]] = bitcast i32 addrspace(1)* [[TMP26]] to <8 x i32> addrspace(1)*
-; AVX2-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP27]], i32 4, <8 x i1> [[TMP18]], <8 x i32> poison)
-; AVX2-NEXT:    [[TMP28:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP22]], i32 8
-; AVX2-NEXT:    [[TMP29:%.*]] = bitcast i32 addrspace(1)* [[TMP28]] to <8 x i32> addrspace(1)*
-; AVX2-NEXT:    [[WIDE_MASKED_LOAD8:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP29]], i32 4, <8 x i1> [[TMP19]], <8 x i32> poison)
-; AVX2-NEXT:    [[TMP30:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP22]], i32 16
-; AVX2-NEXT:    [[TMP31:%.*]] = bitcast i32 addrspace(1)* [[TMP30]] to <8 x i32> addrspace(1)*
-; AVX2-NEXT:    [[WIDE_MASKED_LOAD9:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP31]], i32 4, <8 x i1> [[TMP20]], <8 x i32> poison)
-; AVX2-NEXT:    [[TMP32:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP22]], i32 24
-; AVX2-NEXT:    [[TMP33:%.*]] = bitcast i32 addrspace(1)* [[TMP32]] to <8 x i32> addrspace(1)*
-; AVX2-NEXT:    [[WIDE_MASKED_LOAD10:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP33]], i32 4, <8 x i1> [[TMP21]], <8 x i32> poison)
-; AVX2-NEXT:    [[TMP34:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]]
-; AVX2-NEXT:    [[TMP35:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD8]], [[WIDE_LOAD5]]
-; AVX2-NEXT:    [[TMP36:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD9]], [[WIDE_LOAD6]]
-; AVX2-NEXT:    [[TMP37:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD10]], [[WIDE_LOAD7]]
-; AVX2-NEXT:    [[TMP38:%.*]] = getelementptr i32, i32 addrspace(1)* [[A]], i64 [[TMP2]]
-; AVX2-NEXT:    [[TMP39:%.*]] = getelementptr i32, i32 addrspace(1)* [[A]], i64 [[TMP3]]
-; AVX2-NEXT:    [[TMP40:%.*]] = getelementptr i32, i32 addrspace(1)* [[A]], i64 [[TMP4]]
-; AVX2-NEXT:    [[TMP41:%.*]] = getelementptr i32, i32 addrspace(1)* [[A]], i64 [[TMP5]]
-; AVX2-NEXT:    [[TMP42:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP38]], i32 0
-; AVX2-NEXT:    [[TMP43:%.*]] = bitcast i32 addrspace(1)* [[TMP42]] to <8 x i32> addrspace(1)*
-; AVX2-NEXT:    call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP34]], <8 x i32> addrspace(1)* [[TMP43]], i32 4, <8 x i1> [[TMP18]])
-; AVX2-NEXT:    [[TMP44:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP38]], i32 8
-; AVX2-NEXT:    [[TMP45:%.*]] = bitcast i32 addrspace(1)* [[TMP44]] to <8 x i32> addrspace(1)*
-; AVX2-NEXT:    call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP35]], <8 x i32> addrspace(1)* [[TMP45]], i32 4, <8 x i1> [[TMP19]])
-; AVX2-NEXT:    [[TMP46:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP38]], i32 16
-; AVX2-NEXT:    [[TMP47:%.*]] = bitcast i32 addrspace(1)* [[TMP46]] to <8 x i32> addrspace(1)*
-; AVX2-NEXT:    call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP36]], <8 x i32> addrspace(1)* [[TMP47]], i32 4, <8 x i1> [[TMP20]])
-; AVX2-NEXT:    [[TMP48:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP38]], i32 24
-; AVX2-NEXT:    [[TMP49:%.*]] = bitcast i32 addrspace(1)* [[TMP48]] to <8 x i32> addrspace(1)*
-; AVX2-NEXT:    call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP37]], <8 x i32> addrspace(1)* [[TMP49]], i32 4, <8 x i1> [[TMP21]])
+; AVX2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TRIGGER]], i64 [[TMP2]]
+; AVX2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TRIGGER]], i64 [[TMP3]]
+; AVX2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TRIGGER]], i64 [[TMP4]]
+; AVX2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TRIGGER]], i64 [[TMP5]]
+; AVX2-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TMP6]], i32 0
+; AVX2-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr addrspace(1) [[TMP10]], align 4
+; AVX2-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TMP6]], i32 8
+; AVX2-NEXT:    [[WIDE_LOAD5:%.*]] = load <8 x i32>, ptr addrspace(1) [[TMP11]], align 4
+; AVX2-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TMP6]], i32 16
+; AVX2-NEXT:    [[WIDE_LOAD6:%.*]] = load <8 x i32>, ptr addrspace(1) [[TMP12]], align 4
+; AVX2-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TMP6]], i32 24
+; AVX2-NEXT:    [[WIDE_LOAD7:%.*]] = load <8 x i32>, ptr addrspace(1) [[TMP13]], align 4
+; AVX2-NEXT:    [[TMP14:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX2-NEXT:    [[TMP15:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD5]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX2-NEXT:    [[TMP16:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD6]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX2-NEXT:    [[TMP17:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD7]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX2-NEXT:    [[TMP18:%.*]] = getelementptr i32, ptr addrspace(1) [[B]], i64 [[TMP2]]
+; AVX2-NEXT:    [[TMP19:%.*]] = getelementptr i32, ptr addrspace(1) [[B]], i64 [[TMP3]]
+; AVX2-NEXT:    [[TMP20:%.*]] = getelementptr i32, ptr addrspace(1) [[B]], i64 [[TMP4]]
+; AVX2-NEXT:    [[TMP21:%.*]] = getelementptr i32, ptr addrspace(1) [[B]], i64 [[TMP5]]
+; AVX2-NEXT:    [[TMP22:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP18]], i32 0
+; AVX2-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1(ptr addrspace(1) [[TMP22]], i32 4, <8 x i1> [[TMP14]], <8 x i32> poison)
+; AVX2-NEXT:    [[TMP23:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP18]], i32 8
+; AVX2-NEXT:    [[WIDE_MASKED_LOAD8:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1(ptr addrspace(1) [[TMP23]], i32 4, <8 x i1> [[TMP15]], <8 x i32> poison)
+; AVX2-NEXT:    [[TMP24:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP18]], i32 16
+; AVX2-NEXT:    [[WIDE_MASKED_LOAD9:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1(ptr addrspace(1) [[TMP24]], i32 4, <8 x i1> [[TMP16]], <8 x i32> poison)
+; AVX2-NEXT:    [[TMP25:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP18]], i32 24
+; AVX2-NEXT:    [[WIDE_MASKED_LOAD10:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1(ptr addrspace(1) [[TMP25]], i32 4, <8 x i1> [[TMP17]], <8 x i32> poison)
+; AVX2-NEXT:    [[TMP26:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]]
+; AVX2-NEXT:    [[TMP27:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD8]], [[WIDE_LOAD5]]
+; AVX2-NEXT:    [[TMP28:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD9]], [[WIDE_LOAD6]]
+; AVX2-NEXT:    [[TMP29:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD10]], [[WIDE_LOAD7]]
+; AVX2-NEXT:    [[TMP30:%.*]] = getelementptr i32, ptr addrspace(1) [[A]], i64 [[TMP2]]
+; AVX2-NEXT:    [[TMP31:%.*]] = getelementptr i32, ptr addrspace(1) [[A]], i64 [[TMP3]]
+; AVX2-NEXT:    [[TMP32:%.*]] = getelementptr i32, ptr addrspace(1) [[A]], i64 [[TMP4]]
+; AVX2-NEXT:    [[TMP33:%.*]] = getelementptr i32, ptr addrspace(1) [[A]], i64 [[TMP5]]
+; AVX2-NEXT:    [[TMP34:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP30]], i32 0
+; AVX2-NEXT:    call void @llvm.masked.store.v8i32.p1(<8 x i32> [[TMP26]], ptr addrspace(1) [[TMP34]], i32 4, <8 x i1> [[TMP14]])
+; AVX2-NEXT:    [[TMP35:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP30]], i32 8
+; AVX2-NEXT:    call void @llvm.masked.store.v8i32.p1(<8 x i32> [[TMP27]], ptr addrspace(1) [[TMP35]], i32 4, <8 x i1> [[TMP15]])
+; AVX2-NEXT:    [[TMP36:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP30]], i32 16
+; AVX2-NEXT:    call void @llvm.masked.store.v8i32.p1(<8 x i32> [[TMP28]], ptr addrspace(1) [[TMP36]], i32 4, <8 x i1> [[TMP16]])
+; AVX2-NEXT:    [[TMP37:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP30]], i32 24
+; AVX2-NEXT:    call void @llvm.masked.store.v8i32.p1(<8 x i32> [[TMP29]], ptr addrspace(1) [[TMP37]], i32 4, <8 x i1> [[TMP17]])
 ; AVX2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
-; AVX2-NEXT:    [[TMP50:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984
-; AVX2-NEXT:    br i1 [[TMP50]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; AVX2-NEXT:    [[TMP38:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984
+; AVX2-NEXT:    br i1 [[TMP38]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
 ; AVX2:       middle.block:
 ; AVX2-NEXT:    [[CMP_N:%.*]] = icmp eq i64 10000, 9984
 ; AVX2-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -500,16 +455,16 @@ define void @foo1_addrspace1(i32 addrspace(1)* nocapture %A, i32 addrspace(1)* n
 ; AVX2-NEXT:    br label [[FOR_BODY:%.*]]
 ; AVX2:       for.body:
 ; AVX2-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
-; AVX2-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[INDVARS_IV]]
-; AVX2-NEXT:    [[TMP51:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX]], align 4
-; AVX2-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP51]], 100
+; AVX2-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TRIGGER]], i64 [[INDVARS_IV]]
+; AVX2-NEXT:    [[TMP39:%.*]] = load i32, ptr addrspace(1) [[ARRAYIDX]], align 4
+; AVX2-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP39]], 100
 ; AVX2-NEXT:    br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
 ; AVX2:       if.then:
-; AVX2-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[INDVARS_IV]]
-; AVX2-NEXT:    [[TMP52:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX3]], align 4
-; AVX2-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP52]], [[TMP51]]
-; AVX2-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[INDVARS_IV]]
-; AVX2-NEXT:    store i32 [[ADD]], i32 addrspace(1)* [[ARRAYIDX7]], align 4
+; AVX2-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[B]], i64 [[INDVARS_IV]]
+; AVX2-NEXT:    [[TMP40:%.*]] = load i32, ptr addrspace(1) [[ARRAYIDX3]], align 4
+; AVX2-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP40]], [[TMP39]]
+; AVX2-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[A]], i64 [[INDVARS_IV]]
+; AVX2-NEXT:    store i32 [[ADD]], ptr addrspace(1) [[ARRAYIDX7]], align 4
 ; AVX2-NEXT:    br label [[FOR_INC]]
 ; AVX2:       for.inc:
 ; AVX2-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
@@ -520,9 +475,9 @@ define void @foo1_addrspace1(i32 addrspace(1)* nocapture %A, i32 addrspace(1)* n
 ;
 ; AVX512-LABEL: @foo1_addrspace1(
 ; AVX512-NEXT:  iter.check:
-; AVX512-NEXT:    [[B3:%.*]] = ptrtoint i32 addrspace(1)* [[B:%.*]] to i64
-; AVX512-NEXT:    [[TRIGGER2:%.*]] = ptrtoint i32 addrspace(1)* [[TRIGGER:%.*]] to i64
-; AVX512-NEXT:    [[A1:%.*]] = ptrtoint i32 addrspace(1)* [[A:%.*]] to i64
+; AVX512-NEXT:    [[B3:%.*]] = ptrtoint ptr addrspace(1) [[B:%.*]] to i64
+; AVX512-NEXT:    [[TRIGGER2:%.*]] = ptrtoint ptr addrspace(1) [[TRIGGER:%.*]] to i64
+; AVX512-NEXT:    [[A1:%.*]] = ptrtoint ptr addrspace(1) [[A:%.*]] to i64
 ; AVX512-NEXT:    br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
 ; AVX512:       vector.memcheck:
 ; AVX512-NEXT:    [[TMP0:%.*]] = sub i64 [[A1]], [[TRIGGER2]]
@@ -541,65 +496,53 @@ define void @foo1_addrspace1(i32 addrspace(1)* nocapture %A, i32 addrspace(1)* n
 ; AVX512-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 16
 ; AVX512-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 32
 ; AVX512-NEXT:    [[TMP5:%.*]] = add i64 [[INDEX]], 48
-; AVX512-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[TMP2]]
-; AVX512-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[TMP3]]
-; AVX512-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[TMP4]]
-; AVX512-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[TMP5]]
-; AVX512-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP6]], i32 0
-; AVX512-NEXT:    [[TMP11:%.*]] = bitcast i32 addrspace(1)* [[TMP10]] to <16 x i32> addrspace(1)*
-; AVX512-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i32>, <16 x i32> addrspace(1)* [[TMP11]], align 4
-; AVX512-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP6]], i32 16
-; AVX512-NEXT:    [[TMP13:%.*]] = bitcast i32 addrspace(1)* [[TMP12]] to <16 x i32> addrspace(1)*
-; AVX512-NEXT:    [[WIDE_LOAD5:%.*]] = load <16 x i32>, <16 x i32> addrspace(1)* [[TMP13]], align 4
-; AVX512-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP6]], i32 32
-; AVX512-NEXT:    [[TMP15:%.*]] = bitcast i32 addrspace(1)* [[TMP14]] to <16 x i32> addrspace(1)*
-; AVX512-NEXT:    [[WIDE_LOAD6:%.*]] = load <16 x i32>, <16 x i32> addrspace(1)* [[TMP15]], align 4
-; AVX512-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP6]], i32 48
-; AVX512-NEXT:    [[TMP17:%.*]] = bitcast i32 addrspace(1)* [[TMP16]] to <16 x i32> addrspace(1)*
-; AVX512-NEXT:    [[WIDE_LOAD7:%.*]] = load <16 x i32>, <16 x i32> addrspace(1)* [[TMP17]], align 4
-; AVX512-NEXT:    [[TMP18:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
-; AVX512-NEXT:    [[TMP19:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD5]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
-; AVX512-NEXT:    [[TMP20:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD6]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
-; AVX512-NEXT:    [[TMP21:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD7]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
-; AVX512-NEXT:    [[TMP22:%.*]] = getelementptr i32, i32 addrspace(1)* [[B]], i64 [[TMP2]]
-; AVX512-NEXT:    [[TMP23:%.*]] = getelementptr i32, i32 addrspace(1)* [[B]], i64 [[TMP3]]
-; AVX512-NEXT:    [[TMP24:%.*]] = getelementptr i32, i32 addrspace(1)* [[B]], i64 [[TMP4]]
-; AVX512-NEXT:    [[TMP25:%.*]] = getelementptr i32, i32 addrspace(1)* [[B]], i64 [[TMP5]]
-; AVX512-NEXT:    [[TMP26:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP22]], i32 0
-; AVX512-NEXT:    [[TMP27:%.*]] = bitcast i32 addrspace(1)* [[TMP26]] to <16 x i32> addrspace(1)*
-; AVX512-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x i32> addrspace(1)* [[TMP27]], i32 4, <16 x i1> [[TMP18]], <16 x i32> poison)
-; AVX512-NEXT:    [[TMP28:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP22]], i32 16
-; AVX512-NEXT:    [[TMP29:%.*]] = bitcast i32 addrspace(1)* [[TMP28]] to <16 x i32> addrspace(1)*
-; AVX512-NEXT:    [[WIDE_MASKED_LOAD8:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x i32> addrspace(1)* [[TMP29]], i32 4, <16 x i1> [[TMP19]], <16 x i32> poison)
-; AVX512-NEXT:    [[TMP30:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP22]], i32 32
-; AVX512-NEXT:    [[TMP31:%.*]] = bitcast i32 addrspace(1)* [[TMP30]] to <16 x i32> addrspace(1)*
-; AVX512-NEXT:    [[WIDE_MASKED_LOAD9:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x i32> addrspace(1)* [[TMP31]], i32 4, <16 x i1> [[TMP20]], <16 x i32> poison)
-; AVX512-NEXT:    [[TMP32:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP22]], i32 48
-; AVX512-NEXT:    [[TMP33:%.*]] = bitcast i32 addrspace(1)* [[TMP32]] to <16 x i32> addrspace(1)*
-; AVX512-NEXT:    [[WIDE_MASKED_LOAD10:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x i32> addrspace(1)* [[TMP33]], i32 4, <16 x i1> [[TMP21]], <16 x i32> poison)
-; AVX512-NEXT:    [[TMP34:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]]
-; AVX512-NEXT:    [[TMP35:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD8]], [[WIDE_LOAD5]]
-; AVX512-NEXT:    [[TMP36:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD9]], [[WIDE_LOAD6]]
-; AVX512-NEXT:    [[TMP37:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD10]], [[WIDE_LOAD7]]
-; AVX512-NEXT:    [[TMP38:%.*]] = getelementptr i32, i32 addrspace(1)* [[A]], i64 [[TMP2]]
-; AVX512-NEXT:    [[TMP39:%.*]] = getelementptr i32, i32 addrspace(1)* [[A]], i64 [[TMP3]]
-; AVX512-NEXT:    [[TMP40:%.*]] = getelementptr i32, i32 addrspace(1)* [[A]], i64 [[TMP4]]
-; AVX512-NEXT:    [[TMP41:%.*]] = getelementptr i32, i32 addrspace(1)* [[A]], i64 [[TMP5]]
-; AVX512-NEXT:    [[TMP42:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP38]], i32 0
-; AVX512-NEXT:    [[TMP43:%.*]] = bitcast i32 addrspace(1)* [[TMP42]] to <16 x i32> addrspace(1)*
-; AVX512-NEXT:    call void @llvm.masked.store.v16i32.p1v16i32(<16 x i32> [[TMP34]], <16 x i32> addrspace(1)* [[TMP43]], i32 4, <16 x i1> [[TMP18]])
-; AVX512-NEXT:    [[TMP44:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP38]], i32 16
-; AVX512-NEXT:    [[TMP45:%.*]] = bitcast i32 addrspace(1)* [[TMP44]] to <16 x i32> addrspace(1)*
-; AVX512-NEXT:    call void @llvm.masked.store.v16i32.p1v16i32(<16 x i32> [[TMP35]], <16 x i32> addrspace(1)* [[TMP45]], i32 4, <16 x i1> [[TMP19]])
-; AVX512-NEXT:    [[TMP46:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP38]], i32 32
-; AVX512-NEXT:    [[TMP47:%.*]] = bitcast i32 addrspace(1)* [[TMP46]] to <16 x i32> addrspace(1)*
-; AVX512-NEXT:    call void @llvm.masked.store.v16i32.p1v16i32(<16 x i32> [[TMP36]], <16 x i32> addrspace(1)* [[TMP47]], i32 4, <16 x i1> [[TMP20]])
-; AVX512-NEXT:    [[TMP48:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP38]], i32 48
-; AVX512-NEXT:    [[TMP49:%.*]] = bitcast i32 addrspace(1)* [[TMP48]] to <16 x i32> addrspace(1)*
-; AVX512-NEXT:    call void @llvm.masked.store.v16i32.p1v16i32(<16 x i32> [[TMP37]], <16 x i32> addrspace(1)* [[TMP49]], i32 4, <16 x i1> [[TMP21]])
+; AVX512-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TRIGGER]], i64 [[TMP2]]
+; AVX512-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TRIGGER]], i64 [[TMP3]]
+; AVX512-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TRIGGER]], i64 [[TMP4]]
+; AVX512-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TRIGGER]], i64 [[TMP5]]
+; AVX512-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TMP6]], i32 0
+; AVX512-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i32>, ptr addrspace(1) [[TMP10]], align 4
+; AVX512-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TMP6]], i32 16
+; AVX512-NEXT:    [[WIDE_LOAD5:%.*]] = load <16 x i32>, ptr addrspace(1) [[TMP11]], align 4
+; AVX512-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TMP6]], i32 32
+; AVX512-NEXT:    [[WIDE_LOAD6:%.*]] = load <16 x i32>, ptr addrspace(1) [[TMP12]], align 4
+; AVX512-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TMP6]], i32 48
+; AVX512-NEXT:    [[WIDE_LOAD7:%.*]] = load <16 x i32>, ptr addrspace(1) [[TMP13]], align 4
+; AVX512-NEXT:    [[TMP14:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX512-NEXT:    [[TMP15:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD5]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX512-NEXT:    [[TMP16:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD6]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX512-NEXT:    [[TMP17:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD7]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX512-NEXT:    [[TMP18:%.*]] = getelementptr i32, ptr addrspace(1) [[B]], i64 [[TMP2]]
+; AVX512-NEXT:    [[TMP19:%.*]] = getelementptr i32, ptr addrspace(1) [[B]], i64 [[TMP3]]
+; AVX512-NEXT:    [[TMP20:%.*]] = getelementptr i32, ptr addrspace(1) [[B]], i64 [[TMP4]]
+; AVX512-NEXT:    [[TMP21:%.*]] = getelementptr i32, ptr addrspace(1) [[B]], i64 [[TMP5]]
+; AVX512-NEXT:    [[TMP22:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP18]], i32 0
+; AVX512-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1(ptr addrspace(1) [[TMP22]], i32 4, <16 x i1> [[TMP14]], <16 x i32> poison)
+; AVX512-NEXT:    [[TMP23:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP18]], i32 16
+; AVX512-NEXT:    [[WIDE_MASKED_LOAD8:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1(ptr addrspace(1) [[TMP23]], i32 4, <16 x i1> [[TMP15]], <16 x i32> poison)
+; AVX512-NEXT:    [[TMP24:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP18]], i32 32
+; AVX512-NEXT:    [[WIDE_MASKED_LOAD9:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1(ptr addrspace(1) [[TMP24]], i32 4, <16 x i1> [[TMP16]], <16 x i32> poison)
+; AVX512-NEXT:    [[TMP25:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP18]], i32 48
+; AVX512-NEXT:    [[WIDE_MASKED_LOAD10:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1(ptr addrspace(1) [[TMP25]], i32 4, <16 x i1> [[TMP17]], <16 x i32> poison)
+; AVX512-NEXT:    [[TMP26:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]]
+; AVX512-NEXT:    [[TMP27:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD8]], [[WIDE_LOAD5]]
+; AVX512-NEXT:    [[TMP28:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD9]], [[WIDE_LOAD6]]
+; AVX512-NEXT:    [[TMP29:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD10]], [[WIDE_LOAD7]]
+; AVX512-NEXT:    [[TMP30:%.*]] = getelementptr i32, ptr addrspace(1) [[A]], i64 [[TMP2]]
+; AVX512-NEXT:    [[TMP31:%.*]] = getelementptr i32, ptr addrspace(1) [[A]], i64 [[TMP3]]
+; AVX512-NEXT:    [[TMP32:%.*]] = getelementptr i32, ptr addrspace(1) [[A]], i64 [[TMP4]]
+; AVX512-NEXT:    [[TMP33:%.*]] = getelementptr i32, ptr addrspace(1) [[A]], i64 [[TMP5]]
+; AVX512-NEXT:    [[TMP34:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP30]], i32 0
+; AVX512-NEXT:    call void @llvm.masked.store.v16i32.p1(<16 x i32> [[TMP26]], ptr addrspace(1) [[TMP34]], i32 4, <16 x i1> [[TMP14]])
+; AVX512-NEXT:    [[TMP35:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP30]], i32 16
+; AVX512-NEXT:    call void @llvm.masked.store.v16i32.p1(<16 x i32> [[TMP27]], ptr addrspace(1) [[TMP35]], i32 4, <16 x i1> [[TMP15]])
+; AVX512-NEXT:    [[TMP36:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP30]], i32 32
+; AVX512-NEXT:    call void @llvm.masked.store.v16i32.p1(<16 x i32> [[TMP28]], ptr addrspace(1) [[TMP36]], i32 4, <16 x i1> [[TMP16]])
+; AVX512-NEXT:    [[TMP37:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP30]], i32 48
+; AVX512-NEXT:    call void @llvm.masked.store.v16i32.p1(<16 x i32> [[TMP29]], ptr addrspace(1) [[TMP37]], i32 4, <16 x i1> [[TMP17]])
 ; AVX512-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 64
-; AVX512-NEXT:    [[TMP50:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984
-; AVX512-NEXT:    br i1 [[TMP50]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; AVX512-NEXT:    [[TMP38:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984
+; AVX512-NEXT:    br i1 [[TMP38]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; AVX512:       middle.block:
 ; AVX512-NEXT:    [[CMP_N:%.*]] = icmp eq i64 10000, 9984
 ; AVX512-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
@@ -609,25 +552,22 @@ define void @foo1_addrspace1(i32 addrspace(1)* nocapture %A, i32 addrspace(1)* n
 ; AVX512-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 9984, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; AVX512-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
 ; AVX512:       vec.epilog.vector.body:
-; AVX512-NEXT:    [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT15:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; AVX512-NEXT:    [[TMP51:%.*]] = add i64 [[OFFSET_IDX]], 0
-; AVX512-NEXT:    [[TMP52:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[TMP51]]
-; AVX512-NEXT:    [[TMP53:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP52]], i32 0
-; AVX512-NEXT:    [[TMP54:%.*]] = bitcast i32 addrspace(1)* [[TMP53]] to <8 x i32> addrspace(1)*
-; AVX512-NEXT:    [[WIDE_LOAD13:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP54]], align 4
-; AVX512-NEXT:    [[TMP55:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD13]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
-; AVX512-NEXT:    [[TMP56:%.*]] = getelementptr i32, i32 addrspace(1)* [[B]], i64 [[TMP51]]
-; AVX512-NEXT:    [[TMP57:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP56]], i32 0
-; AVX512-NEXT:    [[TMP58:%.*]] = bitcast i32 addrspace(1)* [[TMP57]] to <8 x i32> addrspace(1)*
-; AVX512-NEXT:    [[WIDE_MASKED_LOAD14:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP58]], i32 4, <8 x i1> [[TMP55]], <8 x i32> poison)
-; AVX512-NEXT:    [[TMP59:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD14]], [[WIDE_LOAD13]]
-; AVX512-NEXT:    [[TMP60:%.*]] = getelementptr i32, i32 addrspace(1)* [[A]], i64 [[TMP51]]
-; AVX512-NEXT:    [[TMP61:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP60]], i32 0
-; AVX512-NEXT:    [[TMP62:%.*]] = bitcast i32 addrspace(1)* [[TMP61]] to <8 x i32> addrspace(1)*
-; AVX512-NEXT:    call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP59]], <8 x i32> addrspace(1)* [[TMP62]], i32 4, <8 x i1> [[TMP55]])
-; AVX512-NEXT:    [[INDEX_NEXT15]] = add nuw i64 [[OFFSET_IDX]], 8
-; AVX512-NEXT:    [[TMP63:%.*]] = icmp eq i64 [[INDEX_NEXT15]], 10000
-; AVX512-NEXT:    br i1 [[TMP63]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; AVX512-NEXT:    [[INDEX12:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT15:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; AVX512-NEXT:    [[TMP39:%.*]] = add i64 [[INDEX12]], 0
+; AVX512-NEXT:    [[TMP40:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TRIGGER]], i64 [[TMP39]]
+; AVX512-NEXT:    [[TMP41:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TMP40]], i32 0
+; AVX512-NEXT:    [[WIDE_LOAD13:%.*]] = load <8 x i32>, ptr addrspace(1) [[TMP41]], align 4
+; AVX512-NEXT:    [[TMP42:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD13]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX512-NEXT:    [[TMP43:%.*]] = getelementptr i32, ptr addrspace(1) [[B]], i64 [[TMP39]]
+; AVX512-NEXT:    [[TMP44:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP43]], i32 0
+; AVX512-NEXT:    [[WIDE_MASKED_LOAD14:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1(ptr addrspace(1) [[TMP44]], i32 4, <8 x i1> [[TMP42]], <8 x i32> poison)
+; AVX512-NEXT:    [[TMP45:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD14]], [[WIDE_LOAD13]]
+; AVX512-NEXT:    [[TMP46:%.*]] = getelementptr i32, ptr addrspace(1) [[A]], i64 [[TMP39]]
+; AVX512-NEXT:    [[TMP47:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP46]], i32 0
+; AVX512-NEXT:    call void @llvm.masked.store.v8i32.p1(<8 x i32> [[TMP45]], ptr addrspace(1) [[TMP47]], i32 4, <8 x i1> [[TMP42]])
+; AVX512-NEXT:    [[INDEX_NEXT15]] = add nuw i64 [[INDEX12]], 8
+; AVX512-NEXT:    [[TMP48:%.*]] = icmp eq i64 [[INDEX_NEXT15]], 10000
+; AVX512-NEXT:    br i1 [[TMP48]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; AVX512:       vec.epilog.middle.block:
 ; AVX512-NEXT:    [[CMP_N11:%.*]] = icmp eq i64 10000, 10000
 ; AVX512-NEXT:    br i1 [[CMP_N11]], label [[FOR_END]], label [[VEC_EPILOG_SCALAR_PH]]
@@ -636,16 +576,16 @@ define void @foo1_addrspace1(i32 addrspace(1)* nocapture %A, i32 addrspace(1)* n
 ; AVX512-NEXT:    br label [[FOR_BODY:%.*]]
 ; AVX512:       for.body:
 ; AVX512-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
-; AVX512-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[INDVARS_IV]]
-; AVX512-NEXT:    [[TMP64:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX]], align 4
-; AVX512-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP64]], 100
+; AVX512-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TRIGGER]], i64 [[INDVARS_IV]]
+; AVX512-NEXT:    [[TMP49:%.*]] = load i32, ptr addrspace(1) [[ARRAYIDX]], align 4
+; AVX512-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP49]], 100
 ; AVX512-NEXT:    br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
 ; AVX512:       if.then:
-; AVX512-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[INDVARS_IV]]
-; AVX512-NEXT:    [[TMP65:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX3]], align 4
-; AVX512-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP65]], [[TMP64]]
-; AVX512-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[INDVARS_IV]]
-; AVX512-NEXT:    store i32 [[ADD]], i32 addrspace(1)* [[ARRAYIDX7]], align 4
+; AVX512-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[B]], i64 [[INDVARS_IV]]
+; AVX512-NEXT:    [[TMP50:%.*]] = load i32, ptr addrspace(1) [[ARRAYIDX3]], align 4
+; AVX512-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP50]], [[TMP49]]
+; AVX512-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[A]], i64 [[INDVARS_IV]]
+; AVX512-NEXT:    store i32 [[ADD]], ptr addrspace(1) [[ARRAYIDX7]], align 4
 ; AVX512-NEXT:    br label [[FOR_INC]]
 ; AVX512:       for.inc:
 ; AVX512-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
@@ -659,17 +599,17 @@ entry:
 
 for.body:                                         ; preds = %for.inc, %entry
   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ]
-  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %trigger, i64 %indvars.iv
-  %0 = load i32, i32 addrspace(1)* %arrayidx, align 4
+  %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %trigger, i64 %indvars.iv
+  %0 = load i32, ptr addrspace(1) %arrayidx, align 4
   %cmp1 = icmp slt i32 %0, 100
   br i1 %cmp1, label %if.then, label %for.inc
 
 if.then:                                          ; preds = %for.body
-  %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %B, i64 %indvars.iv
-  %1 = load i32, i32 addrspace(1)* %arrayidx3, align 4
+  %arrayidx3 = getelementptr inbounds i32, ptr addrspace(1) %B, i64 %indvars.iv
+  %1 = load i32, ptr addrspace(1) %arrayidx3, align 4
   %add = add nsw i32 %1, %0
-  %arrayidx7 = getelementptr inbounds i32, i32 addrspace(1)* %A, i64 %indvars.iv
-  store i32 %add, i32 addrspace(1)* %arrayidx7, align 4
+  %arrayidx7 = getelementptr inbounds i32, ptr addrspace(1) %A, i64 %indvars.iv
+  store i32 %add, ptr addrspace(1) %arrayidx7, align 4
   br label %for.inc
 
 for.inc:                                          ; preds = %for.body, %if.then
@@ -683,7 +623,7 @@ for.end:                                          ; preds = %for.inc
 
 ; The source code:
 ;
-;void foo2(float *A, float *B, int *trigger) {
+;void foo2(ptr A, ptr B, int *trigger) {
 ;
 ;  for (int i=0; i<10000; i++) {
 ;    if (trigger[i] < 100) {
@@ -692,12 +632,12 @@ for.end:                                          ; preds = %for.inc
 ;  }
 ;}
 
-define void @foo2(float* nocapture %A, float* nocapture readonly %B, i32* nocapture readonly %trigger) local_unnamed_addr #0 {
+define void @foo2(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture readonly %trigger) local_unnamed_addr #0 {
 ; AVX1-LABEL: @foo2(
 ; AVX1-NEXT:  entry:
-; AVX1-NEXT:    [[B3:%.*]] = ptrtoint float* [[B:%.*]] to i64
-; AVX1-NEXT:    [[TRIGGER2:%.*]] = ptrtoint i32* [[TRIGGER:%.*]] to i64
-; AVX1-NEXT:    [[A1:%.*]] = ptrtoint float* [[A:%.*]] to i64
+; AVX1-NEXT:    [[B3:%.*]] = ptrtoint ptr [[B:%.*]] to i64
+; AVX1-NEXT:    [[TRIGGER2:%.*]] = ptrtoint ptr [[TRIGGER:%.*]] to i64
+; AVX1-NEXT:    [[A1:%.*]] = ptrtoint ptr [[A:%.*]] to i64
 ; AVX1-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
 ; AVX1:       vector.memcheck:
 ; AVX1-NEXT:    [[TMP0:%.*]] = sub i64 [[A1]], [[TRIGGER2]]
@@ -711,24 +651,21 @@ define void @foo2(float* nocapture %A, float* nocapture readonly %B, i32* nocapt
 ; AVX1:       vector.body:
 ; AVX1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; AVX1-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 0
-; AVX1-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP2]]
-; AVX1-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP3]], i32 0
-; AVX1-NEXT:    [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <8 x i32>*
-; AVX1-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP5]], align 4
-; AVX1-NEXT:    [[TMP6:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
-; AVX1-NEXT:    [[TMP7:%.*]] = getelementptr float, float* [[B]], i64 [[TMP2]]
-; AVX1-NEXT:    [[TMP8:%.*]] = getelementptr float, float* [[TMP7]], i32 0
-; AVX1-NEXT:    [[TMP9:%.*]] = bitcast float* [[TMP8]] to <8 x float>*
-; AVX1-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP9]], i32 4, <8 x i1> [[TMP6]], <8 x float> poison)
-; AVX1-NEXT:    [[TMP10:%.*]] = sitofp <8 x i32> [[WIDE_LOAD]] to <8 x float>
-; AVX1-NEXT:    [[TMP11:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD]], [[TMP10]]
-; AVX1-NEXT:    [[TMP12:%.*]] = getelementptr float, float* [[A]], i64 [[TMP2]]
-; AVX1-NEXT:    [[TMP13:%.*]] = getelementptr float, float* [[TMP12]], i32 0
-; AVX1-NEXT:    [[TMP14:%.*]] = bitcast float* [[TMP13]] to <8 x float>*
-; AVX1-NEXT:    call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP11]], <8 x float>* [[TMP14]], i32 4, <8 x i1> [[TMP6]])
+; AVX1-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP2]]
+; AVX1-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 0
+; AVX1-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[TMP4]], align 4
+; AVX1-NEXT:    [[TMP5:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX1-NEXT:    [[TMP6:%.*]] = getelementptr float, ptr [[B]], i64 [[TMP2]]
+; AVX1-NEXT:    [[TMP7:%.*]] = getelementptr float, ptr [[TMP6]], i32 0
+; AVX1-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0(ptr [[TMP7]], i32 4, <8 x i1> [[TMP5]], <8 x float> poison)
+; AVX1-NEXT:    [[TMP8:%.*]] = sitofp <8 x i32> [[WIDE_LOAD]] to <8 x float>
+; AVX1-NEXT:    [[TMP9:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD]], [[TMP8]]
+; AVX1-NEXT:    [[TMP10:%.*]] = getelementptr float, ptr [[A]], i64 [[TMP2]]
+; AVX1-NEXT:    [[TMP11:%.*]] = getelementptr float, ptr [[TMP10]], i32 0
+; AVX1-NEXT:    call void @llvm.masked.store.v8f32.p0(<8 x float> [[TMP9]], ptr [[TMP11]], i32 4, <8 x i1> [[TMP5]])
 ; AVX1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; AVX1-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000
-; AVX1-NEXT:    br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; AVX1-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000
+; AVX1-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; AVX1:       middle.block:
 ; AVX1-NEXT:    [[CMP_N:%.*]] = icmp eq i64 10000, 10000
 ; AVX1-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -737,17 +674,17 @@ define void @foo2(float* nocapture %A, float* nocapture readonly %B, i32* nocapt
 ; AVX1-NEXT:    br label [[FOR_BODY:%.*]]
 ; AVX1:       for.body:
 ; AVX1-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
-; AVX1-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]]
-; AVX1-NEXT:    [[TMP16:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; AVX1-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP16]], 100
+; AVX1-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[INDVARS_IV]]
+; AVX1-NEXT:    [[TMP13:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; AVX1-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP13]], 100
 ; AVX1-NEXT:    br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
 ; AVX1:       if.then:
-; AVX1-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV]]
-; AVX1-NEXT:    [[TMP17:%.*]] = load float, float* [[ARRAYIDX3]], align 4
-; AVX1-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP16]] to float
-; AVX1-NEXT:    [[ADD:%.*]] = fadd float [[TMP17]], [[CONV]]
-; AVX1-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]]
-; AVX1-NEXT:    store float [[ADD]], float* [[ARRAYIDX7]], align 4
+; AVX1-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[INDVARS_IV]]
+; AVX1-NEXT:    [[TMP14:%.*]] = load float, ptr [[ARRAYIDX3]], align 4
+; AVX1-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP13]] to float
+; AVX1-NEXT:    [[ADD:%.*]] = fadd float [[TMP14]], [[CONV]]
+; AVX1-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV]]
+; AVX1-NEXT:    store float [[ADD]], ptr [[ARRAYIDX7]], align 4
 ; AVX1-NEXT:    br label [[FOR_INC]]
 ; AVX1:       for.inc:
 ; AVX1-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
@@ -758,9 +695,9 @@ define void @foo2(float* nocapture %A, float* nocapture readonly %B, i32* nocapt
 ;
 ; AVX2-LABEL: @foo2(
 ; AVX2-NEXT:  entry:
-; AVX2-NEXT:    [[B3:%.*]] = ptrtoint float* [[B:%.*]] to i64
-; AVX2-NEXT:    [[TRIGGER2:%.*]] = ptrtoint i32* [[TRIGGER:%.*]] to i64
-; AVX2-NEXT:    [[A1:%.*]] = ptrtoint float* [[A:%.*]] to i64
+; AVX2-NEXT:    [[B3:%.*]] = ptrtoint ptr [[B:%.*]] to i64
+; AVX2-NEXT:    [[TRIGGER2:%.*]] = ptrtoint ptr [[TRIGGER:%.*]] to i64
+; AVX2-NEXT:    [[A1:%.*]] = ptrtoint ptr [[A:%.*]] to i64
 ; AVX2-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
 ; AVX2:       vector.memcheck:
 ; AVX2-NEXT:    [[TMP0:%.*]] = sub i64 [[A1]], [[TRIGGER2]]
@@ -777,69 +714,57 @@ define void @foo2(float* nocapture %A, float* nocapture readonly %B, i32* nocapt
 ; AVX2-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 8
 ; AVX2-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 16
 ; AVX2-NEXT:    [[TMP5:%.*]] = add i64 [[INDEX]], 24
-; AVX2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP2]]
-; AVX2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP3]]
-; AVX2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP4]]
-; AVX2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP5]]
-; AVX2-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 0
-; AVX2-NEXT:    [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <8 x i32>*
-; AVX2-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP11]], align 4
-; AVX2-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 8
-; AVX2-NEXT:    [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <8 x i32>*
-; AVX2-NEXT:    [[WIDE_LOAD5:%.*]] = load <8 x i32>, <8 x i32>* [[TMP13]], align 4
-; AVX2-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 16
-; AVX2-NEXT:    [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <8 x i32>*
-; AVX2-NEXT:    [[WIDE_LOAD6:%.*]] = load <8 x i32>, <8 x i32>* [[TMP15]], align 4
-; AVX2-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 24
-; AVX2-NEXT:    [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <8 x i32>*
-; AVX2-NEXT:    [[WIDE_LOAD7:%.*]] = load <8 x i32>, <8 x i32>* [[TMP17]], align 4
-; AVX2-NEXT:    [[TMP18:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
-; AVX2-NEXT:    [[TMP19:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD5]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
-; AVX2-NEXT:    [[TMP20:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD6]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
-; AVX2-NEXT:    [[TMP21:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD7]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
-; AVX2-NEXT:    [[TMP22:%.*]] = getelementptr float, float* [[B]], i64 [[TMP2]]
-; AVX2-NEXT:    [[TMP23:%.*]] = getelementptr float, float* [[B]], i64 [[TMP3]]
-; AVX2-NEXT:    [[TMP24:%.*]] = getelementptr float, float* [[B]], i64 [[TMP4]]
-; AVX2-NEXT:    [[TMP25:%.*]] = getelementptr float, float* [[B]], i64 [[TMP5]]
-; AVX2-NEXT:    [[TMP26:%.*]] = getelementptr float, float* [[TMP22]], i32 0
-; AVX2-NEXT:    [[TMP27:%.*]] = bitcast float* [[TMP26]] to <8 x float>*
-; AVX2-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP27]], i32 4, <8 x i1> [[TMP18]], <8 x float> poison)
-; AVX2-NEXT:    [[TMP28:%.*]] = getelementptr float, float* [[TMP22]], i32 8
-; AVX2-NEXT:    [[TMP29:%.*]] = bitcast float* [[TMP28]] to <8 x float>*
-; AVX2-NEXT:    [[WIDE_MASKED_LOAD8:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP29]], i32 4, <8 x i1> [[TMP19]], <8 x float> poison)
-; AVX2-NEXT:    [[TMP30:%.*]] = getelementptr float, float* [[TMP22]], i32 16
-; AVX2-NEXT:    [[TMP31:%.*]] = bitcast float* [[TMP30]] to <8 x float>*
-; AVX2-NEXT:    [[WIDE_MASKED_LOAD9:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP31]], i32 4, <8 x i1> [[TMP20]], <8 x float> poison)
-; AVX2-NEXT:    [[TMP32:%.*]] = getelementptr float, float* [[TMP22]], i32 24
-; AVX2-NEXT:    [[TMP33:%.*]] = bitcast float* [[TMP32]] to <8 x float>*
-; AVX2-NEXT:    [[WIDE_MASKED_LOAD10:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP33]], i32 4, <8 x i1> [[TMP21]], <8 x float> poison)
-; AVX2-NEXT:    [[TMP34:%.*]] = sitofp <8 x i32> [[WIDE_LOAD]] to <8 x float>
-; AVX2-NEXT:    [[TMP35:%.*]] = sitofp <8 x i32> [[WIDE_LOAD5]] to <8 x float>
-; AVX2-NEXT:    [[TMP36:%.*]] = sitofp <8 x i32> [[WIDE_LOAD6]] to <8 x float>
-; AVX2-NEXT:    [[TMP37:%.*]] = sitofp <8 x i32> [[WIDE_LOAD7]] to <8 x float>
-; AVX2-NEXT:    [[TMP38:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD]], [[TMP34]]
-; AVX2-NEXT:    [[TMP39:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD8]], [[TMP35]]
-; AVX2-NEXT:    [[TMP40:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD9]], [[TMP36]]
-; AVX2-NEXT:    [[TMP41:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD10]], [[TMP37]]
-; AVX2-NEXT:    [[TMP42:%.*]] = getelementptr float, float* [[A]], i64 [[TMP2]]
-; AVX2-NEXT:    [[TMP43:%.*]] = getelementptr float, float* [[A]], i64 [[TMP3]]
-; AVX2-NEXT:    [[TMP44:%.*]] = getelementptr float, float* [[A]], i64 [[TMP4]]
-; AVX2-NEXT:    [[TMP45:%.*]] = getelementptr float, float* [[A]], i64 [[TMP5]]
-; AVX2-NEXT:    [[TMP46:%.*]] = getelementptr float, float* [[TMP42]], i32 0
-; AVX2-NEXT:    [[TMP47:%.*]] = bitcast float* [[TMP46]] to <8 x float>*
-; AVX2-NEXT:    call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP38]], <8 x float>* [[TMP47]], i32 4, <8 x i1> [[TMP18]])
-; AVX2-NEXT:    [[TMP48:%.*]] = getelementptr float, float* [[TMP42]], i32 8
-; AVX2-NEXT:    [[TMP49:%.*]] = bitcast float* [[TMP48]] to <8 x float>*
-; AVX2-NEXT:    call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP39]], <8 x float>* [[TMP49]], i32 4, <8 x i1> [[TMP19]])
-; AVX2-NEXT:    [[TMP50:%.*]] = getelementptr float, float* [[TMP42]], i32 16
-; AVX2-NEXT:    [[TMP51:%.*]] = bitcast float* [[TMP50]] to <8 x float>*
-; AVX2-NEXT:    call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP40]], <8 x float>* [[TMP51]], i32 4, <8 x i1> [[TMP20]])
-; AVX2-NEXT:    [[TMP52:%.*]] = getelementptr float, float* [[TMP42]], i32 24
-; AVX2-NEXT:    [[TMP53:%.*]] = bitcast float* [[TMP52]] to <8 x float>*
-; AVX2-NEXT:    call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP41]], <8 x float>* [[TMP53]], i32 4, <8 x i1> [[TMP21]])
+; AVX2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP2]]
+; AVX2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP3]]
+; AVX2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP4]]
+; AVX2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP5]]
+; AVX2-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0
+; AVX2-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[TMP10]], align 4
+; AVX2-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 8
+; AVX2-NEXT:    [[WIDE_LOAD5:%.*]] = load <8 x i32>, ptr [[TMP11]], align 4
+; AVX2-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 16
+; AVX2-NEXT:    [[WIDE_LOAD6:%.*]] = load <8 x i32>, ptr [[TMP12]], align 4
+; AVX2-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 24
+; AVX2-NEXT:    [[WIDE_LOAD7:%.*]] = load <8 x i32>, ptr [[TMP13]], align 4
+; AVX2-NEXT:    [[TMP14:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX2-NEXT:    [[TMP15:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD5]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX2-NEXT:    [[TMP16:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD6]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX2-NEXT:    [[TMP17:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD7]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX2-NEXT:    [[TMP18:%.*]] = getelementptr float, ptr [[B]], i64 [[TMP2]]
+; AVX2-NEXT:    [[TMP19:%.*]] = getelementptr float, ptr [[B]], i64 [[TMP3]]
+; AVX2-NEXT:    [[TMP20:%.*]] = getelementptr float, ptr [[B]], i64 [[TMP4]]
+; AVX2-NEXT:    [[TMP21:%.*]] = getelementptr float, ptr [[B]], i64 [[TMP5]]
+; AVX2-NEXT:    [[TMP22:%.*]] = getelementptr float, ptr [[TMP18]], i32 0
+; AVX2-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0(ptr [[TMP22]], i32 4, <8 x i1> [[TMP14]], <8 x float> poison)
+; AVX2-NEXT:    [[TMP23:%.*]] = getelementptr float, ptr [[TMP18]], i32 8
+; AVX2-NEXT:    [[WIDE_MASKED_LOAD8:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0(ptr [[TMP23]], i32 4, <8 x i1> [[TMP15]], <8 x float> poison)
+; AVX2-NEXT:    [[TMP24:%.*]] = getelementptr float, ptr [[TMP18]], i32 16
+; AVX2-NEXT:    [[WIDE_MASKED_LOAD9:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0(ptr [[TMP24]], i32 4, <8 x i1> [[TMP16]], <8 x float> poison)
+; AVX2-NEXT:    [[TMP25:%.*]] = getelementptr float, ptr [[TMP18]], i32 24
+; AVX2-NEXT:    [[WIDE_MASKED_LOAD10:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0(ptr [[TMP25]], i32 4, <8 x i1> [[TMP17]], <8 x float> poison)
+; AVX2-NEXT:    [[TMP26:%.*]] = sitofp <8 x i32> [[WIDE_LOAD]] to <8 x float>
+; AVX2-NEXT:    [[TMP27:%.*]] = sitofp <8 x i32> [[WIDE_LOAD5]] to <8 x float>
+; AVX2-NEXT:    [[TMP28:%.*]] = sitofp <8 x i32> [[WIDE_LOAD6]] to <8 x float>
+; AVX2-NEXT:    [[TMP29:%.*]] = sitofp <8 x i32> [[WIDE_LOAD7]] to <8 x float>
+; AVX2-NEXT:    [[TMP30:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD]], [[TMP26]]
+; AVX2-NEXT:    [[TMP31:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD8]], [[TMP27]]
+; AVX2-NEXT:    [[TMP32:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD9]], [[TMP28]]
+; AVX2-NEXT:    [[TMP33:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD10]], [[TMP29]]
+; AVX2-NEXT:    [[TMP34:%.*]] = getelementptr float, ptr [[A]], i64 [[TMP2]]
+; AVX2-NEXT:    [[TMP35:%.*]] = getelementptr float, ptr [[A]], i64 [[TMP3]]
+; AVX2-NEXT:    [[TMP36:%.*]] = getelementptr float, ptr [[A]], i64 [[TMP4]]
+; AVX2-NEXT:    [[TMP37:%.*]] = getelementptr float, ptr [[A]], i64 [[TMP5]]
+; AVX2-NEXT:    [[TMP38:%.*]] = getelementptr float, ptr [[TMP34]], i32 0
+; AVX2-NEXT:    call void @llvm.masked.store.v8f32.p0(<8 x float> [[TMP30]], ptr [[TMP38]], i32 4, <8 x i1> [[TMP14]])
+; AVX2-NEXT:    [[TMP39:%.*]] = getelementptr float, ptr [[TMP34]], i32 8
+; AVX2-NEXT:    call void @llvm.masked.store.v8f32.p0(<8 x float> [[TMP31]], ptr [[TMP39]], i32 4, <8 x i1> [[TMP15]])
+; AVX2-NEXT:    [[TMP40:%.*]] = getelementptr float, ptr [[TMP34]], i32 16
+; AVX2-NEXT:    call void @llvm.masked.store.v8f32.p0(<8 x float> [[TMP32]], ptr [[TMP40]], i32 4, <8 x i1> [[TMP16]])
+; AVX2-NEXT:    [[TMP41:%.*]] = getelementptr float, ptr [[TMP34]], i32 24
+; AVX2-NEXT:    call void @llvm.masked.store.v8f32.p0(<8 x float> [[TMP33]], ptr [[TMP41]], i32 4, <8 x i1> [[TMP17]])
 ; AVX2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
-; AVX2-NEXT:    [[TMP54:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984
-; AVX2-NEXT:    br i1 [[TMP54]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; AVX2-NEXT:    [[TMP42:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984
+; AVX2-NEXT:    br i1 [[TMP42]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; AVX2:       middle.block:
 ; AVX2-NEXT:    [[CMP_N:%.*]] = icmp eq i64 10000, 9984
 ; AVX2-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -848,17 +773,17 @@ define void @foo2(float* nocapture %A, float* nocapture readonly %B, i32* nocapt
 ; AVX2-NEXT:    br label [[FOR_BODY:%.*]]
 ; AVX2:       for.body:
 ; AVX2-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
-; AVX2-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]]
-; AVX2-NEXT:    [[TMP55:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; AVX2-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP55]], 100
+; AVX2-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[INDVARS_IV]]
+; AVX2-NEXT:    [[TMP43:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; AVX2-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP43]], 100
 ; AVX2-NEXT:    br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
 ; AVX2:       if.then:
-; AVX2-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV]]
-; AVX2-NEXT:    [[TMP56:%.*]] = load float, float* [[ARRAYIDX3]], align 4
-; AVX2-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP55]] to float
-; AVX2-NEXT:    [[ADD:%.*]] = fadd float [[TMP56]], [[CONV]]
-; AVX2-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]]
-; AVX2-NEXT:    store float [[ADD]], float* [[ARRAYIDX7]], align 4
+; AVX2-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[INDVARS_IV]]
+; AVX2-NEXT:    [[TMP44:%.*]] = load float, ptr [[ARRAYIDX3]], align 4
+; AVX2-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP43]] to float
+; AVX2-NEXT:    [[ADD:%.*]] = fadd float [[TMP44]], [[CONV]]
+; AVX2-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV]]
+; AVX2-NEXT:    store float [[ADD]], ptr [[ARRAYIDX7]], align 4
 ; AVX2-NEXT:    br label [[FOR_INC]]
 ; AVX2:       for.inc:
 ; AVX2-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
@@ -869,9 +794,9 @@ define void @foo2(float* nocapture %A, float* nocapture readonly %B, i32* nocapt
 ;
 ; AVX512-LABEL: @foo2(
 ; AVX512-NEXT:  iter.check:
-; AVX512-NEXT:    [[B3:%.*]] = ptrtoint float* [[B:%.*]] to i64
-; AVX512-NEXT:    [[TRIGGER2:%.*]] = ptrtoint i32* [[TRIGGER:%.*]] to i64
-; AVX512-NEXT:    [[A1:%.*]] = ptrtoint float* [[A:%.*]] to i64
+; AVX512-NEXT:    [[B3:%.*]] = ptrtoint ptr [[B:%.*]] to i64
+; AVX512-NEXT:    [[TRIGGER2:%.*]] = ptrtoint ptr [[TRIGGER:%.*]] to i64
+; AVX512-NEXT:    [[A1:%.*]] = ptrtoint ptr [[A:%.*]] to i64
 ; AVX512-NEXT:    br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
 ; AVX512:       vector.memcheck:
 ; AVX512-NEXT:    [[TMP0:%.*]] = sub i64 [[A1]], [[TRIGGER2]]
@@ -890,69 +815,57 @@ define void @foo2(float* nocapture %A, float* nocapture readonly %B, i32* nocapt
 ; AVX512-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 16
 ; AVX512-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 32
 ; AVX512-NEXT:    [[TMP5:%.*]] = add i64 [[INDEX]], 48
-; AVX512-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP2]]
-; AVX512-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP3]]
-; AVX512-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP4]]
-; AVX512-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP5]]
-; AVX512-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 0
-; AVX512-NEXT:    [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <16 x i32>*
-; AVX512-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i32>, <16 x i32>* [[TMP11]], align 4
-; AVX512-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 16
-; AVX512-NEXT:    [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <16 x i32>*
-; AVX512-NEXT:    [[WIDE_LOAD5:%.*]] = load <16 x i32>, <16 x i32>* [[TMP13]], align 4
-; AVX512-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 32
-; AVX512-NEXT:    [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <16 x i32>*
-; AVX512-NEXT:    [[WIDE_LOAD6:%.*]] = load <16 x i32>, <16 x i32>* [[TMP15]], align 4
-; AVX512-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 48
-; AVX512-NEXT:    [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <16 x i32>*
-; AVX512-NEXT:    [[WIDE_LOAD7:%.*]] = load <16 x i32>, <16 x i32>* [[TMP17]], align 4
-; AVX512-NEXT:    [[TMP18:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
-; AVX512-NEXT:    [[TMP19:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD5]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
-; AVX512-NEXT:    [[TMP20:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD6]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
-; AVX512-NEXT:    [[TMP21:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD7]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
-; AVX512-NEXT:    [[TMP22:%.*]] = getelementptr float, float* [[B]], i64 [[TMP2]]
-; AVX512-NEXT:    [[TMP23:%.*]] = getelementptr float, float* [[B]], i64 [[TMP3]]
-; AVX512-NEXT:    [[TMP24:%.*]] = getelementptr float, float* [[B]], i64 [[TMP4]]
-; AVX512-NEXT:    [[TMP25:%.*]] = getelementptr float, float* [[B]], i64 [[TMP5]]
-; AVX512-NEXT:    [[TMP26:%.*]] = getelementptr float, float* [[TMP22]], i32 0
-; AVX512-NEXT:    [[TMP27:%.*]] = bitcast float* [[TMP26]] to <16 x float>*
-; AVX512-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* [[TMP27]], i32 4, <16 x i1> [[TMP18]], <16 x float> poison)
-; AVX512-NEXT:    [[TMP28:%.*]] = getelementptr float, float* [[TMP22]], i32 16
-; AVX512-NEXT:    [[TMP29:%.*]] = bitcast float* [[TMP28]] to <16 x float>*
-; AVX512-NEXT:    [[WIDE_MASKED_LOAD8:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* [[TMP29]], i32 4, <16 x i1> [[TMP19]], <16 x float> poison)
-; AVX512-NEXT:    [[TMP30:%.*]] = getelementptr float, float* [[TMP22]], i32 32
-; AVX512-NEXT:    [[TMP31:%.*]] = bitcast float* [[TMP30]] to <16 x float>*
-; AVX512-NEXT:    [[WIDE_MASKED_LOAD9:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* [[TMP31]], i32 4, <16 x i1> [[TMP20]], <16 x float> poison)
-; AVX512-NEXT:    [[TMP32:%.*]] = getelementptr float, float* [[TMP22]], i32 48
-; AVX512-NEXT:    [[TMP33:%.*]] = bitcast float* [[TMP32]] to <16 x float>*
-; AVX512-NEXT:    [[WIDE_MASKED_LOAD10:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* [[TMP33]], i32 4, <16 x i1> [[TMP21]], <16 x float> poison)
-; AVX512-NEXT:    [[TMP34:%.*]] = sitofp <16 x i32> [[WIDE_LOAD]] to <16 x float>
-; AVX512-NEXT:    [[TMP35:%.*]] = sitofp <16 x i32> [[WIDE_LOAD5]] to <16 x float>
-; AVX512-NEXT:    [[TMP36:%.*]] = sitofp <16 x i32> [[WIDE_LOAD6]] to <16 x float>
-; AVX512-NEXT:    [[TMP37:%.*]] = sitofp <16 x i32> [[WIDE_LOAD7]] to <16 x float>
-; AVX512-NEXT:    [[TMP38:%.*]] = fadd <16 x float> [[WIDE_MASKED_LOAD]], [[TMP34]]
-; AVX512-NEXT:    [[TMP39:%.*]] = fadd <16 x float> [[WIDE_MASKED_LOAD8]], [[TMP35]]
-; AVX512-NEXT:    [[TMP40:%.*]] = fadd <16 x float> [[WIDE_MASKED_LOAD9]], [[TMP36]]
-; AVX512-NEXT:    [[TMP41:%.*]] = fadd <16 x float> [[WIDE_MASKED_LOAD10]], [[TMP37]]
-; AVX512-NEXT:    [[TMP42:%.*]] = getelementptr float, float* [[A]], i64 [[TMP2]]
-; AVX512-NEXT:    [[TMP43:%.*]] = getelementptr float, float* [[A]], i64 [[TMP3]]
-; AVX512-NEXT:    [[TMP44:%.*]] = getelementptr float, float* [[A]], i64 [[TMP4]]
-; AVX512-NEXT:    [[TMP45:%.*]] = getelementptr float, float* [[A]], i64 [[TMP5]]
-; AVX512-NEXT:    [[TMP46:%.*]] = getelementptr float, float* [[TMP42]], i32 0
-; AVX512-NEXT:    [[TMP47:%.*]] = bitcast float* [[TMP46]] to <16 x float>*
-; AVX512-NEXT:    call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP38]], <16 x float>* [[TMP47]], i32 4, <16 x i1> [[TMP18]])
-; AVX512-NEXT:    [[TMP48:%.*]] = getelementptr float, float* [[TMP42]], i32 16
-; AVX512-NEXT:    [[TMP49:%.*]] = bitcast float* [[TMP48]] to <16 x float>*
-; AVX512-NEXT:    call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP39]], <16 x float>* [[TMP49]], i32 4, <16 x i1> [[TMP19]])
-; AVX512-NEXT:    [[TMP50:%.*]] = getelementptr float, float* [[TMP42]], i32 32
-; AVX512-NEXT:    [[TMP51:%.*]] = bitcast float* [[TMP50]] to <16 x float>*
-; AVX512-NEXT:    call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP40]], <16 x float>* [[TMP51]], i32 4, <16 x i1> [[TMP20]])
-; AVX512-NEXT:    [[TMP52:%.*]] = getelementptr float, float* [[TMP42]], i32 48
-; AVX512-NEXT:    [[TMP53:%.*]] = bitcast float* [[TMP52]] to <16 x float>*
-; AVX512-NEXT:    call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP41]], <16 x float>* [[TMP53]], i32 4, <16 x i1> [[TMP21]])
+; AVX512-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP2]]
+; AVX512-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP3]]
+; AVX512-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP4]]
+; AVX512-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP5]]
+; AVX512-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0
+; AVX512-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i32>, ptr [[TMP10]], align 4
+; AVX512-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 16
+; AVX512-NEXT:    [[WIDE_LOAD5:%.*]] = load <16 x i32>, ptr [[TMP11]], align 4
+; AVX512-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 32
+; AVX512-NEXT:    [[WIDE_LOAD6:%.*]] = load <16 x i32>, ptr [[TMP12]], align 4
+; AVX512-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 48
+; AVX512-NEXT:    [[WIDE_LOAD7:%.*]] = load <16 x i32>, ptr [[TMP13]], align 4
+; AVX512-NEXT:    [[TMP14:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX512-NEXT:    [[TMP15:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD5]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX512-NEXT:    [[TMP16:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD6]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX512-NEXT:    [[TMP17:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD7]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX512-NEXT:    [[TMP18:%.*]] = getelementptr float, ptr [[B]], i64 [[TMP2]]
+; AVX512-NEXT:    [[TMP19:%.*]] = getelementptr float, ptr [[B]], i64 [[TMP3]]
+; AVX512-NEXT:    [[TMP20:%.*]] = getelementptr float, ptr [[B]], i64 [[TMP4]]
+; AVX512-NEXT:    [[TMP21:%.*]] = getelementptr float, ptr [[B]], i64 [[TMP5]]
+; AVX512-NEXT:    [[TMP22:%.*]] = getelementptr float, ptr [[TMP18]], i32 0
+; AVX512-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0(ptr [[TMP22]], i32 4, <16 x i1> [[TMP14]], <16 x float> poison)
+; AVX512-NEXT:    [[TMP23:%.*]] = getelementptr float, ptr [[TMP18]], i32 16
+; AVX512-NEXT:    [[WIDE_MASKED_LOAD8:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0(ptr [[TMP23]], i32 4, <16 x i1> [[TMP15]], <16 x float> poison)
+; AVX512-NEXT:    [[TMP24:%.*]] = getelementptr float, ptr [[TMP18]], i32 32
+; AVX512-NEXT:    [[WIDE_MASKED_LOAD9:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0(ptr [[TMP24]], i32 4, <16 x i1> [[TMP16]], <16 x float> poison)
+; AVX512-NEXT:    [[TMP25:%.*]] = getelementptr float, ptr [[TMP18]], i32 48
+; AVX512-NEXT:    [[WIDE_MASKED_LOAD10:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0(ptr [[TMP25]], i32 4, <16 x i1> [[TMP17]], <16 x float> poison)
+; AVX512-NEXT:    [[TMP26:%.*]] = sitofp <16 x i32> [[WIDE_LOAD]] to <16 x float>
+; AVX512-NEXT:    [[TMP27:%.*]] = sitofp <16 x i32> [[WIDE_LOAD5]] to <16 x float>
+; AVX512-NEXT:    [[TMP28:%.*]] = sitofp <16 x i32> [[WIDE_LOAD6]] to <16 x float>
+; AVX512-NEXT:    [[TMP29:%.*]] = sitofp <16 x i32> [[WIDE_LOAD7]] to <16 x float>
+; AVX512-NEXT:    [[TMP30:%.*]] = fadd <16 x float> [[WIDE_MASKED_LOAD]], [[TMP26]]
+; AVX512-NEXT:    [[TMP31:%.*]] = fadd <16 x float> [[WIDE_MASKED_LOAD8]], [[TMP27]]
+; AVX512-NEXT:    [[TMP32:%.*]] = fadd <16 x float> [[WIDE_MASKED_LOAD9]], [[TMP28]]
+; AVX512-NEXT:    [[TMP33:%.*]] = fadd <16 x float> [[WIDE_MASKED_LOAD10]], [[TMP29]]
+; AVX512-NEXT:    [[TMP34:%.*]] = getelementptr float, ptr [[A]], i64 [[TMP2]]
+; AVX512-NEXT:    [[TMP35:%.*]] = getelementptr float, ptr [[A]], i64 [[TMP3]]
+; AVX512-NEXT:    [[TMP36:%.*]] = getelementptr float, ptr [[A]], i64 [[TMP4]]
+; AVX512-NEXT:    [[TMP37:%.*]] = getelementptr float, ptr [[A]], i64 [[TMP5]]
+; AVX512-NEXT:    [[TMP38:%.*]] = getelementptr float, ptr [[TMP34]], i32 0
+; AVX512-NEXT:    call void @llvm.masked.store.v16f32.p0(<16 x float> [[TMP30]], ptr [[TMP38]], i32 4, <16 x i1> [[TMP14]])
+; AVX512-NEXT:    [[TMP39:%.*]] = getelementptr float, ptr [[TMP34]], i32 16
+; AVX512-NEXT:    call void @llvm.masked.store.v16f32.p0(<16 x float> [[TMP31]], ptr [[TMP39]], i32 4, <16 x i1> [[TMP15]])
+; AVX512-NEXT:    [[TMP40:%.*]] = getelementptr float, ptr [[TMP34]], i32 32
+; AVX512-NEXT:    call void @llvm.masked.store.v16f32.p0(<16 x float> [[TMP32]], ptr [[TMP40]], i32 4, <16 x i1> [[TMP16]])
+; AVX512-NEXT:    [[TMP41:%.*]] = getelementptr float, ptr [[TMP34]], i32 48
+; AVX512-NEXT:    call void @llvm.masked.store.v16f32.p0(<16 x float> [[TMP33]], ptr [[TMP41]], i32 4, <16 x i1> [[TMP17]])
 ; AVX512-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 64
-; AVX512-NEXT:    [[TMP54:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984
-; AVX512-NEXT:    br i1 [[TMP54]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; AVX512-NEXT:    [[TMP42:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984
+; AVX512-NEXT:    br i1 [[TMP42]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; AVX512:       middle.block:
 ; AVX512-NEXT:    [[CMP_N:%.*]] = icmp eq i64 10000, 9984
 ; AVX512-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
@@ -962,26 +875,23 @@ define void @foo2(float* nocapture %A, float* nocapture readonly %B, i32* nocapt
 ; AVX512-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 9984, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; AVX512-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
 ; AVX512:       vec.epilog.vector.body:
-; AVX512-NEXT:    [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT15:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; AVX512-NEXT:    [[TMP55:%.*]] = add i64 [[OFFSET_IDX]], 0
-; AVX512-NEXT:    [[TMP56:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP55]]
-; AVX512-NEXT:    [[TMP57:%.*]] = getelementptr inbounds i32, i32* [[TMP56]], i32 0
-; AVX512-NEXT:    [[TMP58:%.*]] = bitcast i32* [[TMP57]] to <8 x i32>*
-; AVX512-NEXT:    [[WIDE_LOAD13:%.*]] = load <8 x i32>, <8 x i32>* [[TMP58]], align 4
-; AVX512-NEXT:    [[TMP59:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD13]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
-; AVX512-NEXT:    [[TMP60:%.*]] = getelementptr float, float* [[B]], i64 [[TMP55]]
-; AVX512-NEXT:    [[TMP61:%.*]] = getelementptr float, float* [[TMP60]], i32 0
-; AVX512-NEXT:    [[TMP62:%.*]] = bitcast float* [[TMP61]] to <8 x float>*
-; AVX512-NEXT:    [[WIDE_MASKED_LOAD14:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP62]], i32 4, <8 x i1> [[TMP59]], <8 x float> poison)
-; AVX512-NEXT:    [[TMP63:%.*]] = sitofp <8 x i32> [[WIDE_LOAD13]] to <8 x float>
-; AVX512-NEXT:    [[TMP64:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD14]], [[TMP63]]
-; AVX512-NEXT:    [[TMP65:%.*]] = getelementptr float, float* [[A]], i64 [[TMP55]]
-; AVX512-NEXT:    [[TMP66:%.*]] = getelementptr float, float* [[TMP65]], i32 0
-; AVX512-NEXT:    [[TMP67:%.*]] = bitcast float* [[TMP66]] to <8 x float>*
-; AVX512-NEXT:    call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP64]], <8 x float>* [[TMP67]], i32 4, <8 x i1> [[TMP59]])
-; AVX512-NEXT:    [[INDEX_NEXT15]] = add nuw i64 [[OFFSET_IDX]], 8
-; AVX512-NEXT:    [[TMP68:%.*]] = icmp eq i64 [[INDEX_NEXT15]], 10000
-; AVX512-NEXT:    br i1 [[TMP68]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; AVX512-NEXT:    [[INDEX12:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT15:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; AVX512-NEXT:    [[TMP43:%.*]] = add i64 [[INDEX12]], 0
+; AVX512-NEXT:    [[TMP44:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP43]]
+; AVX512-NEXT:    [[TMP45:%.*]] = getelementptr inbounds i32, ptr [[TMP44]], i32 0
+; AVX512-NEXT:    [[WIDE_LOAD13:%.*]] = load <8 x i32>, ptr [[TMP45]], align 4
+; AVX512-NEXT:    [[TMP46:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD13]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX512-NEXT:    [[TMP47:%.*]] = getelementptr float, ptr [[B]], i64 [[TMP43]]
+; AVX512-NEXT:    [[TMP48:%.*]] = getelementptr float, ptr [[TMP47]], i32 0
+; AVX512-NEXT:    [[WIDE_MASKED_LOAD14:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0(ptr [[TMP48]], i32 4, <8 x i1> [[TMP46]], <8 x float> poison)
+; AVX512-NEXT:    [[TMP49:%.*]] = sitofp <8 x i32> [[WIDE_LOAD13]] to <8 x float>
+; AVX512-NEXT:    [[TMP50:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD14]], [[TMP49]]
+; AVX512-NEXT:    [[TMP51:%.*]] = getelementptr float, ptr [[A]], i64 [[TMP43]]
+; AVX512-NEXT:    [[TMP52:%.*]] = getelementptr float, ptr [[TMP51]], i32 0
+; AVX512-NEXT:    call void @llvm.masked.store.v8f32.p0(<8 x float> [[TMP50]], ptr [[TMP52]], i32 4, <8 x i1> [[TMP46]])
+; AVX512-NEXT:    [[INDEX_NEXT15]] = add nuw i64 [[INDEX12]], 8
+; AVX512-NEXT:    [[TMP53:%.*]] = icmp eq i64 [[INDEX_NEXT15]], 10000
+; AVX512-NEXT:    br i1 [[TMP53]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
 ; AVX512:       vec.epilog.middle.block:
 ; AVX512-NEXT:    [[CMP_N11:%.*]] = icmp eq i64 10000, 10000
 ; AVX512-NEXT:    br i1 [[CMP_N11]], label [[FOR_END]], label [[VEC_EPILOG_SCALAR_PH]]
@@ -990,17 +900,17 @@ define void @foo2(float* nocapture %A, float* nocapture readonly %B, i32* nocapt
 ; AVX512-NEXT:    br label [[FOR_BODY:%.*]]
 ; AVX512:       for.body:
 ; AVX512-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
-; AVX512-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]]
-; AVX512-NEXT:    [[TMP69:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; AVX512-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP69]], 100
+; AVX512-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[INDVARS_IV]]
+; AVX512-NEXT:    [[TMP54:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; AVX512-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP54]], 100
 ; AVX512-NEXT:    br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
 ; AVX512:       if.then:
-; AVX512-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV]]
-; AVX512-NEXT:    [[TMP70:%.*]] = load float, float* [[ARRAYIDX3]], align 4
-; AVX512-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP69]] to float
-; AVX512-NEXT:    [[ADD:%.*]] = fadd float [[TMP70]], [[CONV]]
-; AVX512-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]]
-; AVX512-NEXT:    store float [[ADD]], float* [[ARRAYIDX7]], align 4
+; AVX512-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[INDVARS_IV]]
+; AVX512-NEXT:    [[TMP55:%.*]] = load float, ptr [[ARRAYIDX3]], align 4
+; AVX512-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP54]] to float
+; AVX512-NEXT:    [[ADD:%.*]] = fadd float [[TMP55]], [[CONV]]
+; AVX512-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV]]
+; AVX512-NEXT:    store float [[ADD]], ptr [[ARRAYIDX7]], align 4
 ; AVX512-NEXT:    br label [[FOR_INC]]
 ; AVX512:       for.inc:
 ; AVX512-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
@@ -1014,18 +924,18 @@ entry:
 
 for.body:                                         ; preds = %for.inc, %entry
   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ]
-  %arrayidx = getelementptr inbounds i32, i32* %trigger, i64 %indvars.iv
-  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx = getelementptr inbounds i32, ptr %trigger, i64 %indvars.iv
+  %0 = load i32, ptr %arrayidx, align 4
   %cmp1 = icmp slt i32 %0, 100
   br i1 %cmp1, label %if.then, label %for.inc
 
 if.then:                                          ; preds = %for.body
-  %arrayidx3 = getelementptr inbounds float, float* %B, i64 %indvars.iv
-  %1 = load float, float* %arrayidx3, align 4
+  %arrayidx3 = getelementptr inbounds float, ptr %B, i64 %indvars.iv
+  %1 = load float, ptr %arrayidx3, align 4
   %conv = sitofp i32 %0 to float
   %add = fadd float %1, %conv
-  %arrayidx7 = getelementptr inbounds float, float* %A, i64 %indvars.iv
-  store float %add, float* %arrayidx7, align 4
+  %arrayidx7 = getelementptr inbounds float, ptr %A, i64 %indvars.iv
+  store float %add, ptr %arrayidx7, align 4
   br label %for.inc
 
 for.inc:                                          ; preds = %for.body, %if.then
@@ -1039,7 +949,7 @@ for.end:                                          ; preds = %for.inc
 
 ; The source code:
 ;
-;void foo3(double *A, double *B, int *trigger) {
+;void foo3(ptr A, ptr B, int *trigger) {
 ;
 ;  for (int i=0; i<10000; i++) {
 ;    if (trigger[i] < 100) {
@@ -1048,27 +958,21 @@ for.end:                                          ; preds = %for.inc
 ;  }
 ;}
 
-define void @foo3(double* nocapture %A, double* nocapture readonly %B, i32* nocapture readonly %trigger) local_unnamed_addr #0 {
+define void @foo3(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture readonly %trigger) local_unnamed_addr #0 {
 ; AVX-LABEL: @foo3(
 ; AVX-NEXT:  entry:
-; AVX-NEXT:    [[A1:%.*]] = bitcast double* [[A:%.*]] to i8*
-; AVX-NEXT:    [[TRIGGER3:%.*]] = bitcast i32* [[TRIGGER:%.*]] to i8*
-; AVX-NEXT:    [[B6:%.*]] = bitcast double* [[B:%.*]] to i8*
 ; AVX-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
 ; AVX:       vector.memcheck:
-; AVX-NEXT:    [[SCEVGEP:%.*]] = getelementptr double, double* [[A]], i64 10000
-; AVX-NEXT:    [[SCEVGEP2:%.*]] = bitcast double* [[SCEVGEP]] to i8*
-; AVX-NEXT:    [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[TRIGGER]], i64 10000
-; AVX-NEXT:    [[SCEVGEP45:%.*]] = bitcast i32* [[SCEVGEP4]] to i8*
-; AVX-NEXT:    [[SCEVGEP7:%.*]] = getelementptr double, double* [[B]], i64 10000
-; AVX-NEXT:    [[SCEVGEP78:%.*]] = bitcast double* [[SCEVGEP7]] to i8*
-; AVX-NEXT:    [[BOUND0:%.*]] = icmp ult i8* [[A1]], [[SCEVGEP45]]
-; AVX-NEXT:    [[BOUND1:%.*]] = icmp ult i8* [[TRIGGER3]], [[SCEVGEP2]]
+; AVX-NEXT:    [[UGLYGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 80000
+; AVX-NEXT:    [[UGLYGEP1:%.*]] = getelementptr i8, ptr [[TRIGGER:%.*]], i64 40000
+; AVX-NEXT:    [[UGLYGEP2:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 80000
+; AVX-NEXT:    [[BOUND0:%.*]] = icmp ult ptr [[A]], [[UGLYGEP1]]
+; AVX-NEXT:    [[BOUND1:%.*]] = icmp ult ptr [[TRIGGER]], [[UGLYGEP]]
 ; AVX-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
-; AVX-NEXT:    [[BOUND09:%.*]] = icmp ult i8* [[A1]], [[SCEVGEP78]]
-; AVX-NEXT:    [[BOUND110:%.*]] = icmp ult i8* [[B6]], [[SCEVGEP2]]
-; AVX-NEXT:    [[FOUND_CONFLICT11:%.*]] = and i1 [[BOUND09]], [[BOUND110]]
-; AVX-NEXT:    [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT11]]
+; AVX-NEXT:    [[BOUND03:%.*]] = icmp ult ptr [[A]], [[UGLYGEP2]]
+; AVX-NEXT:    [[BOUND14:%.*]] = icmp ult ptr [[B]], [[UGLYGEP]]
+; AVX-NEXT:    [[FOUND_CONFLICT5:%.*]] = and i1 [[BOUND03]], [[BOUND14]]
+; AVX-NEXT:    [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]]
 ; AVX-NEXT:    br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
 ; AVX:       vector.ph:
 ; AVX-NEXT:    br label [[VECTOR_BODY:%.*]]
@@ -1078,69 +982,57 @@ define void @foo3(double* nocapture %A, double* nocapture readonly %B, i32* noca
 ; AVX-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 4
 ; AVX-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 8
 ; AVX-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 12
-; AVX-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP0]]
-; AVX-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP1]]
-; AVX-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP2]]
-; AVX-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP3]]
-; AVX-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0
-; AVX-NEXT:    [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>*
-; AVX-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4, !alias.scope !7
-; AVX-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 4
-; AVX-NEXT:    [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <4 x i32>*
-; AVX-NEXT:    [[WIDE_LOAD12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP11]], align 4, !alias.scope !7
-; AVX-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 8
-; AVX-NEXT:    [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <4 x i32>*
-; AVX-NEXT:    [[WIDE_LOAD13:%.*]] = load <4 x i32>, <4 x i32>* [[TMP13]], align 4, !alias.scope !7
-; AVX-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 12
-; AVX-NEXT:    [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <4 x i32>*
-; AVX-NEXT:    [[WIDE_LOAD14:%.*]] = load <4 x i32>, <4 x i32>* [[TMP15]], align 4, !alias.scope !7
-; AVX-NEXT:    [[TMP16:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100>
-; AVX-NEXT:    [[TMP17:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD12]], <i32 100, i32 100, i32 100, i32 100>
-; AVX-NEXT:    [[TMP18:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD13]], <i32 100, i32 100, i32 100, i32 100>
-; AVX-NEXT:    [[TMP19:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD14]], <i32 100, i32 100, i32 100, i32 100>
-; AVX-NEXT:    [[TMP20:%.*]] = getelementptr double, double* [[B]], i64 [[TMP0]]
-; AVX-NEXT:    [[TMP21:%.*]] = getelementptr double, double* [[B]], i64 [[TMP1]]
-; AVX-NEXT:    [[TMP22:%.*]] = getelementptr double, double* [[B]], i64 [[TMP2]]
-; AVX-NEXT:    [[TMP23:%.*]] = getelementptr double, double* [[B]], i64 [[TMP3]]
-; AVX-NEXT:    [[TMP24:%.*]] = getelementptr double, double* [[TMP20]], i32 0
-; AVX-NEXT:    [[TMP25:%.*]] = bitcast double* [[TMP24]] to <4 x double>*
-; AVX-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP25]], i32 8, <4 x i1> [[TMP16]], <4 x double> poison), !alias.scope !10
-; AVX-NEXT:    [[TMP26:%.*]] = getelementptr double, double* [[TMP20]], i32 4
-; AVX-NEXT:    [[TMP27:%.*]] = bitcast double* [[TMP26]] to <4 x double>*
-; AVX-NEXT:    [[WIDE_MASKED_LOAD15:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP27]], i32 8, <4 x i1> [[TMP17]], <4 x double> poison), !alias.scope !10
-; AVX-NEXT:    [[TMP28:%.*]] = getelementptr double, double* [[TMP20]], i32 8
-; AVX-NEXT:    [[TMP29:%.*]] = bitcast double* [[TMP28]] to <4 x double>*
-; AVX-NEXT:    [[WIDE_MASKED_LOAD16:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP29]], i32 8, <4 x i1> [[TMP18]], <4 x double> poison), !alias.scope !10
-; AVX-NEXT:    [[TMP30:%.*]] = getelementptr double, double* [[TMP20]], i32 12
-; AVX-NEXT:    [[TMP31:%.*]] = bitcast double* [[TMP30]] to <4 x double>*
-; AVX-NEXT:    [[WIDE_MASKED_LOAD17:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP31]], i32 8, <4 x i1> [[TMP19]], <4 x double> poison), !alias.scope !10
-; AVX-NEXT:    [[TMP32:%.*]] = sitofp <4 x i32> [[WIDE_LOAD]] to <4 x double>
-; AVX-NEXT:    [[TMP33:%.*]] = sitofp <4 x i32> [[WIDE_LOAD12]] to <4 x double>
-; AVX-NEXT:    [[TMP34:%.*]] = sitofp <4 x i32> [[WIDE_LOAD13]] to <4 x double>
-; AVX-NEXT:    [[TMP35:%.*]] = sitofp <4 x i32> [[WIDE_LOAD14]] to <4 x double>
-; AVX-NEXT:    [[TMP36:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD]], [[TMP32]]
-; AVX-NEXT:    [[TMP37:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD15]], [[TMP33]]
-; AVX-NEXT:    [[TMP38:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD16]], [[TMP34]]
-; AVX-NEXT:    [[TMP39:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD17]], [[TMP35]]
-; AVX-NEXT:    [[TMP40:%.*]] = getelementptr double, double* [[A]], i64 [[TMP0]]
-; AVX-NEXT:    [[TMP41:%.*]] = getelementptr double, double* [[A]], i64 [[TMP1]]
-; AVX-NEXT:    [[TMP42:%.*]] = getelementptr double, double* [[A]], i64 [[TMP2]]
-; AVX-NEXT:    [[TMP43:%.*]] = getelementptr double, double* [[A]], i64 [[TMP3]]
-; AVX-NEXT:    [[TMP44:%.*]] = getelementptr double, double* [[TMP40]], i32 0
-; AVX-NEXT:    [[TMP45:%.*]] = bitcast double* [[TMP44]] to <4 x double>*
-; AVX-NEXT:    call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP36]], <4 x double>* [[TMP45]], i32 8, <4 x i1> [[TMP16]]), !alias.scope !12, !noalias !14
-; AVX-NEXT:    [[TMP46:%.*]] = getelementptr double, double* [[TMP40]], i32 4
-; AVX-NEXT:    [[TMP47:%.*]] = bitcast double* [[TMP46]] to <4 x double>*
-; AVX-NEXT:    call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP37]], <4 x double>* [[TMP47]], i32 8, <4 x i1> [[TMP17]]), !alias.scope !12, !noalias !14
-; AVX-NEXT:    [[TMP48:%.*]] = getelementptr double, double* [[TMP40]], i32 8
-; AVX-NEXT:    [[TMP49:%.*]] = bitcast double* [[TMP48]] to <4 x double>*
-; AVX-NEXT:    call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP38]], <4 x double>* [[TMP49]], i32 8, <4 x i1> [[TMP18]]), !alias.scope !12, !noalias !14
-; AVX-NEXT:    [[TMP50:%.*]] = getelementptr double, double* [[TMP40]], i32 12
-; AVX-NEXT:    [[TMP51:%.*]] = bitcast double* [[TMP50]] to <4 x double>*
-; AVX-NEXT:    call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP39]], <4 x double>* [[TMP51]], i32 8, <4 x i1> [[TMP19]]), !alias.scope !12, !noalias !14
+; AVX-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP0]]
+; AVX-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP1]]
+; AVX-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP2]]
+; AVX-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP3]]
+; AVX-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0
+; AVX-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP8]], align 4, !alias.scope !7
+; AVX-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 4
+; AVX-NEXT:    [[WIDE_LOAD6:%.*]] = load <4 x i32>, ptr [[TMP9]], align 4, !alias.scope !7
+; AVX-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 8
+; AVX-NEXT:    [[WIDE_LOAD7:%.*]] = load <4 x i32>, ptr [[TMP10]], align 4, !alias.scope !7
+; AVX-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 12
+; AVX-NEXT:    [[WIDE_LOAD8:%.*]] = load <4 x i32>, ptr [[TMP11]], align 4, !alias.scope !7
+; AVX-NEXT:    [[TMP12:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100>
+; AVX-NEXT:    [[TMP13:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD6]], <i32 100, i32 100, i32 100, i32 100>
+; AVX-NEXT:    [[TMP14:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD7]], <i32 100, i32 100, i32 100, i32 100>
+; AVX-NEXT:    [[TMP15:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD8]], <i32 100, i32 100, i32 100, i32 100>
+; AVX-NEXT:    [[TMP16:%.*]] = getelementptr double, ptr [[B]], i64 [[TMP0]]
+; AVX-NEXT:    [[TMP17:%.*]] = getelementptr double, ptr [[B]], i64 [[TMP1]]
+; AVX-NEXT:    [[TMP18:%.*]] = getelementptr double, ptr [[B]], i64 [[TMP2]]
+; AVX-NEXT:    [[TMP19:%.*]] = getelementptr double, ptr [[B]], i64 [[TMP3]]
+; AVX-NEXT:    [[TMP20:%.*]] = getelementptr double, ptr [[TMP16]], i32 0
+; AVX-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP20]], i32 8, <4 x i1> [[TMP12]], <4 x double> poison), !alias.scope !10
+; AVX-NEXT:    [[TMP21:%.*]] = getelementptr double, ptr [[TMP16]], i32 4
+; AVX-NEXT:    [[WIDE_MASKED_LOAD9:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP21]], i32 8, <4 x i1> [[TMP13]], <4 x double> poison), !alias.scope !10
+; AVX-NEXT:    [[TMP22:%.*]] = getelementptr double, ptr [[TMP16]], i32 8
+; AVX-NEXT:    [[WIDE_MASKED_LOAD10:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP22]], i32 8, <4 x i1> [[TMP14]], <4 x double> poison), !alias.scope !10
+; AVX-NEXT:    [[TMP23:%.*]] = getelementptr double, ptr [[TMP16]], i32 12
+; AVX-NEXT:    [[WIDE_MASKED_LOAD11:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP23]], i32 8, <4 x i1> [[TMP15]], <4 x double> poison), !alias.scope !10
+; AVX-NEXT:    [[TMP24:%.*]] = sitofp <4 x i32> [[WIDE_LOAD]] to <4 x double>
+; AVX-NEXT:    [[TMP25:%.*]] = sitofp <4 x i32> [[WIDE_LOAD6]] to <4 x double>
+; AVX-NEXT:    [[TMP26:%.*]] = sitofp <4 x i32> [[WIDE_LOAD7]] to <4 x double>
+; AVX-NEXT:    [[TMP27:%.*]] = sitofp <4 x i32> [[WIDE_LOAD8]] to <4 x double>
+; AVX-NEXT:    [[TMP28:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD]], [[TMP24]]
+; AVX-NEXT:    [[TMP29:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD9]], [[TMP25]]
+; AVX-NEXT:    [[TMP30:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD10]], [[TMP26]]
+; AVX-NEXT:    [[TMP31:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD11]], [[TMP27]]
+; AVX-NEXT:    [[TMP32:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP0]]
+; AVX-NEXT:    [[TMP33:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP1]]
+; AVX-NEXT:    [[TMP34:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP2]]
+; AVX-NEXT:    [[TMP35:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP3]]
+; AVX-NEXT:    [[TMP36:%.*]] = getelementptr double, ptr [[TMP32]], i32 0
+; AVX-NEXT:    call void @llvm.masked.store.v4f64.p0(<4 x double> [[TMP28]], ptr [[TMP36]], i32 8, <4 x i1> [[TMP12]]), !alias.scope !12, !noalias !14
+; AVX-NEXT:    [[TMP37:%.*]] = getelementptr double, ptr [[TMP32]], i32 4
+; AVX-NEXT:    call void @llvm.masked.store.v4f64.p0(<4 x double> [[TMP29]], ptr [[TMP37]], i32 8, <4 x i1> [[TMP13]]), !alias.scope !12, !noalias !14
+; AVX-NEXT:    [[TMP38:%.*]] = getelementptr double, ptr [[TMP32]], i32 8
+; AVX-NEXT:    call void @llvm.masked.store.v4f64.p0(<4 x double> [[TMP30]], ptr [[TMP38]], i32 8, <4 x i1> [[TMP14]]), !alias.scope !12, !noalias !14
+; AVX-NEXT:    [[TMP39:%.*]] = getelementptr double, ptr [[TMP32]], i32 12
+; AVX-NEXT:    call void @llvm.masked.store.v4f64.p0(<4 x double> [[TMP31]], ptr [[TMP39]], i32 8, <4 x i1> [[TMP15]]), !alias.scope !12, !noalias !14
 ; AVX-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; AVX-NEXT:    [[TMP52:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000
-; AVX-NEXT:    br i1 [[TMP52]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
+; AVX-NEXT:    [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000
+; AVX-NEXT:    br i1 [[TMP40]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
 ; AVX:       middle.block:
 ; AVX-NEXT:    [[CMP_N:%.*]] = icmp eq i64 10000, 10000
 ; AVX-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -1149,17 +1041,17 @@ define void @foo3(double* nocapture %A, double* nocapture readonly %B, i32* noca
 ; AVX-NEXT:    br label [[FOR_BODY:%.*]]
 ; AVX:       for.body:
 ; AVX-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
-; AVX-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]]
-; AVX-NEXT:    [[TMP53:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; AVX-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP53]], 100
+; AVX-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[INDVARS_IV]]
+; AVX-NEXT:    [[TMP41:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; AVX-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP41]], 100
 ; AVX-NEXT:    br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
 ; AVX:       if.then:
-; AVX-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[INDVARS_IV]]
-; AVX-NEXT:    [[TMP54:%.*]] = load double, double* [[ARRAYIDX3]], align 8
-; AVX-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP53]] to double
-; AVX-NEXT:    [[ADD:%.*]] = fadd double [[TMP54]], [[CONV]]
-; AVX-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDVARS_IV]]
-; AVX-NEXT:    store double [[ADD]], double* [[ARRAYIDX7]], align 8
+; AVX-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[INDVARS_IV]]
+; AVX-NEXT:    [[TMP42:%.*]] = load double, ptr [[ARRAYIDX3]], align 8
+; AVX-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP41]] to double
+; AVX-NEXT:    [[ADD:%.*]] = fadd double [[TMP42]], [[CONV]]
+; AVX-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[INDVARS_IV]]
+; AVX-NEXT:    store double [[ADD]], ptr [[ARRAYIDX7]], align 8
 ; AVX-NEXT:    br label [[FOR_INC]]
 ; AVX:       for.inc:
 ; AVX-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
@@ -1170,24 +1062,18 @@ define void @foo3(double* nocapture %A, double* nocapture readonly %B, i32* noca
 ;
 ; AVX512-LABEL: @foo3(
 ; AVX512-NEXT:  entry:
-; AVX512-NEXT:    [[A1:%.*]] = bitcast double* [[A:%.*]] to i8*
-; AVX512-NEXT:    [[TRIGGER3:%.*]] = bitcast i32* [[TRIGGER:%.*]] to i8*
-; AVX512-NEXT:    [[B6:%.*]] = bitcast double* [[B:%.*]] to i8*
 ; AVX512-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
 ; AVX512:       vector.memcheck:
-; AVX512-NEXT:    [[SCEVGEP:%.*]] = getelementptr double, double* [[A]], i64 10000
-; AVX512-NEXT:    [[SCEVGEP2:%.*]] = bitcast double* [[SCEVGEP]] to i8*
-; AVX512-NEXT:    [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[TRIGGER]], i64 10000
-; AVX512-NEXT:    [[SCEVGEP45:%.*]] = bitcast i32* [[SCEVGEP4]] to i8*
-; AVX512-NEXT:    [[SCEVGEP7:%.*]] = getelementptr double, double* [[B]], i64 10000
-; AVX512-NEXT:    [[SCEVGEP78:%.*]] = bitcast double* [[SCEVGEP7]] to i8*
-; AVX512-NEXT:    [[BOUND0:%.*]] = icmp ult i8* [[A1]], [[SCEVGEP45]]
-; AVX512-NEXT:    [[BOUND1:%.*]] = icmp ult i8* [[TRIGGER3]], [[SCEVGEP2]]
+; AVX512-NEXT:    [[UGLYGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 80000
+; AVX512-NEXT:    [[UGLYGEP1:%.*]] = getelementptr i8, ptr [[TRIGGER:%.*]], i64 40000
+; AVX512-NEXT:    [[UGLYGEP2:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 80000
+; AVX512-NEXT:    [[BOUND0:%.*]] = icmp ult ptr [[A]], [[UGLYGEP1]]
+; AVX512-NEXT:    [[BOUND1:%.*]] = icmp ult ptr [[TRIGGER]], [[UGLYGEP]]
 ; AVX512-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
-; AVX512-NEXT:    [[BOUND09:%.*]] = icmp ult i8* [[A1]], [[SCEVGEP78]]
-; AVX512-NEXT:    [[BOUND110:%.*]] = icmp ult i8* [[B6]], [[SCEVGEP2]]
-; AVX512-NEXT:    [[FOUND_CONFLICT11:%.*]] = and i1 [[BOUND09]], [[BOUND110]]
-; AVX512-NEXT:    [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT11]]
+; AVX512-NEXT:    [[BOUND03:%.*]] = icmp ult ptr [[A]], [[UGLYGEP2]]
+; AVX512-NEXT:    [[BOUND14:%.*]] = icmp ult ptr [[B]], [[UGLYGEP]]
+; AVX512-NEXT:    [[FOUND_CONFLICT5:%.*]] = and i1 [[BOUND03]], [[BOUND14]]
+; AVX512-NEXT:    [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]]
 ; AVX512-NEXT:    br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
 ; AVX512:       vector.ph:
 ; AVX512-NEXT:    br label [[VECTOR_BODY:%.*]]
@@ -1197,69 +1083,57 @@ define void @foo3(double* nocapture %A, double* nocapture readonly %B, i32* noca
 ; AVX512-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 8
 ; AVX512-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 16
 ; AVX512-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 24
-; AVX512-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP0]]
-; AVX512-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP1]]
-; AVX512-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP2]]
-; AVX512-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP3]]
-; AVX512-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0
-; AVX512-NEXT:    [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <8 x i32>*
-; AVX512-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP9]], align 4, !alias.scope !11
-; AVX512-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 8
-; AVX512-NEXT:    [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <8 x i32>*
-; AVX512-NEXT:    [[WIDE_LOAD12:%.*]] = load <8 x i32>, <8 x i32>* [[TMP11]], align 4, !alias.scope !11
-; AVX512-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 16
-; AVX512-NEXT:    [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <8 x i32>*
-; AVX512-NEXT:    [[WIDE_LOAD13:%.*]] = load <8 x i32>, <8 x i32>* [[TMP13]], align 4, !alias.scope !11
-; AVX512-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 24
-; AVX512-NEXT:    [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <8 x i32>*
-; AVX512-NEXT:    [[WIDE_LOAD14:%.*]] = load <8 x i32>, <8 x i32>* [[TMP15]], align 4, !alias.scope !11
-; AVX512-NEXT:    [[TMP16:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
-; AVX512-NEXT:    [[TMP17:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD12]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
-; AVX512-NEXT:    [[TMP18:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD13]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
-; AVX512-NEXT:    [[TMP19:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD14]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
-; AVX512-NEXT:    [[TMP20:%.*]] = getelementptr double, double* [[B]], i64 [[TMP0]]
-; AVX512-NEXT:    [[TMP21:%.*]] = getelementptr double, double* [[B]], i64 [[TMP1]]
-; AVX512-NEXT:    [[TMP22:%.*]] = getelementptr double, double* [[B]], i64 [[TMP2]]
-; AVX512-NEXT:    [[TMP23:%.*]] = getelementptr double, double* [[B]], i64 [[TMP3]]
-; AVX512-NEXT:    [[TMP24:%.*]] = getelementptr double, double* [[TMP20]], i32 0
-; AVX512-NEXT:    [[TMP25:%.*]] = bitcast double* [[TMP24]] to <8 x double>*
-; AVX512-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP25]], i32 8, <8 x i1> [[TMP16]], <8 x double> poison), !alias.scope !14
-; AVX512-NEXT:    [[TMP26:%.*]] = getelementptr double, double* [[TMP20]], i32 8
-; AVX512-NEXT:    [[TMP27:%.*]] = bitcast double* [[TMP26]] to <8 x double>*
-; AVX512-NEXT:    [[WIDE_MASKED_LOAD15:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP27]], i32 8, <8 x i1> [[TMP17]], <8 x double> poison), !alias.scope !14
-; AVX512-NEXT:    [[TMP28:%.*]] = getelementptr double, double* [[TMP20]], i32 16
-; AVX512-NEXT:    [[TMP29:%.*]] = bitcast double* [[TMP28]] to <8 x double>*
-; AVX512-NEXT:    [[WIDE_MASKED_LOAD16:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP29]], i32 8, <8 x i1> [[TMP18]], <8 x double> poison), !alias.scope !14
-; AVX512-NEXT:    [[TMP30:%.*]] = getelementptr double, double* [[TMP20]], i32 24
-; AVX512-NEXT:    [[TMP31:%.*]] = bitcast double* [[TMP30]] to <8 x double>*
-; AVX512-NEXT:    [[WIDE_MASKED_LOAD17:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP31]], i32 8, <8 x i1> [[TMP19]], <8 x double> poison), !alias.scope !14
-; AVX512-NEXT:    [[TMP32:%.*]] = sitofp <8 x i32> [[WIDE_LOAD]] to <8 x double>
-; AVX512-NEXT:    [[TMP33:%.*]] = sitofp <8 x i32> [[WIDE_LOAD12]] to <8 x double>
-; AVX512-NEXT:    [[TMP34:%.*]] = sitofp <8 x i32> [[WIDE_LOAD13]] to <8 x double>
-; AVX512-NEXT:    [[TMP35:%.*]] = sitofp <8 x i32> [[WIDE_LOAD14]] to <8 x double>
-; AVX512-NEXT:    [[TMP36:%.*]] = fadd <8 x double> [[WIDE_MASKED_LOAD]], [[TMP32]]
-; AVX512-NEXT:    [[TMP37:%.*]] = fadd <8 x double> [[WIDE_MASKED_LOAD15]], [[TMP33]]
-; AVX512-NEXT:    [[TMP38:%.*]] = fadd <8 x double> [[WIDE_MASKED_LOAD16]], [[TMP34]]
-; AVX512-NEXT:    [[TMP39:%.*]] = fadd <8 x double> [[WIDE_MASKED_LOAD17]], [[TMP35]]
-; AVX512-NEXT:    [[TMP40:%.*]] = getelementptr double, double* [[A]], i64 [[TMP0]]
-; AVX512-NEXT:    [[TMP41:%.*]] = getelementptr double, double* [[A]], i64 [[TMP1]]
-; AVX512-NEXT:    [[TMP42:%.*]] = getelementptr double, double* [[A]], i64 [[TMP2]]
-; AVX512-NEXT:    [[TMP43:%.*]] = getelementptr double, double* [[A]], i64 [[TMP3]]
-; AVX512-NEXT:    [[TMP44:%.*]] = getelementptr double, double* [[TMP40]], i32 0
-; AVX512-NEXT:    [[TMP45:%.*]] = bitcast double* [[TMP44]] to <8 x double>*
-; AVX512-NEXT:    call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[TMP36]], <8 x double>* [[TMP45]], i32 8, <8 x i1> [[TMP16]]), !alias.scope !16, !noalias !18
-; AVX512-NEXT:    [[TMP46:%.*]] = getelementptr double, double* [[TMP40]], i32 8
-; AVX512-NEXT:    [[TMP47:%.*]] = bitcast double* [[TMP46]] to <8 x double>*
-; AVX512-NEXT:    call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[TMP37]], <8 x double>* [[TMP47]], i32 8, <8 x i1> [[TMP17]]), !alias.scope !16, !noalias !18
-; AVX512-NEXT:    [[TMP48:%.*]] = getelementptr double, double* [[TMP40]], i32 16
-; AVX512-NEXT:    [[TMP49:%.*]] = bitcast double* [[TMP48]] to <8 x double>*
-; AVX512-NEXT:    call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[TMP38]], <8 x double>* [[TMP49]], i32 8, <8 x i1> [[TMP18]]), !alias.scope !16, !noalias !18
-; AVX512-NEXT:    [[TMP50:%.*]] = getelementptr double, double* [[TMP40]], i32 24
-; AVX512-NEXT:    [[TMP51:%.*]] = bitcast double* [[TMP50]] to <8 x double>*
-; AVX512-NEXT:    call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[TMP39]], <8 x double>* [[TMP51]], i32 8, <8 x i1> [[TMP19]]), !alias.scope !16, !noalias !18
+; AVX512-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP0]]
+; AVX512-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP1]]
+; AVX512-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP2]]
+; AVX512-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP3]]
+; AVX512-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0
+; AVX512-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[TMP8]], align 4, !alias.scope !11
+; AVX512-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 8
+; AVX512-NEXT:    [[WIDE_LOAD6:%.*]] = load <8 x i32>, ptr [[TMP9]], align 4, !alias.scope !11
+; AVX512-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 16
+; AVX512-NEXT:    [[WIDE_LOAD7:%.*]] = load <8 x i32>, ptr [[TMP10]], align 4, !alias.scope !11
+; AVX512-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 24
+; AVX512-NEXT:    [[WIDE_LOAD8:%.*]] = load <8 x i32>, ptr [[TMP11]], align 4, !alias.scope !11
+; AVX512-NEXT:    [[TMP12:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX512-NEXT:    [[TMP13:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD6]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX512-NEXT:    [[TMP14:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD7]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX512-NEXT:    [[TMP15:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD8]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX512-NEXT:    [[TMP16:%.*]] = getelementptr double, ptr [[B]], i64 [[TMP0]]
+; AVX512-NEXT:    [[TMP17:%.*]] = getelementptr double, ptr [[B]], i64 [[TMP1]]
+; AVX512-NEXT:    [[TMP18:%.*]] = getelementptr double, ptr [[B]], i64 [[TMP2]]
+; AVX512-NEXT:    [[TMP19:%.*]] = getelementptr double, ptr [[B]], i64 [[TMP3]]
+; AVX512-NEXT:    [[TMP20:%.*]] = getelementptr double, ptr [[TMP16]], i32 0
+; AVX512-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr [[TMP20]], i32 8, <8 x i1> [[TMP12]], <8 x double> poison), !alias.scope !14
+; AVX512-NEXT:    [[TMP21:%.*]] = getelementptr double, ptr [[TMP16]], i32 8
+; AVX512-NEXT:    [[WIDE_MASKED_LOAD9:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr [[TMP21]], i32 8, <8 x i1> [[TMP13]], <8 x double> poison), !alias.scope !14
+; AVX512-NEXT:    [[TMP22:%.*]] = getelementptr double, ptr [[TMP16]], i32 16
+; AVX512-NEXT:    [[WIDE_MASKED_LOAD10:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr [[TMP22]], i32 8, <8 x i1> [[TMP14]], <8 x double> poison), !alias.scope !14
+; AVX512-NEXT:    [[TMP23:%.*]] = getelementptr double, ptr [[TMP16]], i32 24
+; AVX512-NEXT:    [[WIDE_MASKED_LOAD11:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr [[TMP23]], i32 8, <8 x i1> [[TMP15]], <8 x double> poison), !alias.scope !14
+; AVX512-NEXT:    [[TMP24:%.*]] = sitofp <8 x i32> [[WIDE_LOAD]] to <8 x double>
+; AVX512-NEXT:    [[TMP25:%.*]] = sitofp <8 x i32> [[WIDE_LOAD6]] to <8 x double>
+; AVX512-NEXT:    [[TMP26:%.*]] = sitofp <8 x i32> [[WIDE_LOAD7]] to <8 x double>
+; AVX512-NEXT:    [[TMP27:%.*]] = sitofp <8 x i32> [[WIDE_LOAD8]] to <8 x double>
+; AVX512-NEXT:    [[TMP28:%.*]] = fadd <8 x double> [[WIDE_MASKED_LOAD]], [[TMP24]]
+; AVX512-NEXT:    [[TMP29:%.*]] = fadd <8 x double> [[WIDE_MASKED_LOAD9]], [[TMP25]]
+; AVX512-NEXT:    [[TMP30:%.*]] = fadd <8 x double> [[WIDE_MASKED_LOAD10]], [[TMP26]]
+; AVX512-NEXT:    [[TMP31:%.*]] = fadd <8 x double> [[WIDE_MASKED_LOAD11]], [[TMP27]]
+; AVX512-NEXT:    [[TMP32:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP0]]
+; AVX512-NEXT:    [[TMP33:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP1]]
+; AVX512-NEXT:    [[TMP34:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP2]]
+; AVX512-NEXT:    [[TMP35:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP3]]
+; AVX512-NEXT:    [[TMP36:%.*]] = getelementptr double, ptr [[TMP32]], i32 0
+; AVX512-NEXT:    call void @llvm.masked.store.v8f64.p0(<8 x double> [[TMP28]], ptr [[TMP36]], i32 8, <8 x i1> [[TMP12]]), !alias.scope !16, !noalias !18
+; AVX512-NEXT:    [[TMP37:%.*]] = getelementptr double, ptr [[TMP32]], i32 8
+; AVX512-NEXT:    call void @llvm.masked.store.v8f64.p0(<8 x double> [[TMP29]], ptr [[TMP37]], i32 8, <8 x i1> [[TMP13]]), !alias.scope !16, !noalias !18
+; AVX512-NEXT:    [[TMP38:%.*]] = getelementptr double, ptr [[TMP32]], i32 16
+; AVX512-NEXT:    call void @llvm.masked.store.v8f64.p0(<8 x double> [[TMP30]], ptr [[TMP38]], i32 8, <8 x i1> [[TMP14]]), !alias.scope !16, !noalias !18
+; AVX512-NEXT:    [[TMP39:%.*]] = getelementptr double, ptr [[TMP32]], i32 24
+; AVX512-NEXT:    call void @llvm.masked.store.v8f64.p0(<8 x double> [[TMP31]], ptr [[TMP39]], i32 8, <8 x i1> [[TMP15]]), !alias.scope !16, !noalias !18
 ; AVX512-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
-; AVX512-NEXT:    [[TMP52:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984
-; AVX512-NEXT:    br i1 [[TMP52]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
+; AVX512-NEXT:    [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984
+; AVX512-NEXT:    br i1 [[TMP40]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
 ; AVX512:       middle.block:
 ; AVX512-NEXT:    [[CMP_N:%.*]] = icmp eq i64 10000, 9984
 ; AVX512-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -1268,17 +1142,17 @@ define void @foo3(double* nocapture %A, double* nocapture readonly %B, i32* noca
 ; AVX512-NEXT:    br label [[FOR_BODY:%.*]]
 ; AVX512:       for.body:
 ; AVX512-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
-; AVX512-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]]
-; AVX512-NEXT:    [[TMP53:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; AVX512-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP53]], 100
+; AVX512-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[INDVARS_IV]]
+; AVX512-NEXT:    [[TMP41:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; AVX512-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP41]], 100
 ; AVX512-NEXT:    br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
 ; AVX512:       if.then:
-; AVX512-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[INDVARS_IV]]
-; AVX512-NEXT:    [[TMP54:%.*]] = load double, double* [[ARRAYIDX3]], align 8
-; AVX512-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP53]] to double
-; AVX512-NEXT:    [[ADD:%.*]] = fadd double [[TMP54]], [[CONV]]
-; AVX512-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDVARS_IV]]
-; AVX512-NEXT:    store double [[ADD]], double* [[ARRAYIDX7]], align 8
+; AVX512-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[INDVARS_IV]]
+; AVX512-NEXT:    [[TMP42:%.*]] = load double, ptr [[ARRAYIDX3]], align 8
+; AVX512-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP41]] to double
+; AVX512-NEXT:    [[ADD:%.*]] = fadd double [[TMP42]], [[CONV]]
+; AVX512-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[INDVARS_IV]]
+; AVX512-NEXT:    store double [[ADD]], ptr [[ARRAYIDX7]], align 8
 ; AVX512-NEXT:    br label [[FOR_INC]]
 ; AVX512:       for.inc:
 ; AVX512-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
@@ -1292,18 +1166,18 @@ entry:
 
 for.body:                                         ; preds = %for.inc, %entry
   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ]
-  %arrayidx = getelementptr inbounds i32, i32* %trigger, i64 %indvars.iv
-  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx = getelementptr inbounds i32, ptr %trigger, i64 %indvars.iv
+  %0 = load i32, ptr %arrayidx, align 4
   %cmp1 = icmp slt i32 %0, 100
   br i1 %cmp1, label %if.then, label %for.inc
 
 if.then:                                          ; preds = %for.body
-  %arrayidx3 = getelementptr inbounds double, double* %B, i64 %indvars.iv
-  %1 = load double, double* %arrayidx3, align 8
+  %arrayidx3 = getelementptr inbounds double, ptr %B, i64 %indvars.iv
+  %1 = load double, ptr %arrayidx3, align 8
   %conv = sitofp i32 %0 to double
   %add = fadd double %1, %conv
-  %arrayidx7 = getelementptr inbounds double, double* %A, i64 %indvars.iv
-  store double %add, double* %arrayidx7, align 8
+  %arrayidx7 = getelementptr inbounds double, ptr %A, i64 %indvars.iv
+  store double %add, ptr %arrayidx7, align 8
   br label %for.inc
 
 for.inc:                                          ; preds = %for.body, %if.then
@@ -1317,7 +1191,7 @@ for.end:                                          ; preds = %for.inc
 
 ; The source code:
 ;
-;void foo4(double *A, double *B, int *trigger) {
+;void foo4(ptr A, ptr B, int *trigger) {
 ;
 ;  for (int i=0; i<10000; i += 16) {
 ;    if (trigger[i] < 100) {
@@ -1326,24 +1200,24 @@ for.end:                                          ; preds = %for.inc
 ;  }
 ;}
 
-define void @foo4(double* nocapture %A, double* nocapture readonly %B, i32* nocapture readonly %trigger) local_unnamed_addr #0 {
+define void @foo4(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture readonly %trigger) local_unnamed_addr #0 {
 ; AVX-LABEL: @foo4(
 ; AVX-NEXT:  entry:
 ; AVX-NEXT:    br label [[FOR_BODY:%.*]]
 ; AVX:       for.body:
 ; AVX-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
-; AVX-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], i64 [[INDVARS_IV]]
-; AVX-NEXT:    [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; AVX-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER:%.*]], i64 [[INDVARS_IV]]
+; AVX-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
 ; AVX-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP0]], 100
 ; AVX-NEXT:    br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
 ; AVX:       if.then:
 ; AVX-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 1
-; AVX-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[B:%.*]], i64 [[TMP1]]
-; AVX-NEXT:    [[TMP2:%.*]] = load double, double* [[ARRAYIDX3]], align 8
+; AVX-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds double, ptr [[B:%.*]], i64 [[TMP1]]
+; AVX-NEXT:    [[TMP2:%.*]] = load double, ptr [[ARRAYIDX3]], align 8
 ; AVX-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP0]] to double
 ; AVX-NEXT:    [[ADD:%.*]] = fadd double [[TMP2]], [[CONV]]
-; AVX-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 [[INDVARS_IV]]
-; AVX-NEXT:    store double [[ADD]], double* [[ARRAYIDX7]], align 8
+; AVX-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds double, ptr [[A:%.*]], i64 [[INDVARS_IV]]
+; AVX-NEXT:    store double [[ADD]], ptr [[ARRAYIDX7]], align 8
 ; AVX-NEXT:    br label [[FOR_INC]]
 ; AVX:       for.inc:
 ; AVX-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 16
@@ -1354,40 +1228,34 @@ define void @foo4(double* nocapture %A, double* nocapture readonly %B, i32* noca
 ;
 ; AVX512-LABEL: @foo4(
 ; AVX512-NEXT:  entry:
-; AVX512-NEXT:    [[A1:%.*]] = bitcast double* [[A:%.*]] to i8*
-; AVX512-NEXT:    [[TRIGGER3:%.*]] = bitcast i32* [[TRIGGER:%.*]] to i8*
-; AVX512-NEXT:    [[B6:%.*]] = bitcast double* [[B:%.*]] to i8*
 ; AVX512-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
 ; AVX512:       vector.memcheck:
-; AVX512-NEXT:    [[SCEVGEP:%.*]] = getelementptr double, double* [[A]], i64 9985
-; AVX512-NEXT:    [[SCEVGEP2:%.*]] = bitcast double* [[SCEVGEP]] to i8*
-; AVX512-NEXT:    [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[TRIGGER]], i64 9985
-; AVX512-NEXT:    [[SCEVGEP45:%.*]] = bitcast i32* [[SCEVGEP4]] to i8*
-; AVX512-NEXT:    [[SCEVGEP7:%.*]] = getelementptr double, double* [[B]], i64 19969
-; AVX512-NEXT:    [[SCEVGEP78:%.*]] = bitcast double* [[SCEVGEP7]] to i8*
-; AVX512-NEXT:    [[BOUND0:%.*]] = icmp ult i8* [[A1]], [[SCEVGEP45]]
-; AVX512-NEXT:    [[BOUND1:%.*]] = icmp ult i8* [[TRIGGER3]], [[SCEVGEP2]]
+; AVX512-NEXT:    [[UGLYGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 79880
+; AVX512-NEXT:    [[UGLYGEP1:%.*]] = getelementptr i8, ptr [[TRIGGER:%.*]], i64 39940
+; AVX512-NEXT:    [[UGLYGEP2:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 159752
+; AVX512-NEXT:    [[BOUND0:%.*]] = icmp ult ptr [[A]], [[UGLYGEP1]]
+; AVX512-NEXT:    [[BOUND1:%.*]] = icmp ult ptr [[TRIGGER]], [[UGLYGEP]]
 ; AVX512-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
-; AVX512-NEXT:    [[BOUND09:%.*]] = icmp ult i8* [[A1]], [[SCEVGEP78]]
-; AVX512-NEXT:    [[BOUND110:%.*]] = icmp ult i8* [[B6]], [[SCEVGEP2]]
-; AVX512-NEXT:    [[FOUND_CONFLICT11:%.*]] = and i1 [[BOUND09]], [[BOUND110]]
-; AVX512-NEXT:    [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT11]]
+; AVX512-NEXT:    [[BOUND03:%.*]] = icmp ult ptr [[A]], [[UGLYGEP2]]
+; AVX512-NEXT:    [[BOUND14:%.*]] = icmp ult ptr [[B]], [[UGLYGEP]]
+; AVX512-NEXT:    [[FOUND_CONFLICT5:%.*]] = and i1 [[BOUND03]], [[BOUND14]]
+; AVX512-NEXT:    [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]]
 ; AVX512-NEXT:    br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
 ; AVX512:       vector.ph:
 ; AVX512-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; AVX512:       vector.body:
 ; AVX512-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; AVX512-NEXT:    [[VEC_IND:%.*]] = phi <8 x i64> [ <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; AVX512-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <8 x i64> [[VEC_IND]]
-; AVX512-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> [[TMP0]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> poison), !alias.scope !21
+; AVX512-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], <8 x i64> [[VEC_IND]]
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[TMP0]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> poison), !alias.scope !21
 ; AVX512-NEXT:    [[TMP1:%.*]] = icmp slt <8 x i32> [[WIDE_MASKED_GATHER]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
 ; AVX512-NEXT:    [[TMP2:%.*]] = shl nuw nsw <8 x i64> [[VEC_IND]], <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
-; AVX512-NEXT:    [[TMP3:%.*]] = getelementptr inbounds double, double* [[B]], <8 x i64> [[TMP2]]
-; AVX512-NEXT:    [[WIDE_MASKED_GATHER12:%.*]] = call <8 x double> @llvm.masked.gather.v8f64.v8p0f64(<8 x double*> [[TMP3]], i32 8, <8 x i1> [[TMP1]], <8 x double> poison), !alias.scope !24
+; AVX512-NEXT:    [[TMP3:%.*]] = getelementptr inbounds double, ptr [[B]], <8 x i64> [[TMP2]]
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER6:%.*]] = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> [[TMP3]], i32 8, <8 x i1> [[TMP1]], <8 x double> poison), !alias.scope !24
 ; AVX512-NEXT:    [[TMP4:%.*]] = sitofp <8 x i32> [[WIDE_MASKED_GATHER]] to <8 x double>
-; AVX512-NEXT:    [[TMP5:%.*]] = fadd <8 x double> [[WIDE_MASKED_GATHER12]], [[TMP4]]
-; AVX512-NEXT:    [[TMP6:%.*]] = getelementptr inbounds double, double* [[A]], <8 x i64> [[VEC_IND]]
-; AVX512-NEXT:    call void @llvm.masked.scatter.v8f64.v8p0f64(<8 x double> [[TMP5]], <8 x double*> [[TMP6]], i32 8, <8 x i1> [[TMP1]]), !alias.scope !26, !noalias !28
+; AVX512-NEXT:    [[TMP5:%.*]] = fadd <8 x double> [[WIDE_MASKED_GATHER6]], [[TMP4]]
+; AVX512-NEXT:    [[TMP6:%.*]] = getelementptr inbounds double, ptr [[A]], <8 x i64> [[VEC_IND]]
+; AVX512-NEXT:    call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> [[TMP5]], <8 x ptr> [[TMP6]], i32 8, <8 x i1> [[TMP1]]), !alias.scope !26, !noalias !28
 ; AVX512-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
 ; AVX512-NEXT:    [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], <i64 128, i64 128, i64 128, i64 128, i64 128, i64 128, i64 128, i64 128>
 ; AVX512-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 624
@@ -1400,18 +1268,18 @@ define void @foo4(double* nocapture %A, double* nocapture readonly %B, i32* noca
 ; AVX512-NEXT:    br label [[FOR_BODY:%.*]]
 ; AVX512:       for.body:
 ; AVX512-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
-; AVX512-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]]
-; AVX512-NEXT:    [[TMP8:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; AVX512-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[INDVARS_IV]]
+; AVX512-NEXT:    [[TMP8:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
 ; AVX512-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP8]], 100
 ; AVX512-NEXT:    br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
 ; AVX512:       if.then:
 ; AVX512-NEXT:    [[TMP9:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 1
-; AVX512-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[TMP9]]
-; AVX512-NEXT:    [[TMP10:%.*]] = load double, double* [[ARRAYIDX3]], align 8
+; AVX512-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[TMP9]]
+; AVX512-NEXT:    [[TMP10:%.*]] = load double, ptr [[ARRAYIDX3]], align 8
 ; AVX512-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP8]] to double
 ; AVX512-NEXT:    [[ADD:%.*]] = fadd double [[TMP10]], [[CONV]]
-; AVX512-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDVARS_IV]]
-; AVX512-NEXT:    store double [[ADD]], double* [[ARRAYIDX7]], align 8
+; AVX512-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[INDVARS_IV]]
+; AVX512-NEXT:    store double [[ADD]], ptr [[ARRAYIDX7]], align 8
 ; AVX512-NEXT:    br label [[FOR_INC]]
 ; AVX512:       for.inc:
 ; AVX512-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 16
@@ -1425,19 +1293,19 @@ entry:
 
 for.body:                                         ; preds = %entry, %for.inc
   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ]
-  %arrayidx = getelementptr inbounds i32, i32* %trigger, i64 %indvars.iv
-  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx = getelementptr inbounds i32, ptr %trigger, i64 %indvars.iv
+  %0 = load i32, ptr %arrayidx, align 4
   %cmp1 = icmp slt i32 %0, 100
   br i1 %cmp1, label %if.then, label %for.inc
 
 if.then:                                          ; preds = %for.body
   %1 = shl nuw nsw i64 %indvars.iv, 1
-  %arrayidx3 = getelementptr inbounds double, double* %B, i64 %1
-  %2 = load double, double* %arrayidx3, align 8
+  %arrayidx3 = getelementptr inbounds double, ptr %B, i64 %1
+  %2 = load double, ptr %arrayidx3, align 8
   %conv = sitofp i32 %0 to double
   %add = fadd double %2, %conv
-  %arrayidx7 = getelementptr inbounds double, double* %A, i64 %indvars.iv
-  store double %add, double* %arrayidx7, align 8
+  %arrayidx7 = getelementptr inbounds double, ptr %A, i64 %indvars.iv
+  store double %add, ptr %arrayidx7, align 8
   br label %for.inc
 
 for.inc:                                          ; preds = %for.body, %if.then
@@ -1449,11 +1317,11 @@ for.end:                                          ; preds = %for.inc
   ret void
 }
 
-@a = common global [1 x i32*] zeroinitializer, align 8
-@c = common global i32* null, align 8
+@a = common global [1 x ptr] zeroinitializer, align 8
+@c = common global ptr null, align 8
 
 ; Reverse loop
-;void foo6(double *in, double *out, unsigned size, int *trigger) {
+;void foo6(ptr in, ptr out, unsigned size, int *trigger) {
 ;
 ;  for (int i=SIZE-1; i>=0; i--) {
 ;    if (trigger[i] > 0) {
@@ -1462,22 +1330,22 @@ for.end:                                          ; preds = %for.inc
 ;  }
 ;}
 
-define void @foo6(double* nocapture readonly %in, double* nocapture %out, i32 %size, i32* nocapture readonly %trigger) local_unnamed_addr #0 {
+define void @foo6(ptr nocapture readonly %in, ptr nocapture %out, i32 %size, ptr nocapture readonly %trigger) local_unnamed_addr #0 {
 ; AVX1-LABEL: @foo6(
 ; AVX1-NEXT:  entry:
 ; AVX1-NEXT:    br label [[FOR_BODY:%.*]]
 ; AVX1:       for.body:
 ; AVX1-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 4095, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
-; AVX1-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], i64 [[INDVARS_IV]]
-; AVX1-NEXT:    [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; AVX1-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER:%.*]], i64 [[INDVARS_IV]]
+; AVX1-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
 ; AVX1-NEXT:    [[CMP1:%.*]] = icmp sgt i32 [[TMP0]], 0
 ; AVX1-NEXT:    br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
 ; AVX1:       if.then:
-; AVX1-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[IN:%.*]], i64 [[INDVARS_IV]]
-; AVX1-NEXT:    [[TMP1:%.*]] = load double, double* [[ARRAYIDX3]], align 8
+; AVX1-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds double, ptr [[IN:%.*]], i64 [[INDVARS_IV]]
+; AVX1-NEXT:    [[TMP1:%.*]] = load double, ptr [[ARRAYIDX3]], align 8
 ; AVX1-NEXT:    [[ADD:%.*]] = fadd double [[TMP1]], 5.000000e-01
-; AVX1-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[OUT:%.*]], i64 [[INDVARS_IV]]
-; AVX1-NEXT:    store double [[ADD]], double* [[ARRAYIDX5]], align 8
+; AVX1-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds double, ptr [[OUT:%.*]], i64 [[INDVARS_IV]]
+; AVX1-NEXT:    store double [[ADD]], ptr [[ARRAYIDX5]], align 8
 ; AVX1-NEXT:    br label [[FOR_INC]]
 ; AVX1:       for.inc:
 ; AVX1-NEXT:    [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1
@@ -1488,24 +1356,18 @@ define void @foo6(double* nocapture readonly %in, double* nocapture %out, i32 %s
 ;
 ; AVX2-LABEL: @foo6(
 ; AVX2-NEXT:  entry:
-; AVX2-NEXT:    [[OUT1:%.*]] = bitcast double* [[OUT:%.*]] to i8*
-; AVX2-NEXT:    [[TRIGGER3:%.*]] = bitcast i32* [[TRIGGER:%.*]] to i8*
-; AVX2-NEXT:    [[IN6:%.*]] = bitcast double* [[IN:%.*]] to i8*
 ; AVX2-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
 ; AVX2:       vector.memcheck:
-; AVX2-NEXT:    [[SCEVGEP:%.*]] = getelementptr double, double* [[OUT]], i64 4096
-; AVX2-NEXT:    [[SCEVGEP2:%.*]] = bitcast double* [[SCEVGEP]] to i8*
-; AVX2-NEXT:    [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[TRIGGER]], i64 4096
-; AVX2-NEXT:    [[SCEVGEP45:%.*]] = bitcast i32* [[SCEVGEP4]] to i8*
-; AVX2-NEXT:    [[SCEVGEP7:%.*]] = getelementptr double, double* [[IN]], i64 4096
-; AVX2-NEXT:    [[SCEVGEP78:%.*]] = bitcast double* [[SCEVGEP7]] to i8*
-; AVX2-NEXT:    [[BOUND0:%.*]] = icmp ult i8* [[OUT1]], [[SCEVGEP45]]
-; AVX2-NEXT:    [[BOUND1:%.*]] = icmp ult i8* [[TRIGGER3]], [[SCEVGEP2]]
+; AVX2-NEXT:    [[UGLYGEP:%.*]] = getelementptr i8, ptr [[OUT:%.*]], i64 32768
+; AVX2-NEXT:    [[UGLYGEP1:%.*]] = getelementptr i8, ptr [[TRIGGER:%.*]], i64 16384
+; AVX2-NEXT:    [[UGLYGEP2:%.*]] = getelementptr i8, ptr [[IN:%.*]], i64 32768
+; AVX2-NEXT:    [[BOUND0:%.*]] = icmp ult ptr [[OUT]], [[UGLYGEP1]]
+; AVX2-NEXT:    [[BOUND1:%.*]] = icmp ult ptr [[TRIGGER]], [[UGLYGEP]]
 ; AVX2-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
-; AVX2-NEXT:    [[BOUND09:%.*]] = icmp ult i8* [[OUT1]], [[SCEVGEP78]]
-; AVX2-NEXT:    [[BOUND110:%.*]] = icmp ult i8* [[IN6]], [[SCEVGEP2]]
-; AVX2-NEXT:    [[FOUND_CONFLICT11:%.*]] = and i1 [[BOUND09]], [[BOUND110]]
-; AVX2-NEXT:    [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT11]]
+; AVX2-NEXT:    [[BOUND03:%.*]] = icmp ult ptr [[OUT]], [[UGLYGEP2]]
+; AVX2-NEXT:    [[BOUND14:%.*]] = icmp ult ptr [[IN]], [[UGLYGEP]]
+; AVX2-NEXT:    [[FOUND_CONFLICT5:%.*]] = and i1 [[BOUND03]], [[BOUND14]]
+; AVX2-NEXT:    [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]]
 ; AVX2-NEXT:    br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
 ; AVX2:       vector.ph:
 ; AVX2-NEXT:    br label [[VECTOR_BODY:%.*]]
@@ -1516,93 +1378,81 @@ define void @foo6(double* nocapture readonly %in, double* nocapture %out, i32 %s
 ; AVX2-NEXT:    [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], -4
 ; AVX2-NEXT:    [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], -8
 ; AVX2-NEXT:    [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], -12
-; AVX2-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP0]]
-; AVX2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP1]]
-; AVX2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP2]]
-; AVX2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP3]]
-; AVX2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0
-; AVX2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP8]], i32 -3
-; AVX2-NEXT:    [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>*
-; AVX2-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP10]], align 4, !alias.scope !17
+; AVX2-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP0]]
+; AVX2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP1]]
+; AVX2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP2]]
+; AVX2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP3]]
+; AVX2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0
+; AVX2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 -3
+; AVX2-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP9]], align 4, !alias.scope !17
 ; AVX2-NEXT:    [[REVERSE:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; AVX2-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 -4
-; AVX2-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP11]], i32 -3
-; AVX2-NEXT:    [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <4 x i32>*
-; AVX2-NEXT:    [[WIDE_LOAD12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP13]], align 4, !alias.scope !17
-; AVX2-NEXT:    [[REVERSE13:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD12]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; AVX2-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 -8
-; AVX2-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TMP14]], i32 -3
-; AVX2-NEXT:    [[TMP16:%.*]] = bitcast i32* [[TMP15]] to <4 x i32>*
-; AVX2-NEXT:    [[WIDE_LOAD14:%.*]] = load <4 x i32>, <4 x i32>* [[TMP16]], align 4, !alias.scope !17
-; AVX2-NEXT:    [[REVERSE15:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD14]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; AVX2-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 -12
-; AVX2-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[TMP17]], i32 -3
-; AVX2-NEXT:    [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>*
-; AVX2-NEXT:    [[WIDE_LOAD16:%.*]] = load <4 x i32>, <4 x i32>* [[TMP19]], align 4, !alias.scope !17
-; AVX2-NEXT:    [[REVERSE17:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD16]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; AVX2-NEXT:    [[TMP20:%.*]] = icmp sgt <4 x i32> [[REVERSE]], zeroinitializer
-; AVX2-NEXT:    [[TMP21:%.*]] = icmp sgt <4 x i32> [[REVERSE13]], zeroinitializer
-; AVX2-NEXT:    [[TMP22:%.*]] = icmp sgt <4 x i32> [[REVERSE15]], zeroinitializer
-; AVX2-NEXT:    [[TMP23:%.*]] = icmp sgt <4 x i32> [[REVERSE17]], zeroinitializer
-; AVX2-NEXT:    [[TMP24:%.*]] = getelementptr double, double* [[IN]], i64 [[TMP0]]
-; AVX2-NEXT:    [[TMP25:%.*]] = getelementptr double, double* [[IN]], i64 [[TMP1]]
-; AVX2-NEXT:    [[TMP26:%.*]] = getelementptr double, double* [[IN]], i64 [[TMP2]]
-; AVX2-NEXT:    [[TMP27:%.*]] = getelementptr double, double* [[IN]], i64 [[TMP3]]
-; AVX2-NEXT:    [[TMP28:%.*]] = getelementptr double, double* [[TMP24]], i32 0
-; AVX2-NEXT:    [[TMP29:%.*]] = getelementptr double, double* [[TMP28]], i32 -3
-; AVX2-NEXT:    [[REVERSE18:%.*]] = shufflevector <4 x i1> [[TMP20]], <4 x i1> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; AVX2-NEXT:    [[TMP30:%.*]] = bitcast double* [[TMP29]] to <4 x double>*
-; AVX2-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP30]], i32 8, <4 x i1> [[REVERSE18]], <4 x double> poison), !alias.scope !20
-; AVX2-NEXT:    [[REVERSE19:%.*]] = shufflevector <4 x double> [[WIDE_MASKED_LOAD]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; AVX2-NEXT:    [[TMP31:%.*]] = getelementptr double, double* [[TMP24]], i32 -4
-; AVX2-NEXT:    [[TMP32:%.*]] = getelementptr double, double* [[TMP31]], i32 -3
-; AVX2-NEXT:    [[REVERSE20:%.*]] = shufflevector <4 x i1> [[TMP21]], <4 x i1> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; AVX2-NEXT:    [[TMP33:%.*]] = bitcast double* [[TMP32]] to <4 x double>*
-; AVX2-NEXT:    [[WIDE_MASKED_LOAD21:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP33]], i32 8, <4 x i1> [[REVERSE20]], <4 x double> poison), !alias.scope !20
+; AVX2-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 -4
+; AVX2-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 -3
+; AVX2-NEXT:    [[WIDE_LOAD6:%.*]] = load <4 x i32>, ptr [[TMP11]], align 4, !alias.scope !17
+; AVX2-NEXT:    [[REVERSE7:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD6]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; AVX2-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 -8
+; AVX2-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 -3
+; AVX2-NEXT:    [[WIDE_LOAD8:%.*]] = load <4 x i32>, ptr [[TMP13]], align 4, !alias.scope !17
+; AVX2-NEXT:    [[REVERSE9:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD8]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; AVX2-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 -12
+; AVX2-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 -3
+; AVX2-NEXT:    [[WIDE_LOAD10:%.*]] = load <4 x i32>, ptr [[TMP15]], align 4, !alias.scope !17
+; AVX2-NEXT:    [[REVERSE11:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD10]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; AVX2-NEXT:    [[TMP16:%.*]] = icmp sgt <4 x i32> [[REVERSE]], zeroinitializer
+; AVX2-NEXT:    [[TMP17:%.*]] = icmp sgt <4 x i32> [[REVERSE7]], zeroinitializer
+; AVX2-NEXT:    [[TMP18:%.*]] = icmp sgt <4 x i32> [[REVERSE9]], zeroinitializer
+; AVX2-NEXT:    [[TMP19:%.*]] = icmp sgt <4 x i32> [[REVERSE11]], zeroinitializer
+; AVX2-NEXT:    [[TMP20:%.*]] = getelementptr double, ptr [[IN]], i64 [[TMP0]]
+; AVX2-NEXT:    [[TMP21:%.*]] = getelementptr double, ptr [[IN]], i64 [[TMP1]]
+; AVX2-NEXT:    [[TMP22:%.*]] = getelementptr double, ptr [[IN]], i64 [[TMP2]]
+; AVX2-NEXT:    [[TMP23:%.*]] = getelementptr double, ptr [[IN]], i64 [[TMP3]]
+; AVX2-NEXT:    [[TMP24:%.*]] = getelementptr double, ptr [[TMP20]], i32 0
+; AVX2-NEXT:    [[TMP25:%.*]] = getelementptr double, ptr [[TMP24]], i32 -3
+; AVX2-NEXT:    [[REVERSE12:%.*]] = shufflevector <4 x i1> [[TMP16]], <4 x i1> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; AVX2-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP25]], i32 8, <4 x i1> [[REVERSE12]], <4 x double> poison), !alias.scope !20
+; AVX2-NEXT:    [[REVERSE13:%.*]] = shufflevector <4 x double> [[WIDE_MASKED_LOAD]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; AVX2-NEXT:    [[TMP26:%.*]] = getelementptr double, ptr [[TMP20]], i32 -4
+; AVX2-NEXT:    [[TMP27:%.*]] = getelementptr double, ptr [[TMP26]], i32 -3
+; AVX2-NEXT:    [[REVERSE14:%.*]] = shufflevector <4 x i1> [[TMP17]], <4 x i1> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; AVX2-NEXT:    [[WIDE_MASKED_LOAD15:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP27]], i32 8, <4 x i1> [[REVERSE14]], <4 x double> poison), !alias.scope !20
+; AVX2-NEXT:    [[REVERSE16:%.*]] = shufflevector <4 x double> [[WIDE_MASKED_LOAD15]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; AVX2-NEXT:    [[TMP28:%.*]] = getelementptr double, ptr [[TMP20]], i32 -8
+; AVX2-NEXT:    [[TMP29:%.*]] = getelementptr double, ptr [[TMP28]], i32 -3
+; AVX2-NEXT:    [[REVERSE17:%.*]] = shufflevector <4 x i1> [[TMP18]], <4 x i1> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; AVX2-NEXT:    [[WIDE_MASKED_LOAD18:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP29]], i32 8, <4 x i1> [[REVERSE17]], <4 x double> poison), !alias.scope !20
+; AVX2-NEXT:    [[REVERSE19:%.*]] = shufflevector <4 x double> [[WIDE_MASKED_LOAD18]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; AVX2-NEXT:    [[TMP30:%.*]] = getelementptr double, ptr [[TMP20]], i32 -12
+; AVX2-NEXT:    [[TMP31:%.*]] = getelementptr double, ptr [[TMP30]], i32 -3
+; AVX2-NEXT:    [[REVERSE20:%.*]] = shufflevector <4 x i1> [[TMP19]], <4 x i1> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; AVX2-NEXT:    [[WIDE_MASKED_LOAD21:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP31]], i32 8, <4 x i1> [[REVERSE20]], <4 x double> poison), !alias.scope !20
 ; AVX2-NEXT:    [[REVERSE22:%.*]] = shufflevector <4 x double> [[WIDE_MASKED_LOAD21]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; AVX2-NEXT:    [[TMP34:%.*]] = getelementptr double, double* [[TMP24]], i32 -8
-; AVX2-NEXT:    [[TMP35:%.*]] = getelementptr double, double* [[TMP34]], i32 -3
-; AVX2-NEXT:    [[REVERSE23:%.*]] = shufflevector <4 x i1> [[TMP22]], <4 x i1> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; AVX2-NEXT:    [[TMP36:%.*]] = bitcast double* [[TMP35]] to <4 x double>*
-; AVX2-NEXT:    [[WIDE_MASKED_LOAD24:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP36]], i32 8, <4 x i1> [[REVERSE23]], <4 x double> poison), !alias.scope !20
-; AVX2-NEXT:    [[REVERSE25:%.*]] = shufflevector <4 x double> [[WIDE_MASKED_LOAD24]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; AVX2-NEXT:    [[TMP37:%.*]] = getelementptr double, double* [[TMP24]], i32 -12
-; AVX2-NEXT:    [[TMP38:%.*]] = getelementptr double, double* [[TMP37]], i32 -3
-; AVX2-NEXT:    [[REVERSE26:%.*]] = shufflevector <4 x i1> [[TMP23]], <4 x i1> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; AVX2-NEXT:    [[TMP39:%.*]] = bitcast double* [[TMP38]] to <4 x double>*
-; AVX2-NEXT:    [[WIDE_MASKED_LOAD27:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP39]], i32 8, <4 x i1> [[REVERSE26]], <4 x double> poison), !alias.scope !20
-; AVX2-NEXT:    [[REVERSE28:%.*]] = shufflevector <4 x double> [[WIDE_MASKED_LOAD27]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; AVX2-NEXT:    [[TMP40:%.*]] = fadd <4 x double> [[REVERSE19]], <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>
-; AVX2-NEXT:    [[TMP41:%.*]] = fadd <4 x double> [[REVERSE22]], <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>
-; AVX2-NEXT:    [[TMP42:%.*]] = fadd <4 x double> [[REVERSE25]], <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>
-; AVX2-NEXT:    [[TMP43:%.*]] = fadd <4 x double> [[REVERSE28]], <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>
-; AVX2-NEXT:    [[TMP44:%.*]] = getelementptr double, double* [[OUT]], i64 [[TMP0]]
-; AVX2-NEXT:    [[TMP45:%.*]] = getelementptr double, double* [[OUT]], i64 [[TMP1]]
-; AVX2-NEXT:    [[TMP46:%.*]] = getelementptr double, double* [[OUT]], i64 [[TMP2]]
-; AVX2-NEXT:    [[TMP47:%.*]] = getelementptr double, double* [[OUT]], i64 [[TMP3]]
-; AVX2-NEXT:    [[REVERSE29:%.*]] = shufflevector <4 x double> [[TMP40]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; AVX2-NEXT:    [[TMP48:%.*]] = getelementptr double, double* [[TMP44]], i32 0
-; AVX2-NEXT:    [[TMP49:%.*]] = getelementptr double, double* [[TMP48]], i32 -3
-; AVX2-NEXT:    [[TMP50:%.*]] = bitcast double* [[TMP49]] to <4 x double>*
-; AVX2-NEXT:    call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[REVERSE29]], <4 x double>* [[TMP50]], i32 8, <4 x i1> [[REVERSE18]]), !alias.scope !22, !noalias !24
-; AVX2-NEXT:    [[REVERSE31:%.*]] = shufflevector <4 x double> [[TMP41]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; AVX2-NEXT:    [[TMP51:%.*]] = getelementptr double, double* [[TMP44]], i32 -4
-; AVX2-NEXT:    [[TMP52:%.*]] = getelementptr double, double* [[TMP51]], i32 -3
-; AVX2-NEXT:    [[TMP53:%.*]] = bitcast double* [[TMP52]] to <4 x double>*
-; AVX2-NEXT:    call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[REVERSE31]], <4 x double>* [[TMP53]], i32 8, <4 x i1> [[REVERSE20]]), !alias.scope !22, !noalias !24
-; AVX2-NEXT:    [[REVERSE33:%.*]] = shufflevector <4 x double> [[TMP42]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; AVX2-NEXT:    [[TMP54:%.*]] = getelementptr double, double* [[TMP44]], i32 -8
-; AVX2-NEXT:    [[TMP55:%.*]] = getelementptr double, double* [[TMP54]], i32 -3
-; AVX2-NEXT:    [[TMP56:%.*]] = bitcast double* [[TMP55]] to <4 x double>*
-; AVX2-NEXT:    call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[REVERSE33]], <4 x double>* [[TMP56]], i32 8, <4 x i1> [[REVERSE23]]), !alias.scope !22, !noalias !24
-; AVX2-NEXT:    [[REVERSE35:%.*]] = shufflevector <4 x double> [[TMP43]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; AVX2-NEXT:    [[TMP57:%.*]] = getelementptr double, double* [[TMP44]], i32 -12
-; AVX2-NEXT:    [[TMP58:%.*]] = getelementptr double, double* [[TMP57]], i32 -3
-; AVX2-NEXT:    [[TMP59:%.*]] = bitcast double* [[TMP58]] to <4 x double>*
-; AVX2-NEXT:    call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[REVERSE35]], <4 x double>* [[TMP59]], i32 8, <4 x i1> [[REVERSE26]]), !alias.scope !22, !noalias !24
+; AVX2-NEXT:    [[TMP32:%.*]] = fadd <4 x double> [[REVERSE13]], <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>
+; AVX2-NEXT:    [[TMP33:%.*]] = fadd <4 x double> [[REVERSE16]], <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>
+; AVX2-NEXT:    [[TMP34:%.*]] = fadd <4 x double> [[REVERSE19]], <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>
+; AVX2-NEXT:    [[TMP35:%.*]] = fadd <4 x double> [[REVERSE22]], <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>
+; AVX2-NEXT:    [[TMP36:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP0]]
+; AVX2-NEXT:    [[TMP37:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP1]]
+; AVX2-NEXT:    [[TMP38:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP2]]
+; AVX2-NEXT:    [[TMP39:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP3]]
+; AVX2-NEXT:    [[REVERSE23:%.*]] = shufflevector <4 x double> [[TMP32]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; AVX2-NEXT:    [[TMP40:%.*]] = getelementptr double, ptr [[TMP36]], i32 0
+; AVX2-NEXT:    [[TMP41:%.*]] = getelementptr double, ptr [[TMP40]], i32 -3
+; AVX2-NEXT:    call void @llvm.masked.store.v4f64.p0(<4 x double> [[REVERSE23]], ptr [[TMP41]], i32 8, <4 x i1> [[REVERSE12]]), !alias.scope !22, !noalias !24
+; AVX2-NEXT:    [[REVERSE25:%.*]] = shufflevector <4 x double> [[TMP33]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; AVX2-NEXT:    [[TMP42:%.*]] = getelementptr double, ptr [[TMP36]], i32 -4
+; AVX2-NEXT:    [[TMP43:%.*]] = getelementptr double, ptr [[TMP42]], i32 -3
+; AVX2-NEXT:    call void @llvm.masked.store.v4f64.p0(<4 x double> [[REVERSE25]], ptr [[TMP43]], i32 8, <4 x i1> [[REVERSE14]]), !alias.scope !22, !noalias !24
+; AVX2-NEXT:    [[REVERSE27:%.*]] = shufflevector <4 x double> [[TMP34]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; AVX2-NEXT:    [[TMP44:%.*]] = getelementptr double, ptr [[TMP36]], i32 -8
+; AVX2-NEXT:    [[TMP45:%.*]] = getelementptr double, ptr [[TMP44]], i32 -3
+; AVX2-NEXT:    call void @llvm.masked.store.v4f64.p0(<4 x double> [[REVERSE27]], ptr [[TMP45]], i32 8, <4 x i1> [[REVERSE17]]), !alias.scope !22, !noalias !24
+; AVX2-NEXT:    [[REVERSE29:%.*]] = shufflevector <4 x double> [[TMP35]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; AVX2-NEXT:    [[TMP46:%.*]] = getelementptr double, ptr [[TMP36]], i32 -12
+; AVX2-NEXT:    [[TMP47:%.*]] = getelementptr double, ptr [[TMP46]], i32 -3
+; AVX2-NEXT:    call void @llvm.masked.store.v4f64.p0(<4 x double> [[REVERSE29]], ptr [[TMP47]], i32 8, <4 x i1> [[REVERSE20]]), !alias.scope !22, !noalias !24
 ; AVX2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; AVX2-NEXT:    [[TMP60:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
-; AVX2-NEXT:    br i1 [[TMP60]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]]
+; AVX2-NEXT:    [[TMP48:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
+; AVX2-NEXT:    br i1 [[TMP48]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]]
 ; AVX2:       middle.block:
 ; AVX2-NEXT:    [[CMP_N:%.*]] = icmp eq i64 4096, 4096
 ; AVX2-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -1611,16 +1461,16 @@ define void @foo6(double* nocapture readonly %in, double* nocapture %out, i32 %s
 ; AVX2-NEXT:    br label [[FOR_BODY:%.*]]
 ; AVX2:       for.body:
 ; AVX2-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
-; AVX2-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]]
-; AVX2-NEXT:    [[TMP61:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; AVX2-NEXT:    [[CMP1:%.*]] = icmp sgt i32 [[TMP61]], 0
+; AVX2-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[INDVARS_IV]]
+; AVX2-NEXT:    [[TMP49:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; AVX2-NEXT:    [[CMP1:%.*]] = icmp sgt i32 [[TMP49]], 0
 ; AVX2-NEXT:    br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
 ; AVX2:       if.then:
-; AVX2-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[INDVARS_IV]]
-; AVX2-NEXT:    [[TMP62:%.*]] = load double, double* [[ARRAYIDX3]], align 8
-; AVX2-NEXT:    [[ADD:%.*]] = fadd double [[TMP62]], 5.000000e-01
-; AVX2-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV]]
-; AVX2-NEXT:    store double [[ADD]], double* [[ARRAYIDX5]], align 8
+; AVX2-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds double, ptr [[IN]], i64 [[INDVARS_IV]]
+; AVX2-NEXT:    [[TMP50:%.*]] = load double, ptr [[ARRAYIDX3]], align 8
+; AVX2-NEXT:    [[ADD:%.*]] = fadd double [[TMP50]], 5.000000e-01
+; AVX2-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds double, ptr [[OUT]], i64 [[INDVARS_IV]]
+; AVX2-NEXT:    store double [[ADD]], ptr [[ARRAYIDX5]], align 8
 ; AVX2-NEXT:    br label [[FOR_INC]]
 ; AVX2:       for.inc:
 ; AVX2-NEXT:    [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1
@@ -1631,24 +1481,18 @@ define void @foo6(double* nocapture readonly %in, double* nocapture %out, i32 %s
 ;
 ; AVX512-LABEL: @foo6(
 ; AVX512-NEXT:  entry:
-; AVX512-NEXT:    [[OUT1:%.*]] = bitcast double* [[OUT:%.*]] to i8*
-; AVX512-NEXT:    [[TRIGGER3:%.*]] = bitcast i32* [[TRIGGER:%.*]] to i8*
-; AVX512-NEXT:    [[IN6:%.*]] = bitcast double* [[IN:%.*]] to i8*
 ; AVX512-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
 ; AVX512:       vector.memcheck:
-; AVX512-NEXT:    [[SCEVGEP:%.*]] = getelementptr double, double* [[OUT]], i64 4096
-; AVX512-NEXT:    [[SCEVGEP2:%.*]] = bitcast double* [[SCEVGEP]] to i8*
-; AVX512-NEXT:    [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[TRIGGER]], i64 4096
-; AVX512-NEXT:    [[SCEVGEP45:%.*]] = bitcast i32* [[SCEVGEP4]] to i8*
-; AVX512-NEXT:    [[SCEVGEP7:%.*]] = getelementptr double, double* [[IN]], i64 4096
-; AVX512-NEXT:    [[SCEVGEP78:%.*]] = bitcast double* [[SCEVGEP7]] to i8*
-; AVX512-NEXT:    [[BOUND0:%.*]] = icmp ult i8* [[OUT1]], [[SCEVGEP45]]
-; AVX512-NEXT:    [[BOUND1:%.*]] = icmp ult i8* [[TRIGGER3]], [[SCEVGEP2]]
+; AVX512-NEXT:    [[UGLYGEP:%.*]] = getelementptr i8, ptr [[OUT:%.*]], i64 32768
+; AVX512-NEXT:    [[UGLYGEP1:%.*]] = getelementptr i8, ptr [[TRIGGER:%.*]], i64 16384
+; AVX512-NEXT:    [[UGLYGEP2:%.*]] = getelementptr i8, ptr [[IN:%.*]], i64 32768
+; AVX512-NEXT:    [[BOUND0:%.*]] = icmp ult ptr [[OUT]], [[UGLYGEP1]]
+; AVX512-NEXT:    [[BOUND1:%.*]] = icmp ult ptr [[TRIGGER]], [[UGLYGEP]]
 ; AVX512-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
-; AVX512-NEXT:    [[BOUND09:%.*]] = icmp ult i8* [[OUT1]], [[SCEVGEP78]]
-; AVX512-NEXT:    [[BOUND110:%.*]] = icmp ult i8* [[IN6]], [[SCEVGEP2]]
-; AVX512-NEXT:    [[FOUND_CONFLICT11:%.*]] = and i1 [[BOUND09]], [[BOUND110]]
-; AVX512-NEXT:    [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT11]]
+; AVX512-NEXT:    [[BOUND03:%.*]] = icmp ult ptr [[OUT]], [[UGLYGEP2]]
+; AVX512-NEXT:    [[BOUND14:%.*]] = icmp ult ptr [[IN]], [[UGLYGEP]]
+; AVX512-NEXT:    [[FOUND_CONFLICT5:%.*]] = and i1 [[BOUND03]], [[BOUND14]]
+; AVX512-NEXT:    [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]]
 ; AVX512-NEXT:    br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
 ; AVX512:       vector.ph:
 ; AVX512-NEXT:    br label [[VECTOR_BODY:%.*]]
@@ -1659,93 +1503,81 @@ define void @foo6(double* nocapture readonly %in, double* nocapture %out, i32 %s
 ; AVX512-NEXT:    [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], -8
 ; AVX512-NEXT:    [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], -16
 ; AVX512-NEXT:    [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], -24
-; AVX512-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP0]]
-; AVX512-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP1]]
-; AVX512-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP2]]
-; AVX512-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP3]]
-; AVX512-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0
-; AVX512-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP8]], i32 -7
-; AVX512-NEXT:    [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <8 x i32>*
-; AVX512-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP10]], align 4, !alias.scope !31
+; AVX512-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP0]]
+; AVX512-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP1]]
+; AVX512-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP2]]
+; AVX512-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP3]]
+; AVX512-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0
+; AVX512-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 -7
+; AVX512-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[TMP9]], align 4, !alias.scope !31
 ; AVX512-NEXT:    [[REVERSE:%.*]] = shufflevector <8 x i32> [[WIDE_LOAD]], <8 x i32> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX512-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 -8
-; AVX512-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP11]], i32 -7
-; AVX512-NEXT:    [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <8 x i32>*
-; AVX512-NEXT:    [[WIDE_LOAD12:%.*]] = load <8 x i32>, <8 x i32>* [[TMP13]], align 4, !alias.scope !31
-; AVX512-NEXT:    [[REVERSE13:%.*]] = shufflevector <8 x i32> [[WIDE_LOAD12]], <8 x i32> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX512-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 -16
-; AVX512-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TMP14]], i32 -7
-; AVX512-NEXT:    [[TMP16:%.*]] = bitcast i32* [[TMP15]] to <8 x i32>*
-; AVX512-NEXT:    [[WIDE_LOAD14:%.*]] = load <8 x i32>, <8 x i32>* [[TMP16]], align 4, !alias.scope !31
-; AVX512-NEXT:    [[REVERSE15:%.*]] = shufflevector <8 x i32> [[WIDE_LOAD14]], <8 x i32> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX512-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 -24
-; AVX512-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[TMP17]], i32 -7
-; AVX512-NEXT:    [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <8 x i32>*
-; AVX512-NEXT:    [[WIDE_LOAD16:%.*]] = load <8 x i32>, <8 x i32>* [[TMP19]], align 4, !alias.scope !31
-; AVX512-NEXT:    [[REVERSE17:%.*]] = shufflevector <8 x i32> [[WIDE_LOAD16]], <8 x i32> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX512-NEXT:    [[TMP20:%.*]] = icmp sgt <8 x i32> [[REVERSE]], zeroinitializer
-; AVX512-NEXT:    [[TMP21:%.*]] = icmp sgt <8 x i32> [[REVERSE13]], zeroinitializer
-; AVX512-NEXT:    [[TMP22:%.*]] = icmp sgt <8 x i32> [[REVERSE15]], zeroinitializer
-; AVX512-NEXT:    [[TMP23:%.*]] = icmp sgt <8 x i32> [[REVERSE17]], zeroinitializer
-; AVX512-NEXT:    [[TMP24:%.*]] = getelementptr double, double* [[IN]], i64 [[TMP0]]
-; AVX512-NEXT:    [[TMP25:%.*]] = getelementptr double, double* [[IN]], i64 [[TMP1]]
-; AVX512-NEXT:    [[TMP26:%.*]] = getelementptr double, double* [[IN]], i64 [[TMP2]]
-; AVX512-NEXT:    [[TMP27:%.*]] = getelementptr double, double* [[IN]], i64 [[TMP3]]
-; AVX512-NEXT:    [[TMP28:%.*]] = getelementptr double, double* [[TMP24]], i32 0
-; AVX512-NEXT:    [[TMP29:%.*]] = getelementptr double, double* [[TMP28]], i32 -7
-; AVX512-NEXT:    [[REVERSE18:%.*]] = shufflevector <8 x i1> [[TMP20]], <8 x i1> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX512-NEXT:    [[TMP30:%.*]] = bitcast double* [[TMP29]] to <8 x double>*
-; AVX512-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP30]], i32 8, <8 x i1> [[REVERSE18]], <8 x double> poison), !alias.scope !34
-; AVX512-NEXT:    [[REVERSE19:%.*]] = shufflevector <8 x double> [[WIDE_MASKED_LOAD]], <8 x double> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX512-NEXT:    [[TMP31:%.*]] = getelementptr double, double* [[TMP24]], i32 -8
-; AVX512-NEXT:    [[TMP32:%.*]] = getelementptr double, double* [[TMP31]], i32 -7
-; AVX512-NEXT:    [[REVERSE20:%.*]] = shufflevector <8 x i1> [[TMP21]], <8 x i1> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX512-NEXT:    [[TMP33:%.*]] = bitcast double* [[TMP32]] to <8 x double>*
-; AVX512-NEXT:    [[WIDE_MASKED_LOAD21:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP33]], i32 8, <8 x i1> [[REVERSE20]], <8 x double> poison), !alias.scope !34
+; AVX512-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 -8
+; AVX512-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 -7
+; AVX512-NEXT:    [[WIDE_LOAD6:%.*]] = load <8 x i32>, ptr [[TMP11]], align 4, !alias.scope !31
+; AVX512-NEXT:    [[REVERSE7:%.*]] = shufflevector <8 x i32> [[WIDE_LOAD6]], <8 x i32> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX512-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 -16
+; AVX512-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 -7
+; AVX512-NEXT:    [[WIDE_LOAD8:%.*]] = load <8 x i32>, ptr [[TMP13]], align 4, !alias.scope !31
+; AVX512-NEXT:    [[REVERSE9:%.*]] = shufflevector <8 x i32> [[WIDE_LOAD8]], <8 x i32> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX512-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 -24
+; AVX512-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 -7
+; AVX512-NEXT:    [[WIDE_LOAD10:%.*]] = load <8 x i32>, ptr [[TMP15]], align 4, !alias.scope !31
+; AVX512-NEXT:    [[REVERSE11:%.*]] = shufflevector <8 x i32> [[WIDE_LOAD10]], <8 x i32> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX512-NEXT:    [[TMP16:%.*]] = icmp sgt <8 x i32> [[REVERSE]], zeroinitializer
+; AVX512-NEXT:    [[TMP17:%.*]] = icmp sgt <8 x i32> [[REVERSE7]], zeroinitializer
+; AVX512-NEXT:    [[TMP18:%.*]] = icmp sgt <8 x i32> [[REVERSE9]], zeroinitializer
+; AVX512-NEXT:    [[TMP19:%.*]] = icmp sgt <8 x i32> [[REVERSE11]], zeroinitializer
+; AVX512-NEXT:    [[TMP20:%.*]] = getelementptr double, ptr [[IN]], i64 [[TMP0]]
+; AVX512-NEXT:    [[TMP21:%.*]] = getelementptr double, ptr [[IN]], i64 [[TMP1]]
+; AVX512-NEXT:    [[TMP22:%.*]] = getelementptr double, ptr [[IN]], i64 [[TMP2]]
+; AVX512-NEXT:    [[TMP23:%.*]] = getelementptr double, ptr [[IN]], i64 [[TMP3]]
+; AVX512-NEXT:    [[TMP24:%.*]] = getelementptr double, ptr [[TMP20]], i32 0
+; AVX512-NEXT:    [[TMP25:%.*]] = getelementptr double, ptr [[TMP24]], i32 -7
+; AVX512-NEXT:    [[REVERSE12:%.*]] = shufflevector <8 x i1> [[TMP16]], <8 x i1> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX512-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr [[TMP25]], i32 8, <8 x i1> [[REVERSE12]], <8 x double> poison), !alias.scope !34
+; AVX512-NEXT:    [[REVERSE13:%.*]] = shufflevector <8 x double> [[WIDE_MASKED_LOAD]], <8 x double> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX512-NEXT:    [[TMP26:%.*]] = getelementptr double, ptr [[TMP20]], i32 -8
+; AVX512-NEXT:    [[TMP27:%.*]] = getelementptr double, ptr [[TMP26]], i32 -7
+; AVX512-NEXT:    [[REVERSE14:%.*]] = shufflevector <8 x i1> [[TMP17]], <8 x i1> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX512-NEXT:    [[WIDE_MASKED_LOAD15:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr [[TMP27]], i32 8, <8 x i1> [[REVERSE14]], <8 x double> poison), !alias.scope !34
+; AVX512-NEXT:    [[REVERSE16:%.*]] = shufflevector <8 x double> [[WIDE_MASKED_LOAD15]], <8 x double> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX512-NEXT:    [[TMP28:%.*]] = getelementptr double, ptr [[TMP20]], i32 -16
+; AVX512-NEXT:    [[TMP29:%.*]] = getelementptr double, ptr [[TMP28]], i32 -7
+; AVX512-NEXT:    [[REVERSE17:%.*]] = shufflevector <8 x i1> [[TMP18]], <8 x i1> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX512-NEXT:    [[WIDE_MASKED_LOAD18:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr [[TMP29]], i32 8, <8 x i1> [[REVERSE17]], <8 x double> poison), !alias.scope !34
+; AVX512-NEXT:    [[REVERSE19:%.*]] = shufflevector <8 x double> [[WIDE_MASKED_LOAD18]], <8 x double> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX512-NEXT:    [[TMP30:%.*]] = getelementptr double, ptr [[TMP20]], i32 -24
+; AVX512-NEXT:    [[TMP31:%.*]] = getelementptr double, ptr [[TMP30]], i32 -7
+; AVX512-NEXT:    [[REVERSE20:%.*]] = shufflevector <8 x i1> [[TMP19]], <8 x i1> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX512-NEXT:    [[WIDE_MASKED_LOAD21:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr [[TMP31]], i32 8, <8 x i1> [[REVERSE20]], <8 x double> poison), !alias.scope !34
 ; AVX512-NEXT:    [[REVERSE22:%.*]] = shufflevector <8 x double> [[WIDE_MASKED_LOAD21]], <8 x double> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX512-NEXT:    [[TMP34:%.*]] = getelementptr double, double* [[TMP24]], i32 -16
-; AVX512-NEXT:    [[TMP35:%.*]] = getelementptr double, double* [[TMP34]], i32 -7
-; AVX512-NEXT:    [[REVERSE23:%.*]] = shufflevector <8 x i1> [[TMP22]], <8 x i1> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX512-NEXT:    [[TMP36:%.*]] = bitcast double* [[TMP35]] to <8 x double>*
-; AVX512-NEXT:    [[WIDE_MASKED_LOAD24:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP36]], i32 8, <8 x i1> [[REVERSE23]], <8 x double> poison), !alias.scope !34
-; AVX512-NEXT:    [[REVERSE25:%.*]] = shufflevector <8 x double> [[WIDE_MASKED_LOAD24]], <8 x double> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX512-NEXT:    [[TMP37:%.*]] = getelementptr double, double* [[TMP24]], i32 -24
-; AVX512-NEXT:    [[TMP38:%.*]] = getelementptr double, double* [[TMP37]], i32 -7
-; AVX512-NEXT:    [[REVERSE26:%.*]] = shufflevector <8 x i1> [[TMP23]], <8 x i1> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX512-NEXT:    [[TMP39:%.*]] = bitcast double* [[TMP38]] to <8 x double>*
-; AVX512-NEXT:    [[WIDE_MASKED_LOAD27:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP39]], i32 8, <8 x i1> [[REVERSE26]], <8 x double> poison), !alias.scope !34
-; AVX512-NEXT:    [[REVERSE28:%.*]] = shufflevector <8 x double> [[WIDE_MASKED_LOAD27]], <8 x double> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX512-NEXT:    [[TMP40:%.*]] = fadd <8 x double> [[REVERSE19]], <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>
-; AVX512-NEXT:    [[TMP41:%.*]] = fadd <8 x double> [[REVERSE22]], <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>
-; AVX512-NEXT:    [[TMP42:%.*]] = fadd <8 x double> [[REVERSE25]], <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>
-; AVX512-NEXT:    [[TMP43:%.*]] = fadd <8 x double> [[REVERSE28]], <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>
-; AVX512-NEXT:    [[TMP44:%.*]] = getelementptr double, double* [[OUT]], i64 [[TMP0]]
-; AVX512-NEXT:    [[TMP45:%.*]] = getelementptr double, double* [[OUT]], i64 [[TMP1]]
-; AVX512-NEXT:    [[TMP46:%.*]] = getelementptr double, double* [[OUT]], i64 [[TMP2]]
-; AVX512-NEXT:    [[TMP47:%.*]] = getelementptr double, double* [[OUT]], i64 [[TMP3]]
-; AVX512-NEXT:    [[REVERSE29:%.*]] = shufflevector <8 x double> [[TMP40]], <8 x double> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX512-NEXT:    [[TMP48:%.*]] = getelementptr double, double* [[TMP44]], i32 0
-; AVX512-NEXT:    [[TMP49:%.*]] = getelementptr double, double* [[TMP48]], i32 -7
-; AVX512-NEXT:    [[TMP50:%.*]] = bitcast double* [[TMP49]] to <8 x double>*
-; AVX512-NEXT:    call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[REVERSE29]], <8 x double>* [[TMP50]], i32 8, <8 x i1> [[REVERSE18]]), !alias.scope !36, !noalias !38
-; AVX512-NEXT:    [[REVERSE31:%.*]] = shufflevector <8 x double> [[TMP41]], <8 x double> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX512-NEXT:    [[TMP51:%.*]] = getelementptr double, double* [[TMP44]], i32 -8
-; AVX512-NEXT:    [[TMP52:%.*]] = getelementptr double, double* [[TMP51]], i32 -7
-; AVX512-NEXT:    [[TMP53:%.*]] = bitcast double* [[TMP52]] to <8 x double>*
-; AVX512-NEXT:    call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[REVERSE31]], <8 x double>* [[TMP53]], i32 8, <8 x i1> [[REVERSE20]]), !alias.scope !36, !noalias !38
-; AVX512-NEXT:    [[REVERSE33:%.*]] = shufflevector <8 x double> [[TMP42]], <8 x double> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX512-NEXT:    [[TMP54:%.*]] = getelementptr double, double* [[TMP44]], i32 -16
-; AVX512-NEXT:    [[TMP55:%.*]] = getelementptr double, double* [[TMP54]], i32 -7
-; AVX512-NEXT:    [[TMP56:%.*]] = bitcast double* [[TMP55]] to <8 x double>*
-; AVX512-NEXT:    call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[REVERSE33]], <8 x double>* [[TMP56]], i32 8, <8 x i1> [[REVERSE23]]), !alias.scope !36, !noalias !38
-; AVX512-NEXT:    [[REVERSE35:%.*]] = shufflevector <8 x double> [[TMP43]], <8 x double> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX512-NEXT:    [[TMP57:%.*]] = getelementptr double, double* [[TMP44]], i32 -24
-; AVX512-NEXT:    [[TMP58:%.*]] = getelementptr double, double* [[TMP57]], i32 -7
-; AVX512-NEXT:    [[TMP59:%.*]] = bitcast double* [[TMP58]] to <8 x double>*
-; AVX512-NEXT:    call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[REVERSE35]], <8 x double>* [[TMP59]], i32 8, <8 x i1> [[REVERSE26]]), !alias.scope !36, !noalias !38
+; AVX512-NEXT:    [[TMP32:%.*]] = fadd <8 x double> [[REVERSE13]], <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>
+; AVX512-NEXT:    [[TMP33:%.*]] = fadd <8 x double> [[REVERSE16]], <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>
+; AVX512-NEXT:    [[TMP34:%.*]] = fadd <8 x double> [[REVERSE19]], <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>
+; AVX512-NEXT:    [[TMP35:%.*]] = fadd <8 x double> [[REVERSE22]], <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>
+; AVX512-NEXT:    [[TMP36:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP0]]
+; AVX512-NEXT:    [[TMP37:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP1]]
+; AVX512-NEXT:    [[TMP38:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP2]]
+; AVX512-NEXT:    [[TMP39:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP3]]
+; AVX512-NEXT:    [[REVERSE23:%.*]] = shufflevector <8 x double> [[TMP32]], <8 x double> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX512-NEXT:    [[TMP40:%.*]] = getelementptr double, ptr [[TMP36]], i32 0
+; AVX512-NEXT:    [[TMP41:%.*]] = getelementptr double, ptr [[TMP40]], i32 -7
+; AVX512-NEXT:    call void @llvm.masked.store.v8f64.p0(<8 x double> [[REVERSE23]], ptr [[TMP41]], i32 8, <8 x i1> [[REVERSE12]]), !alias.scope !36, !noalias !38
+; AVX512-NEXT:    [[REVERSE25:%.*]] = shufflevector <8 x double> [[TMP33]], <8 x double> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX512-NEXT:    [[TMP42:%.*]] = getelementptr double, ptr [[TMP36]], i32 -8
+; AVX512-NEXT:    [[TMP43:%.*]] = getelementptr double, ptr [[TMP42]], i32 -7
+; AVX512-NEXT:    call void @llvm.masked.store.v8f64.p0(<8 x double> [[REVERSE25]], ptr [[TMP43]], i32 8, <8 x i1> [[REVERSE14]]), !alias.scope !36, !noalias !38
+; AVX512-NEXT:    [[REVERSE27:%.*]] = shufflevector <8 x double> [[TMP34]], <8 x double> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX512-NEXT:    [[TMP44:%.*]] = getelementptr double, ptr [[TMP36]], i32 -16
+; AVX512-NEXT:    [[TMP45:%.*]] = getelementptr double, ptr [[TMP44]], i32 -7
+; AVX512-NEXT:    call void @llvm.masked.store.v8f64.p0(<8 x double> [[REVERSE27]], ptr [[TMP45]], i32 8, <8 x i1> [[REVERSE17]]), !alias.scope !36, !noalias !38
+; AVX512-NEXT:    [[REVERSE29:%.*]] = shufflevector <8 x double> [[TMP35]], <8 x double> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX512-NEXT:    [[TMP46:%.*]] = getelementptr double, ptr [[TMP36]], i32 -24
+; AVX512-NEXT:    [[TMP47:%.*]] = getelementptr double, ptr [[TMP46]], i32 -7
+; AVX512-NEXT:    call void @llvm.masked.store.v8f64.p0(<8 x double> [[REVERSE29]], ptr [[TMP47]], i32 8, <8 x i1> [[REVERSE20]]), !alias.scope !36, !noalias !38
 ; AVX512-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
-; AVX512-NEXT:    [[TMP60:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
-; AVX512-NEXT:    br i1 [[TMP60]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP39:![0-9]+]]
+; AVX512-NEXT:    [[TMP48:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
+; AVX512-NEXT:    br i1 [[TMP48]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP39:![0-9]+]]
 ; AVX512:       middle.block:
 ; AVX512-NEXT:    [[CMP_N:%.*]] = icmp eq i64 4096, 4096
 ; AVX512-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -1754,16 +1586,16 @@ define void @foo6(double* nocapture readonly %in, double* nocapture %out, i32 %s
 ; AVX512-NEXT:    br label [[FOR_BODY:%.*]]
 ; AVX512:       for.body:
 ; AVX512-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
-; AVX512-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]]
-; AVX512-NEXT:    [[TMP61:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; AVX512-NEXT:    [[CMP1:%.*]] = icmp sgt i32 [[TMP61]], 0
+; AVX512-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[INDVARS_IV]]
+; AVX512-NEXT:    [[TMP49:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; AVX512-NEXT:    [[CMP1:%.*]] = icmp sgt i32 [[TMP49]], 0
 ; AVX512-NEXT:    br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
 ; AVX512:       if.then:
-; AVX512-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[INDVARS_IV]]
-; AVX512-NEXT:    [[TMP62:%.*]] = load double, double* [[ARRAYIDX3]], align 8
-; AVX512-NEXT:    [[ADD:%.*]] = fadd double [[TMP62]], 5.000000e-01
-; AVX512-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV]]
-; AVX512-NEXT:    store double [[ADD]], double* [[ARRAYIDX5]], align 8
+; AVX512-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds double, ptr [[IN]], i64 [[INDVARS_IV]]
+; AVX512-NEXT:    [[TMP50:%.*]] = load double, ptr [[ARRAYIDX3]], align 8
+; AVX512-NEXT:    [[ADD:%.*]] = fadd double [[TMP50]], 5.000000e-01
+; AVX512-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds double, ptr [[OUT]], i64 [[INDVARS_IV]]
+; AVX512-NEXT:    store double [[ADD]], ptr [[ARRAYIDX5]], align 8
 ; AVX512-NEXT:    br label [[FOR_INC]]
 ; AVX512:       for.inc:
 ; AVX512-NEXT:    [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1
@@ -1777,17 +1609,17 @@ entry:
 
 for.body:                                         ; preds = %for.inc, %entry
   %indvars.iv = phi i64 [ 4095, %entry ], [ %indvars.iv.next, %for.inc ]
-  %arrayidx = getelementptr inbounds i32, i32* %trigger, i64 %indvars.iv
-  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx = getelementptr inbounds i32, ptr %trigger, i64 %indvars.iv
+  %0 = load i32, ptr %arrayidx, align 4
   %cmp1 = icmp sgt i32 %0, 0
   br i1 %cmp1, label %if.then, label %for.inc
 
 if.then:                                          ; preds = %for.body
-  %arrayidx3 = getelementptr inbounds double, double* %in, i64 %indvars.iv
-  %1 = load double, double* %arrayidx3, align 8
+  %arrayidx3 = getelementptr inbounds double, ptr %in, i64 %indvars.iv
+  %1 = load double, ptr %arrayidx3, align 8
   %add = fadd double %1, 5.000000e-01
-  %arrayidx5 = getelementptr inbounds double, double* %out, i64 %indvars.iv
-  store double %add, double* %arrayidx5, align 8
+  %arrayidx5 = getelementptr inbounds double, ptr %out, i64 %indvars.iv
+  store double %add, ptr %arrayidx5, align 8
   br label %for.inc
 
 for.inc:                                          ; preds = %for.body, %if.then
@@ -1799,7 +1631,7 @@ for.end:                                          ; preds = %for.inc
   ret void
 }
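For context, the mechanical shape of the conversion applied in these hunks: with typed
pointers the pointee type lives in the pointer type itself, so widened accesses need
bitcasts to vector-pointer types, while with opaque pointers each load, store, and
getelementptr names its operation type directly and the pointer operand is just `ptr`.
The bitcasts therefore disappear, and the masked intrinsics lose the pointee part of
their mangling, as in @llvm.masked.load.v8f64.p0v8f64 becoming
@llvm.masked.load.v8f64.p0 above. A minimal sketch of both forms, using a hypothetical
function that is not taken from these tests:

  ; Typed pointers (before): the element type is baked into double*.
  define double @load_elem_typed(double* %p, i64 %i) {
    %gep = getelementptr inbounds double, double* %p, i64 %i
    %v = load double, double* %gep, align 8
    ret double %v
  }

  ; Opaque pointers (after): the element type moves into the using
  ; instructions; the pointer operand type is always plain ptr.
  define double @load_elem_opaque(ptr %p, i64 %i) {
    %gep = getelementptr inbounds double, ptr %p, i64 %i
    %v = load double, ptr %gep, align 8
    ret double %v
  }

Note also that the temporary numbering in the check lines shifts (e.g. [[TMP60]]
becoming [[TMP48]]) simply because each removed bitcast line freed up a value number.
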
 
-; void foo7 (double * __restrict__  out, double ** __restrict__  in,
+; void foo7 (ptr __restrict__  out, ptr __restrict__  in,
 ;           bool * __restrict__ trigger, unsigned size) {
 ;
 ;  for (unsigned i=0; i<size; i++)
@@ -1807,7 +1639,7 @@ for.end:                                          ; preds = %for.inc
 ;      out[i] = (double) 0.5;
 ; }
 
-define void @foo7(double* noalias nocapture %out, double** noalias nocapture readonly %in, i8* noalias nocapture readonly %trigger, i32 %size) local_unnamed_addr #0 {
+define void @foo7(ptr noalias nocapture %out, ptr noalias nocapture readonly %in, ptr noalias nocapture readonly %trigger, i32 %size) local_unnamed_addr #0 {
 ; AVX1-LABEL: @foo7(
 ; AVX1-NEXT:  entry:
 ; AVX1-NEXT:    [[CMP5:%.*]] = icmp eq i32 [[SIZE:%.*]], 0
@@ -1826,81 +1658,69 @@ define void @foo7(double* noalias nocapture %out, double** noalias nocapture rea
 ; AVX1-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 4
 ; AVX1-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 8
 ; AVX1-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 12
-; AVX1-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER:%.*]], i64 [[TMP0]]
-; AVX1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[TMP1]]
-; AVX1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[TMP2]]
-; AVX1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[TMP3]]
-; AVX1-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 0
-; AVX1-NEXT:    [[TMP9:%.*]] = bitcast i8* [[TMP8]] to <4 x i8>*
-; AVX1-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP9]], align 1
-; AVX1-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 4
-; AVX1-NEXT:    [[TMP11:%.*]] = bitcast i8* [[TMP10]] to <4 x i8>*
-; AVX1-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i8>, <4 x i8>* [[TMP11]], align 1
-; AVX1-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 8
-; AVX1-NEXT:    [[TMP13:%.*]] = bitcast i8* [[TMP12]] to <4 x i8>*
-; AVX1-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP13]], align 1
-; AVX1-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 12
-; AVX1-NEXT:    [[TMP15:%.*]] = bitcast i8* [[TMP14]] to <4 x i8>*
-; AVX1-NEXT:    [[WIDE_LOAD3:%.*]] = load <4 x i8>, <4 x i8>* [[TMP15]], align 1
-; AVX1-NEXT:    [[TMP16:%.*]] = and <4 x i8> [[WIDE_LOAD]], <i8 1, i8 1, i8 1, i8 1>
-; AVX1-NEXT:    [[TMP17:%.*]] = and <4 x i8> [[WIDE_LOAD1]], <i8 1, i8 1, i8 1, i8 1>
-; AVX1-NEXT:    [[TMP18:%.*]] = and <4 x i8> [[WIDE_LOAD2]], <i8 1, i8 1, i8 1, i8 1>
-; AVX1-NEXT:    [[TMP19:%.*]] = and <4 x i8> [[WIDE_LOAD3]], <i8 1, i8 1, i8 1, i8 1>
-; AVX1-NEXT:    [[TMP20:%.*]] = icmp eq <4 x i8> [[TMP16]], zeroinitializer
-; AVX1-NEXT:    [[TMP21:%.*]] = icmp eq <4 x i8> [[TMP17]], zeroinitializer
-; AVX1-NEXT:    [[TMP22:%.*]] = icmp eq <4 x i8> [[TMP18]], zeroinitializer
-; AVX1-NEXT:    [[TMP23:%.*]] = icmp eq <4 x i8> [[TMP19]], zeroinitializer
-; AVX1-NEXT:    [[TMP24:%.*]] = getelementptr double*, double** [[IN:%.*]], i64 [[TMP0]]
-; AVX1-NEXT:    [[TMP25:%.*]] = getelementptr double*, double** [[IN]], i64 [[TMP1]]
-; AVX1-NEXT:    [[TMP26:%.*]] = getelementptr double*, double** [[IN]], i64 [[TMP2]]
-; AVX1-NEXT:    [[TMP27:%.*]] = getelementptr double*, double** [[IN]], i64 [[TMP3]]
-; AVX1-NEXT:    [[TMP28:%.*]] = xor <4 x i1> [[TMP20]], <i1 true, i1 true, i1 true, i1 true>
-; AVX1-NEXT:    [[TMP29:%.*]] = xor <4 x i1> [[TMP21]], <i1 true, i1 true, i1 true, i1 true>
-; AVX1-NEXT:    [[TMP30:%.*]] = xor <4 x i1> [[TMP22]], <i1 true, i1 true, i1 true, i1 true>
-; AVX1-NEXT:    [[TMP31:%.*]] = xor <4 x i1> [[TMP23]], <i1 true, i1 true, i1 true, i1 true>
-; AVX1-NEXT:    [[TMP32:%.*]] = getelementptr double*, double** [[TMP24]], i32 0
-; AVX1-NEXT:    [[TMP33:%.*]] = bitcast double** [[TMP32]] to <4 x double*>*
-; AVX1-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP33]], i32 8, <4 x i1> [[TMP28]], <4 x double*> poison)
-; AVX1-NEXT:    [[TMP34:%.*]] = getelementptr double*, double** [[TMP24]], i32 4
-; AVX1-NEXT:    [[TMP35:%.*]] = bitcast double** [[TMP34]] to <4 x double*>*
-; AVX1-NEXT:    [[WIDE_MASKED_LOAD4:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP35]], i32 8, <4 x i1> [[TMP29]], <4 x double*> poison)
-; AVX1-NEXT:    [[TMP36:%.*]] = getelementptr double*, double** [[TMP24]], i32 8
-; AVX1-NEXT:    [[TMP37:%.*]] = bitcast double** [[TMP36]] to <4 x double*>*
-; AVX1-NEXT:    [[WIDE_MASKED_LOAD5:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP37]], i32 8, <4 x i1> [[TMP30]], <4 x double*> poison)
-; AVX1-NEXT:    [[TMP38:%.*]] = getelementptr double*, double** [[TMP24]], i32 12
-; AVX1-NEXT:    [[TMP39:%.*]] = bitcast double** [[TMP38]] to <4 x double*>*
-; AVX1-NEXT:    [[WIDE_MASKED_LOAD6:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP39]], i32 8, <4 x i1> [[TMP31]], <4 x double*> poison)
-; AVX1-NEXT:    [[TMP40:%.*]] = icmp eq <4 x double*> [[WIDE_MASKED_LOAD]], zeroinitializer
-; AVX1-NEXT:    [[TMP41:%.*]] = icmp eq <4 x double*> [[WIDE_MASKED_LOAD4]], zeroinitializer
-; AVX1-NEXT:    [[TMP42:%.*]] = icmp eq <4 x double*> [[WIDE_MASKED_LOAD5]], zeroinitializer
-; AVX1-NEXT:    [[TMP43:%.*]] = icmp eq <4 x double*> [[WIDE_MASKED_LOAD6]], zeroinitializer
-; AVX1-NEXT:    [[TMP44:%.*]] = getelementptr double, double* [[OUT:%.*]], i64 [[TMP0]]
-; AVX1-NEXT:    [[TMP45:%.*]] = getelementptr double, double* [[OUT]], i64 [[TMP1]]
-; AVX1-NEXT:    [[TMP46:%.*]] = getelementptr double, double* [[OUT]], i64 [[TMP2]]
-; AVX1-NEXT:    [[TMP47:%.*]] = getelementptr double, double* [[OUT]], i64 [[TMP3]]
-; AVX1-NEXT:    [[TMP48:%.*]] = xor <4 x i1> [[TMP40]], <i1 true, i1 true, i1 true, i1 true>
-; AVX1-NEXT:    [[TMP49:%.*]] = xor <4 x i1> [[TMP41]], <i1 true, i1 true, i1 true, i1 true>
-; AVX1-NEXT:    [[TMP50:%.*]] = xor <4 x i1> [[TMP42]], <i1 true, i1 true, i1 true, i1 true>
-; AVX1-NEXT:    [[TMP51:%.*]] = xor <4 x i1> [[TMP43]], <i1 true, i1 true, i1 true, i1 true>
-; AVX1-NEXT:    [[TMP52:%.*]] = select <4 x i1> [[TMP28]], <4 x i1> [[TMP48]], <4 x i1> zeroinitializer
-; AVX1-NEXT:    [[TMP53:%.*]] = select <4 x i1> [[TMP29]], <4 x i1> [[TMP49]], <4 x i1> zeroinitializer
-; AVX1-NEXT:    [[TMP54:%.*]] = select <4 x i1> [[TMP30]], <4 x i1> [[TMP50]], <4 x i1> zeroinitializer
-; AVX1-NEXT:    [[TMP55:%.*]] = select <4 x i1> [[TMP31]], <4 x i1> [[TMP51]], <4 x i1> zeroinitializer
-; AVX1-NEXT:    [[TMP56:%.*]] = getelementptr double, double* [[TMP44]], i32 0
-; AVX1-NEXT:    [[TMP57:%.*]] = bitcast double* [[TMP56]] to <4 x double>*
-; AVX1-NEXT:    call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP57]], i32 8, <4 x i1> [[TMP52]])
-; AVX1-NEXT:    [[TMP58:%.*]] = getelementptr double, double* [[TMP44]], i32 4
-; AVX1-NEXT:    [[TMP59:%.*]] = bitcast double* [[TMP58]] to <4 x double>*
-; AVX1-NEXT:    call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP59]], i32 8, <4 x i1> [[TMP53]])
-; AVX1-NEXT:    [[TMP60:%.*]] = getelementptr double, double* [[TMP44]], i32 8
-; AVX1-NEXT:    [[TMP61:%.*]] = bitcast double* [[TMP60]] to <4 x double>*
-; AVX1-NEXT:    call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP61]], i32 8, <4 x i1> [[TMP54]])
-; AVX1-NEXT:    [[TMP62:%.*]] = getelementptr double, double* [[TMP44]], i32 12
-; AVX1-NEXT:    [[TMP63:%.*]] = bitcast double* [[TMP62]] to <4 x double>*
-; AVX1-NEXT:    call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP63]], i32 8, <4 x i1> [[TMP55]])
+; AVX1-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER:%.*]], i64 [[TMP0]]
+; AVX1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER]], i64 [[TMP1]]
+; AVX1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER]], i64 [[TMP2]]
+; AVX1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER]], i64 [[TMP3]]
+; AVX1-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
+; AVX1-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP8]], align 1
+; AVX1-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 4
+; AVX1-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP9]], align 1
+; AVX1-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 8
+; AVX1-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP10]], align 1
+; AVX1-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 12
+; AVX1-NEXT:    [[WIDE_LOAD3:%.*]] = load <4 x i8>, ptr [[TMP11]], align 1
+; AVX1-NEXT:    [[TMP12:%.*]] = and <4 x i8> [[WIDE_LOAD]], <i8 1, i8 1, i8 1, i8 1>
+; AVX1-NEXT:    [[TMP13:%.*]] = and <4 x i8> [[WIDE_LOAD1]], <i8 1, i8 1, i8 1, i8 1>
+; AVX1-NEXT:    [[TMP14:%.*]] = and <4 x i8> [[WIDE_LOAD2]], <i8 1, i8 1, i8 1, i8 1>
+; AVX1-NEXT:    [[TMP15:%.*]] = and <4 x i8> [[WIDE_LOAD3]], <i8 1, i8 1, i8 1, i8 1>
+; AVX1-NEXT:    [[TMP16:%.*]] = icmp eq <4 x i8> [[TMP12]], zeroinitializer
+; AVX1-NEXT:    [[TMP17:%.*]] = icmp eq <4 x i8> [[TMP13]], zeroinitializer
+; AVX1-NEXT:    [[TMP18:%.*]] = icmp eq <4 x i8> [[TMP14]], zeroinitializer
+; AVX1-NEXT:    [[TMP19:%.*]] = icmp eq <4 x i8> [[TMP15]], zeroinitializer
+; AVX1-NEXT:    [[TMP20:%.*]] = getelementptr ptr, ptr [[IN:%.*]], i64 [[TMP0]]
+; AVX1-NEXT:    [[TMP21:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[TMP1]]
+; AVX1-NEXT:    [[TMP22:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[TMP2]]
+; AVX1-NEXT:    [[TMP23:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[TMP3]]
+; AVX1-NEXT:    [[TMP24:%.*]] = xor <4 x i1> [[TMP16]], <i1 true, i1 true, i1 true, i1 true>
+; AVX1-NEXT:    [[TMP25:%.*]] = xor <4 x i1> [[TMP17]], <i1 true, i1 true, i1 true, i1 true>
+; AVX1-NEXT:    [[TMP26:%.*]] = xor <4 x i1> [[TMP18]], <i1 true, i1 true, i1 true, i1 true>
+; AVX1-NEXT:    [[TMP27:%.*]] = xor <4 x i1> [[TMP19]], <i1 true, i1 true, i1 true, i1 true>
+; AVX1-NEXT:    [[TMP28:%.*]] = getelementptr ptr, ptr [[TMP20]], i32 0
+; AVX1-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP28]], i32 8, <4 x i1> [[TMP24]], <4 x ptr> poison)
+; AVX1-NEXT:    [[TMP29:%.*]] = getelementptr ptr, ptr [[TMP20]], i32 4
+; AVX1-NEXT:    [[WIDE_MASKED_LOAD4:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP29]], i32 8, <4 x i1> [[TMP25]], <4 x ptr> poison)
+; AVX1-NEXT:    [[TMP30:%.*]] = getelementptr ptr, ptr [[TMP20]], i32 8
+; AVX1-NEXT:    [[WIDE_MASKED_LOAD5:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP30]], i32 8, <4 x i1> [[TMP26]], <4 x ptr> poison)
+; AVX1-NEXT:    [[TMP31:%.*]] = getelementptr ptr, ptr [[TMP20]], i32 12
+; AVX1-NEXT:    [[WIDE_MASKED_LOAD6:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP31]], i32 8, <4 x i1> [[TMP27]], <4 x ptr> poison)
+; AVX1-NEXT:    [[TMP32:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD]], zeroinitializer
+; AVX1-NEXT:    [[TMP33:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD4]], zeroinitializer
+; AVX1-NEXT:    [[TMP34:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD5]], zeroinitializer
+; AVX1-NEXT:    [[TMP35:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD6]], zeroinitializer
+; AVX1-NEXT:    [[TMP36:%.*]] = getelementptr double, ptr [[OUT:%.*]], i64 [[TMP0]]
+; AVX1-NEXT:    [[TMP37:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP1]]
+; AVX1-NEXT:    [[TMP38:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP2]]
+; AVX1-NEXT:    [[TMP39:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP3]]
+; AVX1-NEXT:    [[TMP40:%.*]] = xor <4 x i1> [[TMP32]], <i1 true, i1 true, i1 true, i1 true>
+; AVX1-NEXT:    [[TMP41:%.*]] = xor <4 x i1> [[TMP33]], <i1 true, i1 true, i1 true, i1 true>
+; AVX1-NEXT:    [[TMP42:%.*]] = xor <4 x i1> [[TMP34]], <i1 true, i1 true, i1 true, i1 true>
+; AVX1-NEXT:    [[TMP43:%.*]] = xor <4 x i1> [[TMP35]], <i1 true, i1 true, i1 true, i1 true>
+; AVX1-NEXT:    [[TMP44:%.*]] = select <4 x i1> [[TMP24]], <4 x i1> [[TMP40]], <4 x i1> zeroinitializer
+; AVX1-NEXT:    [[TMP45:%.*]] = select <4 x i1> [[TMP25]], <4 x i1> [[TMP41]], <4 x i1> zeroinitializer
+; AVX1-NEXT:    [[TMP46:%.*]] = select <4 x i1> [[TMP26]], <4 x i1> [[TMP42]], <4 x i1> zeroinitializer
+; AVX1-NEXT:    [[TMP47:%.*]] = select <4 x i1> [[TMP27]], <4 x i1> [[TMP43]], <4 x i1> zeroinitializer
+; AVX1-NEXT:    [[TMP48:%.*]] = getelementptr double, ptr [[TMP36]], i32 0
+; AVX1-NEXT:    call void @llvm.masked.store.v4f64.p0(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, ptr [[TMP48]], i32 8, <4 x i1> [[TMP44]])
+; AVX1-NEXT:    [[TMP49:%.*]] = getelementptr double, ptr [[TMP36]], i32 4
+; AVX1-NEXT:    call void @llvm.masked.store.v4f64.p0(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, ptr [[TMP49]], i32 8, <4 x i1> [[TMP45]])
+; AVX1-NEXT:    [[TMP50:%.*]] = getelementptr double, ptr [[TMP36]], i32 8
+; AVX1-NEXT:    call void @llvm.masked.store.v4f64.p0(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, ptr [[TMP50]], i32 8, <4 x i1> [[TMP46]])
+; AVX1-NEXT:    [[TMP51:%.*]] = getelementptr double, ptr [[TMP36]], i32 12
+; AVX1-NEXT:    call void @llvm.masked.store.v4f64.p0(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, ptr [[TMP51]], i32 8, <4 x i1> [[TMP47]])
 ; AVX1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; AVX1-NEXT:    [[TMP64:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; AVX1-NEXT:    br i1 [[TMP64]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
+; AVX1-NEXT:    [[TMP52:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; AVX1-NEXT:    br i1 [[TMP52]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
 ; AVX1:       middle.block:
 ; AVX1-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
 ; AVX1-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
@@ -1909,19 +1729,19 @@ define void @foo7(double* noalias nocapture %out, double** noalias nocapture rea
 ; AVX1-NEXT:    br label [[FOR_BODY:%.*]]
 ; AVX1:       for.body:
 ; AVX1-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
-; AVX1-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[INDVARS_IV]]
-; AVX1-NEXT:    [[TMP65:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
-; AVX1-NEXT:    [[TMP66:%.*]] = and i8 [[TMP65]], 1
-; AVX1-NEXT:    [[TOBOOL:%.*]] = icmp eq i8 [[TMP66]], 0
+; AVX1-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER]], i64 [[INDVARS_IV]]
+; AVX1-NEXT:    [[TMP53:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; AVX1-NEXT:    [[TMP54:%.*]] = and i8 [[TMP53]], 1
+; AVX1-NEXT:    [[TOBOOL:%.*]] = icmp eq i8 [[TMP54]], 0
 ; AVX1-NEXT:    br i1 [[TOBOOL]], label [[FOR_INC]], label [[LAND_LHS_TRUE:%.*]]
 ; AVX1:       land.lhs.true:
-; AVX1-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds double*, double** [[IN]], i64 [[INDVARS_IV]]
-; AVX1-NEXT:    [[TMP67:%.*]] = load double*, double** [[ARRAYIDX2]], align 8
-; AVX1-NEXT:    [[CMP3:%.*]] = icmp eq double* [[TMP67]], null
+; AVX1-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds ptr, ptr [[IN]], i64 [[INDVARS_IV]]
+; AVX1-NEXT:    [[TMP55:%.*]] = load ptr, ptr [[ARRAYIDX2]], align 8
+; AVX1-NEXT:    [[CMP3:%.*]] = icmp eq ptr [[TMP55]], null
 ; AVX1-NEXT:    br i1 [[CMP3]], label [[FOR_INC]], label [[IF_THEN:%.*]]
 ; AVX1:       if.then:
-; AVX1-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV]]
-; AVX1-NEXT:    store double 5.000000e-01, double* [[ARRAYIDX5]], align 8
+; AVX1-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds double, ptr [[OUT]], i64 [[INDVARS_IV]]
+; AVX1-NEXT:    store double 5.000000e-01, ptr [[ARRAYIDX5]], align 8
 ; AVX1-NEXT:    br label [[FOR_INC]]
 ; AVX1:       for.inc:
 ; AVX1-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
@@ -1950,81 +1770,69 @@ define void @foo7(double* noalias nocapture %out, double** noalias nocapture rea
 ; AVX2-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 4
 ; AVX2-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 8
 ; AVX2-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 12
-; AVX2-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER:%.*]], i64 [[TMP0]]
-; AVX2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[TMP1]]
-; AVX2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[TMP2]]
-; AVX2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[TMP3]]
-; AVX2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 0
-; AVX2-NEXT:    [[TMP9:%.*]] = bitcast i8* [[TMP8]] to <4 x i8>*
-; AVX2-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP9]], align 1
-; AVX2-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 4
-; AVX2-NEXT:    [[TMP11:%.*]] = bitcast i8* [[TMP10]] to <4 x i8>*
-; AVX2-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i8>, <4 x i8>* [[TMP11]], align 1
-; AVX2-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 8
-; AVX2-NEXT:    [[TMP13:%.*]] = bitcast i8* [[TMP12]] to <4 x i8>*
-; AVX2-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP13]], align 1
-; AVX2-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 12
-; AVX2-NEXT:    [[TMP15:%.*]] = bitcast i8* [[TMP14]] to <4 x i8>*
-; AVX2-NEXT:    [[WIDE_LOAD3:%.*]] = load <4 x i8>, <4 x i8>* [[TMP15]], align 1
-; AVX2-NEXT:    [[TMP16:%.*]] = and <4 x i8> [[WIDE_LOAD]], <i8 1, i8 1, i8 1, i8 1>
-; AVX2-NEXT:    [[TMP17:%.*]] = and <4 x i8> [[WIDE_LOAD1]], <i8 1, i8 1, i8 1, i8 1>
-; AVX2-NEXT:    [[TMP18:%.*]] = and <4 x i8> [[WIDE_LOAD2]], <i8 1, i8 1, i8 1, i8 1>
-; AVX2-NEXT:    [[TMP19:%.*]] = and <4 x i8> [[WIDE_LOAD3]], <i8 1, i8 1, i8 1, i8 1>
-; AVX2-NEXT:    [[TMP20:%.*]] = icmp eq <4 x i8> [[TMP16]], zeroinitializer
-; AVX2-NEXT:    [[TMP21:%.*]] = icmp eq <4 x i8> [[TMP17]], zeroinitializer
-; AVX2-NEXT:    [[TMP22:%.*]] = icmp eq <4 x i8> [[TMP18]], zeroinitializer
-; AVX2-NEXT:    [[TMP23:%.*]] = icmp eq <4 x i8> [[TMP19]], zeroinitializer
-; AVX2-NEXT:    [[TMP24:%.*]] = getelementptr double*, double** [[IN:%.*]], i64 [[TMP0]]
-; AVX2-NEXT:    [[TMP25:%.*]] = getelementptr double*, double** [[IN]], i64 [[TMP1]]
-; AVX2-NEXT:    [[TMP26:%.*]] = getelementptr double*, double** [[IN]], i64 [[TMP2]]
-; AVX2-NEXT:    [[TMP27:%.*]] = getelementptr double*, double** [[IN]], i64 [[TMP3]]
-; AVX2-NEXT:    [[TMP28:%.*]] = xor <4 x i1> [[TMP20]], <i1 true, i1 true, i1 true, i1 true>
-; AVX2-NEXT:    [[TMP29:%.*]] = xor <4 x i1> [[TMP21]], <i1 true, i1 true, i1 true, i1 true>
-; AVX2-NEXT:    [[TMP30:%.*]] = xor <4 x i1> [[TMP22]], <i1 true, i1 true, i1 true, i1 true>
-; AVX2-NEXT:    [[TMP31:%.*]] = xor <4 x i1> [[TMP23]], <i1 true, i1 true, i1 true, i1 true>
-; AVX2-NEXT:    [[TMP32:%.*]] = getelementptr double*, double** [[TMP24]], i32 0
-; AVX2-NEXT:    [[TMP33:%.*]] = bitcast double** [[TMP32]] to <4 x double*>*
-; AVX2-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP33]], i32 8, <4 x i1> [[TMP28]], <4 x double*> poison)
-; AVX2-NEXT:    [[TMP34:%.*]] = getelementptr double*, double** [[TMP24]], i32 4
-; AVX2-NEXT:    [[TMP35:%.*]] = bitcast double** [[TMP34]] to <4 x double*>*
-; AVX2-NEXT:    [[WIDE_MASKED_LOAD4:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP35]], i32 8, <4 x i1> [[TMP29]], <4 x double*> poison)
-; AVX2-NEXT:    [[TMP36:%.*]] = getelementptr double*, double** [[TMP24]], i32 8
-; AVX2-NEXT:    [[TMP37:%.*]] = bitcast double** [[TMP36]] to <4 x double*>*
-; AVX2-NEXT:    [[WIDE_MASKED_LOAD5:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP37]], i32 8, <4 x i1> [[TMP30]], <4 x double*> poison)
-; AVX2-NEXT:    [[TMP38:%.*]] = getelementptr double*, double** [[TMP24]], i32 12
-; AVX2-NEXT:    [[TMP39:%.*]] = bitcast double** [[TMP38]] to <4 x double*>*
-; AVX2-NEXT:    [[WIDE_MASKED_LOAD6:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP39]], i32 8, <4 x i1> [[TMP31]], <4 x double*> poison)
-; AVX2-NEXT:    [[TMP40:%.*]] = icmp eq <4 x double*> [[WIDE_MASKED_LOAD]], zeroinitializer
-; AVX2-NEXT:    [[TMP41:%.*]] = icmp eq <4 x double*> [[WIDE_MASKED_LOAD4]], zeroinitializer
-; AVX2-NEXT:    [[TMP42:%.*]] = icmp eq <4 x double*> [[WIDE_MASKED_LOAD5]], zeroinitializer
-; AVX2-NEXT:    [[TMP43:%.*]] = icmp eq <4 x double*> [[WIDE_MASKED_LOAD6]], zeroinitializer
-; AVX2-NEXT:    [[TMP44:%.*]] = getelementptr double, double* [[OUT:%.*]], i64 [[TMP0]]
-; AVX2-NEXT:    [[TMP45:%.*]] = getelementptr double, double* [[OUT]], i64 [[TMP1]]
-; AVX2-NEXT:    [[TMP46:%.*]] = getelementptr double, double* [[OUT]], i64 [[TMP2]]
-; AVX2-NEXT:    [[TMP47:%.*]] = getelementptr double, double* [[OUT]], i64 [[TMP3]]
-; AVX2-NEXT:    [[TMP48:%.*]] = xor <4 x i1> [[TMP40]], <i1 true, i1 true, i1 true, i1 true>
-; AVX2-NEXT:    [[TMP49:%.*]] = xor <4 x i1> [[TMP41]], <i1 true, i1 true, i1 true, i1 true>
-; AVX2-NEXT:    [[TMP50:%.*]] = xor <4 x i1> [[TMP42]], <i1 true, i1 true, i1 true, i1 true>
-; AVX2-NEXT:    [[TMP51:%.*]] = xor <4 x i1> [[TMP43]], <i1 true, i1 true, i1 true, i1 true>
-; AVX2-NEXT:    [[TMP52:%.*]] = select <4 x i1> [[TMP28]], <4 x i1> [[TMP48]], <4 x i1> zeroinitializer
-; AVX2-NEXT:    [[TMP53:%.*]] = select <4 x i1> [[TMP29]], <4 x i1> [[TMP49]], <4 x i1> zeroinitializer
-; AVX2-NEXT:    [[TMP54:%.*]] = select <4 x i1> [[TMP30]], <4 x i1> [[TMP50]], <4 x i1> zeroinitializer
-; AVX2-NEXT:    [[TMP55:%.*]] = select <4 x i1> [[TMP31]], <4 x i1> [[TMP51]], <4 x i1> zeroinitializer
-; AVX2-NEXT:    [[TMP56:%.*]] = getelementptr double, double* [[TMP44]], i32 0
-; AVX2-NEXT:    [[TMP57:%.*]] = bitcast double* [[TMP56]] to <4 x double>*
-; AVX2-NEXT:    call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP57]], i32 8, <4 x i1> [[TMP52]])
-; AVX2-NEXT:    [[TMP58:%.*]] = getelementptr double, double* [[TMP44]], i32 4
-; AVX2-NEXT:    [[TMP59:%.*]] = bitcast double* [[TMP58]] to <4 x double>*
-; AVX2-NEXT:    call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP59]], i32 8, <4 x i1> [[TMP53]])
-; AVX2-NEXT:    [[TMP60:%.*]] = getelementptr double, double* [[TMP44]], i32 8
-; AVX2-NEXT:    [[TMP61:%.*]] = bitcast double* [[TMP60]] to <4 x double>*
-; AVX2-NEXT:    call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP61]], i32 8, <4 x i1> [[TMP54]])
-; AVX2-NEXT:    [[TMP62:%.*]] = getelementptr double, double* [[TMP44]], i32 12
-; AVX2-NEXT:    [[TMP63:%.*]] = bitcast double* [[TMP62]] to <4 x double>*
-; AVX2-NEXT:    call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP63]], i32 8, <4 x i1> [[TMP55]])
+; AVX2-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER:%.*]], i64 [[TMP0]]
+; AVX2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER]], i64 [[TMP1]]
+; AVX2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER]], i64 [[TMP2]]
+; AVX2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER]], i64 [[TMP3]]
+; AVX2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
+; AVX2-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP8]], align 1
+; AVX2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 4
+; AVX2-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP9]], align 1
+; AVX2-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 8
+; AVX2-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP10]], align 1
+; AVX2-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 12
+; AVX2-NEXT:    [[WIDE_LOAD3:%.*]] = load <4 x i8>, ptr [[TMP11]], align 1
+; AVX2-NEXT:    [[TMP12:%.*]] = and <4 x i8> [[WIDE_LOAD]], <i8 1, i8 1, i8 1, i8 1>
+; AVX2-NEXT:    [[TMP13:%.*]] = and <4 x i8> [[WIDE_LOAD1]], <i8 1, i8 1, i8 1, i8 1>
+; AVX2-NEXT:    [[TMP14:%.*]] = and <4 x i8> [[WIDE_LOAD2]], <i8 1, i8 1, i8 1, i8 1>
+; AVX2-NEXT:    [[TMP15:%.*]] = and <4 x i8> [[WIDE_LOAD3]], <i8 1, i8 1, i8 1, i8 1>
+; AVX2-NEXT:    [[TMP16:%.*]] = icmp eq <4 x i8> [[TMP12]], zeroinitializer
+; AVX2-NEXT:    [[TMP17:%.*]] = icmp eq <4 x i8> [[TMP13]], zeroinitializer
+; AVX2-NEXT:    [[TMP18:%.*]] = icmp eq <4 x i8> [[TMP14]], zeroinitializer
+; AVX2-NEXT:    [[TMP19:%.*]] = icmp eq <4 x i8> [[TMP15]], zeroinitializer
+; AVX2-NEXT:    [[TMP20:%.*]] = getelementptr ptr, ptr [[IN:%.*]], i64 [[TMP0]]
+; AVX2-NEXT:    [[TMP21:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[TMP1]]
+; AVX2-NEXT:    [[TMP22:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[TMP2]]
+; AVX2-NEXT:    [[TMP23:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[TMP3]]
+; AVX2-NEXT:    [[TMP24:%.*]] = xor <4 x i1> [[TMP16]], <i1 true, i1 true, i1 true, i1 true>
+; AVX2-NEXT:    [[TMP25:%.*]] = xor <4 x i1> [[TMP17]], <i1 true, i1 true, i1 true, i1 true>
+; AVX2-NEXT:    [[TMP26:%.*]] = xor <4 x i1> [[TMP18]], <i1 true, i1 true, i1 true, i1 true>
+; AVX2-NEXT:    [[TMP27:%.*]] = xor <4 x i1> [[TMP19]], <i1 true, i1 true, i1 true, i1 true>
+; AVX2-NEXT:    [[TMP28:%.*]] = getelementptr ptr, ptr [[TMP20]], i32 0
+; AVX2-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP28]], i32 8, <4 x i1> [[TMP24]], <4 x ptr> poison)
+; AVX2-NEXT:    [[TMP29:%.*]] = getelementptr ptr, ptr [[TMP20]], i32 4
+; AVX2-NEXT:    [[WIDE_MASKED_LOAD4:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP29]], i32 8, <4 x i1> [[TMP25]], <4 x ptr> poison)
+; AVX2-NEXT:    [[TMP30:%.*]] = getelementptr ptr, ptr [[TMP20]], i32 8
+; AVX2-NEXT:    [[WIDE_MASKED_LOAD5:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP30]], i32 8, <4 x i1> [[TMP26]], <4 x ptr> poison)
+; AVX2-NEXT:    [[TMP31:%.*]] = getelementptr ptr, ptr [[TMP20]], i32 12
+; AVX2-NEXT:    [[WIDE_MASKED_LOAD6:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP31]], i32 8, <4 x i1> [[TMP27]], <4 x ptr> poison)
+; AVX2-NEXT:    [[TMP32:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD]], zeroinitializer
+; AVX2-NEXT:    [[TMP33:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD4]], zeroinitializer
+; AVX2-NEXT:    [[TMP34:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD5]], zeroinitializer
+; AVX2-NEXT:    [[TMP35:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD6]], zeroinitializer
+; AVX2-NEXT:    [[TMP36:%.*]] = getelementptr double, ptr [[OUT:%.*]], i64 [[TMP0]]
+; AVX2-NEXT:    [[TMP37:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP1]]
+; AVX2-NEXT:    [[TMP38:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP2]]
+; AVX2-NEXT:    [[TMP39:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP3]]
+; AVX2-NEXT:    [[TMP40:%.*]] = xor <4 x i1> [[TMP32]], <i1 true, i1 true, i1 true, i1 true>
+; AVX2-NEXT:    [[TMP41:%.*]] = xor <4 x i1> [[TMP33]], <i1 true, i1 true, i1 true, i1 true>
+; AVX2-NEXT:    [[TMP42:%.*]] = xor <4 x i1> [[TMP34]], <i1 true, i1 true, i1 true, i1 true>
+; AVX2-NEXT:    [[TMP43:%.*]] = xor <4 x i1> [[TMP35]], <i1 true, i1 true, i1 true, i1 true>
+; AVX2-NEXT:    [[TMP44:%.*]] = select <4 x i1> [[TMP24]], <4 x i1> [[TMP40]], <4 x i1> zeroinitializer
+; AVX2-NEXT:    [[TMP45:%.*]] = select <4 x i1> [[TMP25]], <4 x i1> [[TMP41]], <4 x i1> zeroinitializer
+; AVX2-NEXT:    [[TMP46:%.*]] = select <4 x i1> [[TMP26]], <4 x i1> [[TMP42]], <4 x i1> zeroinitializer
+; AVX2-NEXT:    [[TMP47:%.*]] = select <4 x i1> [[TMP27]], <4 x i1> [[TMP43]], <4 x i1> zeroinitializer
+; AVX2-NEXT:    [[TMP48:%.*]] = getelementptr double, ptr [[TMP36]], i32 0
+; AVX2-NEXT:    call void @llvm.masked.store.v4f64.p0(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, ptr [[TMP48]], i32 8, <4 x i1> [[TMP44]])
+; AVX2-NEXT:    [[TMP49:%.*]] = getelementptr double, ptr [[TMP36]], i32 4
+; AVX2-NEXT:    call void @llvm.masked.store.v4f64.p0(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, ptr [[TMP49]], i32 8, <4 x i1> [[TMP45]])
+; AVX2-NEXT:    [[TMP50:%.*]] = getelementptr double, ptr [[TMP36]], i32 8
+; AVX2-NEXT:    call void @llvm.masked.store.v4f64.p0(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, ptr [[TMP50]], i32 8, <4 x i1> [[TMP46]])
+; AVX2-NEXT:    [[TMP51:%.*]] = getelementptr double, ptr [[TMP36]], i32 12
+; AVX2-NEXT:    call void @llvm.masked.store.v4f64.p0(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, ptr [[TMP51]], i32 8, <4 x i1> [[TMP47]])
 ; AVX2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; AVX2-NEXT:    [[TMP64:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; AVX2-NEXT:    br i1 [[TMP64]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP27:![0-9]+]]
+; AVX2-NEXT:    [[TMP52:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; AVX2-NEXT:    br i1 [[TMP52]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP27:![0-9]+]]
 ; AVX2:       middle.block:
 ; AVX2-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
 ; AVX2-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
@@ -2033,19 +1841,19 @@ define void @foo7(double* noalias nocapture %out, double** noalias nocapture rea
 ; AVX2-NEXT:    br label [[FOR_BODY:%.*]]
 ; AVX2:       for.body:
 ; AVX2-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
-; AVX2-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[INDVARS_IV]]
-; AVX2-NEXT:    [[TMP65:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
-; AVX2-NEXT:    [[TMP66:%.*]] = and i8 [[TMP65]], 1
-; AVX2-NEXT:    [[TOBOOL:%.*]] = icmp eq i8 [[TMP66]], 0
+; AVX2-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER]], i64 [[INDVARS_IV]]
+; AVX2-NEXT:    [[TMP53:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; AVX2-NEXT:    [[TMP54:%.*]] = and i8 [[TMP53]], 1
+; AVX2-NEXT:    [[TOBOOL:%.*]] = icmp eq i8 [[TMP54]], 0
 ; AVX2-NEXT:    br i1 [[TOBOOL]], label [[FOR_INC]], label [[LAND_LHS_TRUE:%.*]]
 ; AVX2:       land.lhs.true:
-; AVX2-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds double*, double** [[IN]], i64 [[INDVARS_IV]]
-; AVX2-NEXT:    [[TMP67:%.*]] = load double*, double** [[ARRAYIDX2]], align 8
-; AVX2-NEXT:    [[CMP3:%.*]] = icmp eq double* [[TMP67]], null
+; AVX2-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds ptr, ptr [[IN]], i64 [[INDVARS_IV]]
+; AVX2-NEXT:    [[TMP55:%.*]] = load ptr, ptr [[ARRAYIDX2]], align 8
+; AVX2-NEXT:    [[CMP3:%.*]] = icmp eq ptr [[TMP55]], null
 ; AVX2-NEXT:    br i1 [[CMP3]], label [[FOR_INC]], label [[IF_THEN:%.*]]
 ; AVX2:       if.then:
-; AVX2-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV]]
-; AVX2-NEXT:    store double 5.000000e-01, double* [[ARRAYIDX5]], align 8
+; AVX2-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds double, ptr [[OUT]], i64 [[INDVARS_IV]]
+; AVX2-NEXT:    store double 5.000000e-01, ptr [[ARRAYIDX5]], align 8
 ; AVX2-NEXT:    br label [[FOR_INC]]
 ; AVX2:       for.inc:
 ; AVX2-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
@@ -2074,81 +1882,69 @@ define void @foo7(double* noalias nocapture %out, double** noalias nocapture rea
 ; AVX512-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 8
 ; AVX512-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 16
 ; AVX512-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 24
-; AVX512-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER:%.*]], i64 [[TMP0]]
-; AVX512-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[TMP1]]
-; AVX512-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[TMP2]]
-; AVX512-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[TMP3]]
-; AVX512-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 0
-; AVX512-NEXT:    [[TMP9:%.*]] = bitcast i8* [[TMP8]] to <8 x i8>*
-; AVX512-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i8>, <8 x i8>* [[TMP9]], align 1
-; AVX512-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 8
-; AVX512-NEXT:    [[TMP11:%.*]] = bitcast i8* [[TMP10]] to <8 x i8>*
-; AVX512-NEXT:    [[WIDE_LOAD1:%.*]] = load <8 x i8>, <8 x i8>* [[TMP11]], align 1
-; AVX512-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 16
-; AVX512-NEXT:    [[TMP13:%.*]] = bitcast i8* [[TMP12]] to <8 x i8>*
-; AVX512-NEXT:    [[WIDE_LOAD2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP13]], align 1
-; AVX512-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 24
-; AVX512-NEXT:    [[TMP15:%.*]] = bitcast i8* [[TMP14]] to <8 x i8>*
-; AVX512-NEXT:    [[WIDE_LOAD3:%.*]] = load <8 x i8>, <8 x i8>* [[TMP15]], align 1
-; AVX512-NEXT:    [[TMP16:%.*]] = and <8 x i8> [[WIDE_LOAD]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
-; AVX512-NEXT:    [[TMP17:%.*]] = and <8 x i8> [[WIDE_LOAD1]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
-; AVX512-NEXT:    [[TMP18:%.*]] = and <8 x i8> [[WIDE_LOAD2]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
-; AVX512-NEXT:    [[TMP19:%.*]] = and <8 x i8> [[WIDE_LOAD3]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
-; AVX512-NEXT:    [[TMP20:%.*]] = icmp eq <8 x i8> [[TMP16]], zeroinitializer
-; AVX512-NEXT:    [[TMP21:%.*]] = icmp eq <8 x i8> [[TMP17]], zeroinitializer
-; AVX512-NEXT:    [[TMP22:%.*]] = icmp eq <8 x i8> [[TMP18]], zeroinitializer
-; AVX512-NEXT:    [[TMP23:%.*]] = icmp eq <8 x i8> [[TMP19]], zeroinitializer
-; AVX512-NEXT:    [[TMP24:%.*]] = getelementptr double*, double** [[IN:%.*]], i64 [[TMP0]]
-; AVX512-NEXT:    [[TMP25:%.*]] = getelementptr double*, double** [[IN]], i64 [[TMP1]]
-; AVX512-NEXT:    [[TMP26:%.*]] = getelementptr double*, double** [[IN]], i64 [[TMP2]]
-; AVX512-NEXT:    [[TMP27:%.*]] = getelementptr double*, double** [[IN]], i64 [[TMP3]]
-; AVX512-NEXT:    [[TMP28:%.*]] = xor <8 x i1> [[TMP20]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
-; AVX512-NEXT:    [[TMP29:%.*]] = xor <8 x i1> [[TMP21]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
-; AVX512-NEXT:    [[TMP30:%.*]] = xor <8 x i1> [[TMP22]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
-; AVX512-NEXT:    [[TMP31:%.*]] = xor <8 x i1> [[TMP23]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
-; AVX512-NEXT:    [[TMP32:%.*]] = getelementptr double*, double** [[TMP24]], i32 0
-; AVX512-NEXT:    [[TMP33:%.*]] = bitcast double** [[TMP32]] to <8 x double*>*
-; AVX512-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <8 x double*> @llvm.masked.load.v8p0f64.p0v8p0f64(<8 x double*>* [[TMP33]], i32 8, <8 x i1> [[TMP28]], <8 x double*> poison)
-; AVX512-NEXT:    [[TMP34:%.*]] = getelementptr double*, double** [[TMP24]], i32 8
-; AVX512-NEXT:    [[TMP35:%.*]] = bitcast double** [[TMP34]] to <8 x double*>*
-; AVX512-NEXT:    [[WIDE_MASKED_LOAD4:%.*]] = call <8 x double*> @llvm.masked.load.v8p0f64.p0v8p0f64(<8 x double*>* [[TMP35]], i32 8, <8 x i1> [[TMP29]], <8 x double*> poison)
-; AVX512-NEXT:    [[TMP36:%.*]] = getelementptr double*, double** [[TMP24]], i32 16
-; AVX512-NEXT:    [[TMP37:%.*]] = bitcast double** [[TMP36]] to <8 x double*>*
-; AVX512-NEXT:    [[WIDE_MASKED_LOAD5:%.*]] = call <8 x double*> @llvm.masked.load.v8p0f64.p0v8p0f64(<8 x double*>* [[TMP37]], i32 8, <8 x i1> [[TMP30]], <8 x double*> poison)
-; AVX512-NEXT:    [[TMP38:%.*]] = getelementptr double*, double** [[TMP24]], i32 24
-; AVX512-NEXT:    [[TMP39:%.*]] = bitcast double** [[TMP38]] to <8 x double*>*
-; AVX512-NEXT:    [[WIDE_MASKED_LOAD6:%.*]] = call <8 x double*> @llvm.masked.load.v8p0f64.p0v8p0f64(<8 x double*>* [[TMP39]], i32 8, <8 x i1> [[TMP31]], <8 x double*> poison)
-; AVX512-NEXT:    [[TMP40:%.*]] = icmp eq <8 x double*> [[WIDE_MASKED_LOAD]], zeroinitializer
-; AVX512-NEXT:    [[TMP41:%.*]] = icmp eq <8 x double*> [[WIDE_MASKED_LOAD4]], zeroinitializer
-; AVX512-NEXT:    [[TMP42:%.*]] = icmp eq <8 x double*> [[WIDE_MASKED_LOAD5]], zeroinitializer
-; AVX512-NEXT:    [[TMP43:%.*]] = icmp eq <8 x double*> [[WIDE_MASKED_LOAD6]], zeroinitializer
-; AVX512-NEXT:    [[TMP44:%.*]] = getelementptr double, double* [[OUT:%.*]], i64 [[TMP0]]
-; AVX512-NEXT:    [[TMP45:%.*]] = getelementptr double, double* [[OUT]], i64 [[TMP1]]
-; AVX512-NEXT:    [[TMP46:%.*]] = getelementptr double, double* [[OUT]], i64 [[TMP2]]
-; AVX512-NEXT:    [[TMP47:%.*]] = getelementptr double, double* [[OUT]], i64 [[TMP3]]
-; AVX512-NEXT:    [[TMP48:%.*]] = xor <8 x i1> [[TMP40]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
-; AVX512-NEXT:    [[TMP49:%.*]] = xor <8 x i1> [[TMP41]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
-; AVX512-NEXT:    [[TMP50:%.*]] = xor <8 x i1> [[TMP42]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
-; AVX512-NEXT:    [[TMP51:%.*]] = xor <8 x i1> [[TMP43]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
-; AVX512-NEXT:    [[TMP52:%.*]] = select <8 x i1> [[TMP28]], <8 x i1> [[TMP48]], <8 x i1> zeroinitializer
-; AVX512-NEXT:    [[TMP53:%.*]] = select <8 x i1> [[TMP29]], <8 x i1> [[TMP49]], <8 x i1> zeroinitializer
-; AVX512-NEXT:    [[TMP54:%.*]] = select <8 x i1> [[TMP30]], <8 x i1> [[TMP50]], <8 x i1> zeroinitializer
-; AVX512-NEXT:    [[TMP55:%.*]] = select <8 x i1> [[TMP31]], <8 x i1> [[TMP51]], <8 x i1> zeroinitializer
-; AVX512-NEXT:    [[TMP56:%.*]] = getelementptr double, double* [[TMP44]], i32 0
-; AVX512-NEXT:    [[TMP57:%.*]] = bitcast double* [[TMP56]] to <8 x double>*
-; AVX512-NEXT:    call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <8 x double>* [[TMP57]], i32 8, <8 x i1> [[TMP52]])
-; AVX512-NEXT:    [[TMP58:%.*]] = getelementptr double, double* [[TMP44]], i32 8
-; AVX512-NEXT:    [[TMP59:%.*]] = bitcast double* [[TMP58]] to <8 x double>*
-; AVX512-NEXT:    call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <8 x double>* [[TMP59]], i32 8, <8 x i1> [[TMP53]])
-; AVX512-NEXT:    [[TMP60:%.*]] = getelementptr double, double* [[TMP44]], i32 16
-; AVX512-NEXT:    [[TMP61:%.*]] = bitcast double* [[TMP60]] to <8 x double>*
-; AVX512-NEXT:    call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <8 x double>* [[TMP61]], i32 8, <8 x i1> [[TMP54]])
-; AVX512-NEXT:    [[TMP62:%.*]] = getelementptr double, double* [[TMP44]], i32 24
-; AVX512-NEXT:    [[TMP63:%.*]] = bitcast double* [[TMP62]] to <8 x double>*
-; AVX512-NEXT:    call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <8 x double>* [[TMP63]], i32 8, <8 x i1> [[TMP55]])
+; AVX512-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER:%.*]], i64 [[TMP0]]
+; AVX512-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER]], i64 [[TMP1]]
+; AVX512-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER]], i64 [[TMP2]]
+; AVX512-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER]], i64 [[TMP3]]
+; AVX512-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
+; AVX512-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP8]], align 1
+; AVX512-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 8
+; AVX512-NEXT:    [[WIDE_LOAD1:%.*]] = load <8 x i8>, ptr [[TMP9]], align 1
+; AVX512-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 16
+; AVX512-NEXT:    [[WIDE_LOAD2:%.*]] = load <8 x i8>, ptr [[TMP10]], align 1
+; AVX512-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 24
+; AVX512-NEXT:    [[WIDE_LOAD3:%.*]] = load <8 x i8>, ptr [[TMP11]], align 1
+; AVX512-NEXT:    [[TMP12:%.*]] = and <8 x i8> [[WIDE_LOAD]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+; AVX512-NEXT:    [[TMP13:%.*]] = and <8 x i8> [[WIDE_LOAD1]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+; AVX512-NEXT:    [[TMP14:%.*]] = and <8 x i8> [[WIDE_LOAD2]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+; AVX512-NEXT:    [[TMP15:%.*]] = and <8 x i8> [[WIDE_LOAD3]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+; AVX512-NEXT:    [[TMP16:%.*]] = icmp eq <8 x i8> [[TMP12]], zeroinitializer
+; AVX512-NEXT:    [[TMP17:%.*]] = icmp eq <8 x i8> [[TMP13]], zeroinitializer
+; AVX512-NEXT:    [[TMP18:%.*]] = icmp eq <8 x i8> [[TMP14]], zeroinitializer
+; AVX512-NEXT:    [[TMP19:%.*]] = icmp eq <8 x i8> [[TMP15]], zeroinitializer
+; AVX512-NEXT:    [[TMP20:%.*]] = getelementptr ptr, ptr [[IN:%.*]], i64 [[TMP0]]
+; AVX512-NEXT:    [[TMP21:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[TMP1]]
+; AVX512-NEXT:    [[TMP22:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[TMP2]]
+; AVX512-NEXT:    [[TMP23:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[TMP3]]
+; AVX512-NEXT:    [[TMP24:%.*]] = xor <8 x i1> [[TMP16]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
+; AVX512-NEXT:    [[TMP25:%.*]] = xor <8 x i1> [[TMP17]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
+; AVX512-NEXT:    [[TMP26:%.*]] = xor <8 x i1> [[TMP18]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
+; AVX512-NEXT:    [[TMP27:%.*]] = xor <8 x i1> [[TMP19]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
+; AVX512-NEXT:    [[TMP28:%.*]] = getelementptr ptr, ptr [[TMP20]], i32 0
+; AVX512-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <8 x ptr> @llvm.masked.load.v8p0.p0(ptr [[TMP28]], i32 8, <8 x i1> [[TMP24]], <8 x ptr> poison)
+; AVX512-NEXT:    [[TMP29:%.*]] = getelementptr ptr, ptr [[TMP20]], i32 8
+; AVX512-NEXT:    [[WIDE_MASKED_LOAD4:%.*]] = call <8 x ptr> @llvm.masked.load.v8p0.p0(ptr [[TMP29]], i32 8, <8 x i1> [[TMP25]], <8 x ptr> poison)
+; AVX512-NEXT:    [[TMP30:%.*]] = getelementptr ptr, ptr [[TMP20]], i32 16
+; AVX512-NEXT:    [[WIDE_MASKED_LOAD5:%.*]] = call <8 x ptr> @llvm.masked.load.v8p0.p0(ptr [[TMP30]], i32 8, <8 x i1> [[TMP26]], <8 x ptr> poison)
+; AVX512-NEXT:    [[TMP31:%.*]] = getelementptr ptr, ptr [[TMP20]], i32 24
+; AVX512-NEXT:    [[WIDE_MASKED_LOAD6:%.*]] = call <8 x ptr> @llvm.masked.load.v8p0.p0(ptr [[TMP31]], i32 8, <8 x i1> [[TMP27]], <8 x ptr> poison)
+; AVX512-NEXT:    [[TMP32:%.*]] = icmp eq <8 x ptr> [[WIDE_MASKED_LOAD]], zeroinitializer
+; AVX512-NEXT:    [[TMP33:%.*]] = icmp eq <8 x ptr> [[WIDE_MASKED_LOAD4]], zeroinitializer
+; AVX512-NEXT:    [[TMP34:%.*]] = icmp eq <8 x ptr> [[WIDE_MASKED_LOAD5]], zeroinitializer
+; AVX512-NEXT:    [[TMP35:%.*]] = icmp eq <8 x ptr> [[WIDE_MASKED_LOAD6]], zeroinitializer
+; AVX512-NEXT:    [[TMP36:%.*]] = getelementptr double, ptr [[OUT:%.*]], i64 [[TMP0]]
+; AVX512-NEXT:    [[TMP37:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP1]]
+; AVX512-NEXT:    [[TMP38:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP2]]
+; AVX512-NEXT:    [[TMP39:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP3]]
+; AVX512-NEXT:    [[TMP40:%.*]] = xor <8 x i1> [[TMP32]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
+; AVX512-NEXT:    [[TMP41:%.*]] = xor <8 x i1> [[TMP33]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
+; AVX512-NEXT:    [[TMP42:%.*]] = xor <8 x i1> [[TMP34]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
+; AVX512-NEXT:    [[TMP43:%.*]] = xor <8 x i1> [[TMP35]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
+; AVX512-NEXT:    [[TMP44:%.*]] = select <8 x i1> [[TMP24]], <8 x i1> [[TMP40]], <8 x i1> zeroinitializer
+; AVX512-NEXT:    [[TMP45:%.*]] = select <8 x i1> [[TMP25]], <8 x i1> [[TMP41]], <8 x i1> zeroinitializer
+; AVX512-NEXT:    [[TMP46:%.*]] = select <8 x i1> [[TMP26]], <8 x i1> [[TMP42]], <8 x i1> zeroinitializer
+; AVX512-NEXT:    [[TMP47:%.*]] = select <8 x i1> [[TMP27]], <8 x i1> [[TMP43]], <8 x i1> zeroinitializer
+; AVX512-NEXT:    [[TMP48:%.*]] = getelementptr double, ptr [[TMP36]], i32 0
+; AVX512-NEXT:    call void @llvm.masked.store.v8f64.p0(<8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, ptr [[TMP48]], i32 8, <8 x i1> [[TMP44]])
+; AVX512-NEXT:    [[TMP49:%.*]] = getelementptr double, ptr [[TMP36]], i32 8
+; AVX512-NEXT:    call void @llvm.masked.store.v8f64.p0(<8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, ptr [[TMP49]], i32 8, <8 x i1> [[TMP45]])
+; AVX512-NEXT:    [[TMP50:%.*]] = getelementptr double, ptr [[TMP36]], i32 16
+; AVX512-NEXT:    call void @llvm.masked.store.v8f64.p0(<8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, ptr [[TMP50]], i32 8, <8 x i1> [[TMP46]])
+; AVX512-NEXT:    [[TMP51:%.*]] = getelementptr double, ptr [[TMP36]], i32 24
+; AVX512-NEXT:    call void @llvm.masked.store.v8f64.p0(<8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, ptr [[TMP51]], i32 8, <8 x i1> [[TMP47]])
 ; AVX512-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
-; AVX512-NEXT:    [[TMP64:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; AVX512-NEXT:    br i1 [[TMP64]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP41:![0-9]+]]
+; AVX512-NEXT:    [[TMP52:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; AVX512-NEXT:    br i1 [[TMP52]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP41:![0-9]+]]
 ; AVX512:       middle.block:
 ; AVX512-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
 ; AVX512-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
@@ -2157,19 +1953,19 @@ define void @foo7(double* noalias nocapture %out, double** noalias nocapture rea
 ; AVX512-NEXT:    br label [[FOR_BODY:%.*]]
 ; AVX512:       for.body:
 ; AVX512-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
-; AVX512-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[INDVARS_IV]]
-; AVX512-NEXT:    [[TMP65:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
-; AVX512-NEXT:    [[TMP66:%.*]] = and i8 [[TMP65]], 1
-; AVX512-NEXT:    [[TOBOOL:%.*]] = icmp eq i8 [[TMP66]], 0
+; AVX512-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER]], i64 [[INDVARS_IV]]
+; AVX512-NEXT:    [[TMP53:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; AVX512-NEXT:    [[TMP54:%.*]] = and i8 [[TMP53]], 1
+; AVX512-NEXT:    [[TOBOOL:%.*]] = icmp eq i8 [[TMP54]], 0
 ; AVX512-NEXT:    br i1 [[TOBOOL]], label [[FOR_INC]], label [[LAND_LHS_TRUE:%.*]]
 ; AVX512:       land.lhs.true:
-; AVX512-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds double*, double** [[IN]], i64 [[INDVARS_IV]]
-; AVX512-NEXT:    [[TMP67:%.*]] = load double*, double** [[ARRAYIDX2]], align 8
-; AVX512-NEXT:    [[CMP3:%.*]] = icmp eq double* [[TMP67]], null
+; AVX512-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds ptr, ptr [[IN]], i64 [[INDVARS_IV]]
+; AVX512-NEXT:    [[TMP55:%.*]] = load ptr, ptr [[ARRAYIDX2]], align 8
+; AVX512-NEXT:    [[CMP3:%.*]] = icmp eq ptr [[TMP55]], null
 ; AVX512-NEXT:    br i1 [[CMP3]], label [[FOR_INC]], label [[IF_THEN:%.*]]
 ; AVX512:       if.then:
-; AVX512-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV]]
-; AVX512-NEXT:    store double 5.000000e-01, double* [[ARRAYIDX5]], align 8
+; AVX512-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds double, ptr [[OUT]], i64 [[INDVARS_IV]]
+; AVX512-NEXT:    store double 5.000000e-01, ptr [[ARRAYIDX5]], align 8
 ; AVX512-NEXT:    br label [[FOR_INC]]
 ; AVX512:       for.inc:
 ; AVX512-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
@@ -2190,21 +1986,21 @@ for.body.preheader:                               ; preds = %entry
 
 for.body:                                         ; preds = %for.inc, %for.body.preheader
   %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
-  %arrayidx = getelementptr inbounds i8, i8* %trigger, i64 %indvars.iv
-  %0 = load i8, i8* %arrayidx, align 1
+  %arrayidx = getelementptr inbounds i8, ptr %trigger, i64 %indvars.iv
+  %0 = load i8, ptr %arrayidx, align 1
   %1 = and i8 %0, 1
   %tobool = icmp eq i8 %1, 0
   br i1 %tobool, label %for.inc, label %land.lhs.true
 
 land.lhs.true:                                    ; preds = %for.body
-  %arrayidx2 = getelementptr inbounds double*, double** %in, i64 %indvars.iv
-  %2 = load double*, double** %arrayidx2, align 8
-  %cmp3 = icmp eq double* %2, null
+  %arrayidx2 = getelementptr inbounds ptr, ptr %in, i64 %indvars.iv
+  %2 = load ptr, ptr %arrayidx2, align 8
+  %cmp3 = icmp eq ptr %2, null
   br i1 %cmp3, label %for.inc, label %if.then
 
 if.then:                                          ; preds = %land.lhs.true
-  %arrayidx5 = getelementptr inbounds double, double* %out, i64 %indvars.iv
-  store double 5.000000e-01, double* %arrayidx5, align 8
+  %arrayidx5 = getelementptr inbounds double, ptr %out, i64 %indvars.iv
+  store double 5.000000e-01, ptr %arrayidx5, align 8
   br label %for.inc
 
 for.inc:                                          ; preds = %land.lhs.true, %for.body, %if.then
@@ -2217,14 +2013,14 @@ for.end:                                          ; preds = %for.inc, %entry
 }
 
 ;typedef int (*fp)();
-;void foo8 (double* __restrict__  out, fp* __restrict__ in, bool * __restrict__ trigger, unsigned size) {
+;void foo8 (ptr __restrict__  out, fp* __restrict__ in, bool * __restrict__ trigger, unsigned size) {
 ;
 ;  for (unsigned i=0; i<size; i++)
 ;    if (trigger[i] && (in[i] != 0))
 ;      out[i] = (double) 0.5;
 ;}
 
-define void @foo8(double* noalias nocapture %out, i32 ()** noalias nocapture readonly %in, i8* noalias nocapture readonly %trigger, i32 %size) local_unnamed_addr #0 {
+define void @foo8(ptr noalias nocapture %out, ptr noalias nocapture readonly %in, ptr noalias nocapture readonly %trigger, i32 %size) local_unnamed_addr #0 {
 ; AVX1-LABEL: @foo8(
 ; AVX1-NEXT:  entry:
 ; AVX1-NEXT:    [[CMP5:%.*]] = icmp eq i32 [[SIZE:%.*]], 0
@@ -2243,81 +2039,69 @@ define void @foo8(double* noalias nocapture %out, i32 ()** noalias nocapture rea
 ; AVX1-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 4
 ; AVX1-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 8
 ; AVX1-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 12
-; AVX1-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER:%.*]], i64 [[TMP0]]
-; AVX1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[TMP1]]
-; AVX1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[TMP2]]
-; AVX1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[TMP3]]
-; AVX1-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 0
-; AVX1-NEXT:    [[TMP9:%.*]] = bitcast i8* [[TMP8]] to <4 x i8>*
-; AVX1-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP9]], align 1
-; AVX1-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 4
-; AVX1-NEXT:    [[TMP11:%.*]] = bitcast i8* [[TMP10]] to <4 x i8>*
-; AVX1-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i8>, <4 x i8>* [[TMP11]], align 1
-; AVX1-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 8
-; AVX1-NEXT:    [[TMP13:%.*]] = bitcast i8* [[TMP12]] to <4 x i8>*
-; AVX1-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP13]], align 1
-; AVX1-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 12
-; AVX1-NEXT:    [[TMP15:%.*]] = bitcast i8* [[TMP14]] to <4 x i8>*
-; AVX1-NEXT:    [[WIDE_LOAD3:%.*]] = load <4 x i8>, <4 x i8>* [[TMP15]], align 1
-; AVX1-NEXT:    [[TMP16:%.*]] = and <4 x i8> [[WIDE_LOAD]], <i8 1, i8 1, i8 1, i8 1>
-; AVX1-NEXT:    [[TMP17:%.*]] = and <4 x i8> [[WIDE_LOAD1]], <i8 1, i8 1, i8 1, i8 1>
-; AVX1-NEXT:    [[TMP18:%.*]] = and <4 x i8> [[WIDE_LOAD2]], <i8 1, i8 1, i8 1, i8 1>
-; AVX1-NEXT:    [[TMP19:%.*]] = and <4 x i8> [[WIDE_LOAD3]], <i8 1, i8 1, i8 1, i8 1>
-; AVX1-NEXT:    [[TMP20:%.*]] = icmp eq <4 x i8> [[TMP16]], zeroinitializer
-; AVX1-NEXT:    [[TMP21:%.*]] = icmp eq <4 x i8> [[TMP17]], zeroinitializer
-; AVX1-NEXT:    [[TMP22:%.*]] = icmp eq <4 x i8> [[TMP18]], zeroinitializer
-; AVX1-NEXT:    [[TMP23:%.*]] = icmp eq <4 x i8> [[TMP19]], zeroinitializer
-; AVX1-NEXT:    [[TMP24:%.*]] = getelementptr i32 ()*, i32 ()** [[IN:%.*]], i64 [[TMP0]]
-; AVX1-NEXT:    [[TMP25:%.*]] = getelementptr i32 ()*, i32 ()** [[IN]], i64 [[TMP1]]
-; AVX1-NEXT:    [[TMP26:%.*]] = getelementptr i32 ()*, i32 ()** [[IN]], i64 [[TMP2]]
-; AVX1-NEXT:    [[TMP27:%.*]] = getelementptr i32 ()*, i32 ()** [[IN]], i64 [[TMP3]]
-; AVX1-NEXT:    [[TMP28:%.*]] = xor <4 x i1> [[TMP20]], <i1 true, i1 true, i1 true, i1 true>
-; AVX1-NEXT:    [[TMP29:%.*]] = xor <4 x i1> [[TMP21]], <i1 true, i1 true, i1 true, i1 true>
-; AVX1-NEXT:    [[TMP30:%.*]] = xor <4 x i1> [[TMP22]], <i1 true, i1 true, i1 true, i1 true>
-; AVX1-NEXT:    [[TMP31:%.*]] = xor <4 x i1> [[TMP23]], <i1 true, i1 true, i1 true, i1 true>
-; AVX1-NEXT:    [[TMP32:%.*]] = getelementptr i32 ()*, i32 ()** [[TMP24]], i32 0
-; AVX1-NEXT:    [[TMP33:%.*]] = bitcast i32 ()** [[TMP32]] to <4 x i32 ()*>*
-; AVX1-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP33]], i32 8, <4 x i1> [[TMP28]], <4 x i32 ()*> poison)
-; AVX1-NEXT:    [[TMP34:%.*]] = getelementptr i32 ()*, i32 ()** [[TMP24]], i32 4
-; AVX1-NEXT:    [[TMP35:%.*]] = bitcast i32 ()** [[TMP34]] to <4 x i32 ()*>*
-; AVX1-NEXT:    [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP35]], i32 8, <4 x i1> [[TMP29]], <4 x i32 ()*> poison)
-; AVX1-NEXT:    [[TMP36:%.*]] = getelementptr i32 ()*, i32 ()** [[TMP24]], i32 8
-; AVX1-NEXT:    [[TMP37:%.*]] = bitcast i32 ()** [[TMP36]] to <4 x i32 ()*>*
-; AVX1-NEXT:    [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP37]], i32 8, <4 x i1> [[TMP30]], <4 x i32 ()*> poison)
-; AVX1-NEXT:    [[TMP38:%.*]] = getelementptr i32 ()*, i32 ()** [[TMP24]], i32 12
-; AVX1-NEXT:    [[TMP39:%.*]] = bitcast i32 ()** [[TMP38]] to <4 x i32 ()*>*
-; AVX1-NEXT:    [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP39]], i32 8, <4 x i1> [[TMP31]], <4 x i32 ()*> poison)
-; AVX1-NEXT:    [[TMP40:%.*]] = icmp eq <4 x i32 ()*> [[WIDE_MASKED_LOAD]], zeroinitializer
-; AVX1-NEXT:    [[TMP41:%.*]] = icmp eq <4 x i32 ()*> [[WIDE_MASKED_LOAD4]], zeroinitializer
-; AVX1-NEXT:    [[TMP42:%.*]] = icmp eq <4 x i32 ()*> [[WIDE_MASKED_LOAD5]], zeroinitializer
-; AVX1-NEXT:    [[TMP43:%.*]] = icmp eq <4 x i32 ()*> [[WIDE_MASKED_LOAD6]], zeroinitializer
-; AVX1-NEXT:    [[TMP44:%.*]] = getelementptr double, double* [[OUT:%.*]], i64 [[TMP0]]
-; AVX1-NEXT:    [[TMP45:%.*]] = getelementptr double, double* [[OUT]], i64 [[TMP1]]
-; AVX1-NEXT:    [[TMP46:%.*]] = getelementptr double, double* [[OUT]], i64 [[TMP2]]
-; AVX1-NEXT:    [[TMP47:%.*]] = getelementptr double, double* [[OUT]], i64 [[TMP3]]
-; AVX1-NEXT:    [[TMP48:%.*]] = xor <4 x i1> [[TMP40]], <i1 true, i1 true, i1 true, i1 true>
-; AVX1-NEXT:    [[TMP49:%.*]] = xor <4 x i1> [[TMP41]], <i1 true, i1 true, i1 true, i1 true>
-; AVX1-NEXT:    [[TMP50:%.*]] = xor <4 x i1> [[TMP42]], <i1 true, i1 true, i1 true, i1 true>
-; AVX1-NEXT:    [[TMP51:%.*]] = xor <4 x i1> [[TMP43]], <i1 true, i1 true, i1 true, i1 true>
-; AVX1-NEXT:    [[TMP52:%.*]] = select <4 x i1> [[TMP28]], <4 x i1> [[TMP48]], <4 x i1> zeroinitializer
-; AVX1-NEXT:    [[TMP53:%.*]] = select <4 x i1> [[TMP29]], <4 x i1> [[TMP49]], <4 x i1> zeroinitializer
-; AVX1-NEXT:    [[TMP54:%.*]] = select <4 x i1> [[TMP30]], <4 x i1> [[TMP50]], <4 x i1> zeroinitializer
-; AVX1-NEXT:    [[TMP55:%.*]] = select <4 x i1> [[TMP31]], <4 x i1> [[TMP51]], <4 x i1> zeroinitializer
-; AVX1-NEXT:    [[TMP56:%.*]] = getelementptr double, double* [[TMP44]], i32 0
-; AVX1-NEXT:    [[TMP57:%.*]] = bitcast double* [[TMP56]] to <4 x double>*
-; AVX1-NEXT:    call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP57]], i32 8, <4 x i1> [[TMP52]])
-; AVX1-NEXT:    [[TMP58:%.*]] = getelementptr double, double* [[TMP44]], i32 4
-; AVX1-NEXT:    [[TMP59:%.*]] = bitcast double* [[TMP58]] to <4 x double>*
-; AVX1-NEXT:    call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP59]], i32 8, <4 x i1> [[TMP53]])
-; AVX1-NEXT:    [[TMP60:%.*]] = getelementptr double, double* [[TMP44]], i32 8
-; AVX1-NEXT:    [[TMP61:%.*]] = bitcast double* [[TMP60]] to <4 x double>*
-; AVX1-NEXT:    call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP61]], i32 8, <4 x i1> [[TMP54]])
-; AVX1-NEXT:    [[TMP62:%.*]] = getelementptr double, double* [[TMP44]], i32 12
-; AVX1-NEXT:    [[TMP63:%.*]] = bitcast double* [[TMP62]] to <4 x double>*
-; AVX1-NEXT:    call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP63]], i32 8, <4 x i1> [[TMP55]])
+; AVX1-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER:%.*]], i64 [[TMP0]]
+; AVX1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER]], i64 [[TMP1]]
+; AVX1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER]], i64 [[TMP2]]
+; AVX1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER]], i64 [[TMP3]]
+; AVX1-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
+; AVX1-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP8]], align 1
+; AVX1-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 4
+; AVX1-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP9]], align 1
+; AVX1-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 8
+; AVX1-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP10]], align 1
+; AVX1-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 12
+; AVX1-NEXT:    [[WIDE_LOAD3:%.*]] = load <4 x i8>, ptr [[TMP11]], align 1
+; AVX1-NEXT:    [[TMP12:%.*]] = and <4 x i8> [[WIDE_LOAD]], <i8 1, i8 1, i8 1, i8 1>
+; AVX1-NEXT:    [[TMP13:%.*]] = and <4 x i8> [[WIDE_LOAD1]], <i8 1, i8 1, i8 1, i8 1>
+; AVX1-NEXT:    [[TMP14:%.*]] = and <4 x i8> [[WIDE_LOAD2]], <i8 1, i8 1, i8 1, i8 1>
+; AVX1-NEXT:    [[TMP15:%.*]] = and <4 x i8> [[WIDE_LOAD3]], <i8 1, i8 1, i8 1, i8 1>
+; AVX1-NEXT:    [[TMP16:%.*]] = icmp eq <4 x i8> [[TMP12]], zeroinitializer
+; AVX1-NEXT:    [[TMP17:%.*]] = icmp eq <4 x i8> [[TMP13]], zeroinitializer
+; AVX1-NEXT:    [[TMP18:%.*]] = icmp eq <4 x i8> [[TMP14]], zeroinitializer
+; AVX1-NEXT:    [[TMP19:%.*]] = icmp eq <4 x i8> [[TMP15]], zeroinitializer
+; AVX1-NEXT:    [[TMP20:%.*]] = getelementptr ptr, ptr [[IN:%.*]], i64 [[TMP0]]
+; AVX1-NEXT:    [[TMP21:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[TMP1]]
+; AVX1-NEXT:    [[TMP22:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[TMP2]]
+; AVX1-NEXT:    [[TMP23:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[TMP3]]
+; AVX1-NEXT:    [[TMP24:%.*]] = xor <4 x i1> [[TMP16]], <i1 true, i1 true, i1 true, i1 true>
+; AVX1-NEXT:    [[TMP25:%.*]] = xor <4 x i1> [[TMP17]], <i1 true, i1 true, i1 true, i1 true>
+; AVX1-NEXT:    [[TMP26:%.*]] = xor <4 x i1> [[TMP18]], <i1 true, i1 true, i1 true, i1 true>
+; AVX1-NEXT:    [[TMP27:%.*]] = xor <4 x i1> [[TMP19]], <i1 true, i1 true, i1 true, i1 true>
+; AVX1-NEXT:    [[TMP28:%.*]] = getelementptr ptr, ptr [[TMP20]], i32 0
+; AVX1-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP28]], i32 8, <4 x i1> [[TMP24]], <4 x ptr> poison)
+; AVX1-NEXT:    [[TMP29:%.*]] = getelementptr ptr, ptr [[TMP20]], i32 4
+; AVX1-NEXT:    [[WIDE_MASKED_LOAD4:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP29]], i32 8, <4 x i1> [[TMP25]], <4 x ptr> poison)
+; AVX1-NEXT:    [[TMP30:%.*]] = getelementptr ptr, ptr [[TMP20]], i32 8
+; AVX1-NEXT:    [[WIDE_MASKED_LOAD5:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP30]], i32 8, <4 x i1> [[TMP26]], <4 x ptr> poison)
+; AVX1-NEXT:    [[TMP31:%.*]] = getelementptr ptr, ptr [[TMP20]], i32 12
+; AVX1-NEXT:    [[WIDE_MASKED_LOAD6:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP31]], i32 8, <4 x i1> [[TMP27]], <4 x ptr> poison)
+; AVX1-NEXT:    [[TMP32:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD]], zeroinitializer
+; AVX1-NEXT:    [[TMP33:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD4]], zeroinitializer
+; AVX1-NEXT:    [[TMP34:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD5]], zeroinitializer
+; AVX1-NEXT:    [[TMP35:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD6]], zeroinitializer
+; AVX1-NEXT:    [[TMP36:%.*]] = getelementptr double, ptr [[OUT:%.*]], i64 [[TMP0]]
+; AVX1-NEXT:    [[TMP37:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP1]]
+; AVX1-NEXT:    [[TMP38:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP2]]
+; AVX1-NEXT:    [[TMP39:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP3]]
+; AVX1-NEXT:    [[TMP40:%.*]] = xor <4 x i1> [[TMP32]], <i1 true, i1 true, i1 true, i1 true>
+; AVX1-NEXT:    [[TMP41:%.*]] = xor <4 x i1> [[TMP33]], <i1 true, i1 true, i1 true, i1 true>
+; AVX1-NEXT:    [[TMP42:%.*]] = xor <4 x i1> [[TMP34]], <i1 true, i1 true, i1 true, i1 true>
+; AVX1-NEXT:    [[TMP43:%.*]] = xor <4 x i1> [[TMP35]], <i1 true, i1 true, i1 true, i1 true>
+; AVX1-NEXT:    [[TMP44:%.*]] = select <4 x i1> [[TMP24]], <4 x i1> [[TMP40]], <4 x i1> zeroinitializer
+; AVX1-NEXT:    [[TMP45:%.*]] = select <4 x i1> [[TMP25]], <4 x i1> [[TMP41]], <4 x i1> zeroinitializer
+; AVX1-NEXT:    [[TMP46:%.*]] = select <4 x i1> [[TMP26]], <4 x i1> [[TMP42]], <4 x i1> zeroinitializer
+; AVX1-NEXT:    [[TMP47:%.*]] = select <4 x i1> [[TMP27]], <4 x i1> [[TMP43]], <4 x i1> zeroinitializer
+; AVX1-NEXT:    [[TMP48:%.*]] = getelementptr double, ptr [[TMP36]], i32 0
+; AVX1-NEXT:    call void @llvm.masked.store.v4f64.p0(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, ptr [[TMP48]], i32 8, <4 x i1> [[TMP44]])
+; AVX1-NEXT:    [[TMP49:%.*]] = getelementptr double, ptr [[TMP36]], i32 4
+; AVX1-NEXT:    call void @llvm.masked.store.v4f64.p0(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, ptr [[TMP49]], i32 8, <4 x i1> [[TMP45]])
+; AVX1-NEXT:    [[TMP50:%.*]] = getelementptr double, ptr [[TMP36]], i32 8
+; AVX1-NEXT:    call void @llvm.masked.store.v4f64.p0(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, ptr [[TMP50]], i32 8, <4 x i1> [[TMP46]])
+; AVX1-NEXT:    [[TMP51:%.*]] = getelementptr double, ptr [[TMP36]], i32 12
+; AVX1-NEXT:    call void @llvm.masked.store.v4f64.p0(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, ptr [[TMP51]], i32 8, <4 x i1> [[TMP47]])
 ; AVX1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; AVX1-NEXT:    [[TMP64:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; AVX1-NEXT:    br i1 [[TMP64]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; AVX1-NEXT:    [[TMP52:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; AVX1-NEXT:    br i1 [[TMP52]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
 ; AVX1:       middle.block:
 ; AVX1-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
 ; AVX1-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
@@ -2326,19 +2110,19 @@ define void @foo8(double* noalias nocapture %out, i32 ()** noalias nocapture rea
 ; AVX1-NEXT:    br label [[FOR_BODY:%.*]]
 ; AVX1:       for.body:
 ; AVX1-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
-; AVX1-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[INDVARS_IV]]
-; AVX1-NEXT:    [[TMP65:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
-; AVX1-NEXT:    [[TMP66:%.*]] = and i8 [[TMP65]], 1
-; AVX1-NEXT:    [[TOBOOL:%.*]] = icmp eq i8 [[TMP66]], 0
+; AVX1-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER]], i64 [[INDVARS_IV]]
+; AVX1-NEXT:    [[TMP53:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; AVX1-NEXT:    [[TMP54:%.*]] = and i8 [[TMP53]], 1
+; AVX1-NEXT:    [[TOBOOL:%.*]] = icmp eq i8 [[TMP54]], 0
 ; AVX1-NEXT:    br i1 [[TOBOOL]], label [[FOR_INC]], label [[LAND_LHS_TRUE:%.*]]
 ; AVX1:       land.lhs.true:
-; AVX1-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN]], i64 [[INDVARS_IV]]
-; AVX1-NEXT:    [[TMP67:%.*]] = load i32 ()*, i32 ()** [[ARRAYIDX2]], align 8
-; AVX1-NEXT:    [[CMP3:%.*]] = icmp eq i32 ()* [[TMP67]], null
+; AVX1-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds ptr, ptr [[IN]], i64 [[INDVARS_IV]]
+; AVX1-NEXT:    [[TMP55:%.*]] = load ptr, ptr [[ARRAYIDX2]], align 8
+; AVX1-NEXT:    [[CMP3:%.*]] = icmp eq ptr [[TMP55]], null
 ; AVX1-NEXT:    br i1 [[CMP3]], label [[FOR_INC]], label [[IF_THEN:%.*]]
 ; AVX1:       if.then:
-; AVX1-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV]]
-; AVX1-NEXT:    store double 5.000000e-01, double* [[ARRAYIDX5]], align 8
+; AVX1-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds double, ptr [[OUT]], i64 [[INDVARS_IV]]
+; AVX1-NEXT:    store double 5.000000e-01, ptr [[ARRAYIDX5]], align 8
 ; AVX1-NEXT:    br label [[FOR_INC]]
 ; AVX1:       for.inc:
 ; AVX1-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
@@ -2367,81 +2151,69 @@ define void @foo8(double* noalias nocapture %out, i32 ()** noalias nocapture rea
 ; AVX2-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 4
 ; AVX2-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 8
 ; AVX2-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 12
-; AVX2-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER:%.*]], i64 [[TMP0]]
-; AVX2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[TMP1]]
-; AVX2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[TMP2]]
-; AVX2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[TMP3]]
-; AVX2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 0
-; AVX2-NEXT:    [[TMP9:%.*]] = bitcast i8* [[TMP8]] to <4 x i8>*
-; AVX2-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP9]], align 1
-; AVX2-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 4
-; AVX2-NEXT:    [[TMP11:%.*]] = bitcast i8* [[TMP10]] to <4 x i8>*
-; AVX2-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i8>, <4 x i8>* [[TMP11]], align 1
-; AVX2-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 8
-; AVX2-NEXT:    [[TMP13:%.*]] = bitcast i8* [[TMP12]] to <4 x i8>*
-; AVX2-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP13]], align 1
-; AVX2-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 12
-; AVX2-NEXT:    [[TMP15:%.*]] = bitcast i8* [[TMP14]] to <4 x i8>*
-; AVX2-NEXT:    [[WIDE_LOAD3:%.*]] = load <4 x i8>, <4 x i8>* [[TMP15]], align 1
-; AVX2-NEXT:    [[TMP16:%.*]] = and <4 x i8> [[WIDE_LOAD]], <i8 1, i8 1, i8 1, i8 1>
-; AVX2-NEXT:    [[TMP17:%.*]] = and <4 x i8> [[WIDE_LOAD1]], <i8 1, i8 1, i8 1, i8 1>
-; AVX2-NEXT:    [[TMP18:%.*]] = and <4 x i8> [[WIDE_LOAD2]], <i8 1, i8 1, i8 1, i8 1>
-; AVX2-NEXT:    [[TMP19:%.*]] = and <4 x i8> [[WIDE_LOAD3]], <i8 1, i8 1, i8 1, i8 1>
-; AVX2-NEXT:    [[TMP20:%.*]] = icmp eq <4 x i8> [[TMP16]], zeroinitializer
-; AVX2-NEXT:    [[TMP21:%.*]] = icmp eq <4 x i8> [[TMP17]], zeroinitializer
-; AVX2-NEXT:    [[TMP22:%.*]] = icmp eq <4 x i8> [[TMP18]], zeroinitializer
-; AVX2-NEXT:    [[TMP23:%.*]] = icmp eq <4 x i8> [[TMP19]], zeroinitializer
-; AVX2-NEXT:    [[TMP24:%.*]] = getelementptr i32 ()*, i32 ()** [[IN:%.*]], i64 [[TMP0]]
-; AVX2-NEXT:    [[TMP25:%.*]] = getelementptr i32 ()*, i32 ()** [[IN]], i64 [[TMP1]]
-; AVX2-NEXT:    [[TMP26:%.*]] = getelementptr i32 ()*, i32 ()** [[IN]], i64 [[TMP2]]
-; AVX2-NEXT:    [[TMP27:%.*]] = getelementptr i32 ()*, i32 ()** [[IN]], i64 [[TMP3]]
-; AVX2-NEXT:    [[TMP28:%.*]] = xor <4 x i1> [[TMP20]], <i1 true, i1 true, i1 true, i1 true>
-; AVX2-NEXT:    [[TMP29:%.*]] = xor <4 x i1> [[TMP21]], <i1 true, i1 true, i1 true, i1 true>
-; AVX2-NEXT:    [[TMP30:%.*]] = xor <4 x i1> [[TMP22]], <i1 true, i1 true, i1 true, i1 true>
-; AVX2-NEXT:    [[TMP31:%.*]] = xor <4 x i1> [[TMP23]], <i1 true, i1 true, i1 true, i1 true>
-; AVX2-NEXT:    [[TMP32:%.*]] = getelementptr i32 ()*, i32 ()** [[TMP24]], i32 0
-; AVX2-NEXT:    [[TMP33:%.*]] = bitcast i32 ()** [[TMP32]] to <4 x i32 ()*>*
-; AVX2-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP33]], i32 8, <4 x i1> [[TMP28]], <4 x i32 ()*> poison)
-; AVX2-NEXT:    [[TMP34:%.*]] = getelementptr i32 ()*, i32 ()** [[TMP24]], i32 4
-; AVX2-NEXT:    [[TMP35:%.*]] = bitcast i32 ()** [[TMP34]] to <4 x i32 ()*>*
-; AVX2-NEXT:    [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP35]], i32 8, <4 x i1> [[TMP29]], <4 x i32 ()*> poison)
-; AVX2-NEXT:    [[TMP36:%.*]] = getelementptr i32 ()*, i32 ()** [[TMP24]], i32 8
-; AVX2-NEXT:    [[TMP37:%.*]] = bitcast i32 ()** [[TMP36]] to <4 x i32 ()*>*
-; AVX2-NEXT:    [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP37]], i32 8, <4 x i1> [[TMP30]], <4 x i32 ()*> poison)
-; AVX2-NEXT:    [[TMP38:%.*]] = getelementptr i32 ()*, i32 ()** [[TMP24]], i32 12
-; AVX2-NEXT:    [[TMP39:%.*]] = bitcast i32 ()** [[TMP38]] to <4 x i32 ()*>*
-; AVX2-NEXT:    [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP39]], i32 8, <4 x i1> [[TMP31]], <4 x i32 ()*> poison)
-; AVX2-NEXT:    [[TMP40:%.*]] = icmp eq <4 x i32 ()*> [[WIDE_MASKED_LOAD]], zeroinitializer
-; AVX2-NEXT:    [[TMP41:%.*]] = icmp eq <4 x i32 ()*> [[WIDE_MASKED_LOAD4]], zeroinitializer
-; AVX2-NEXT:    [[TMP42:%.*]] = icmp eq <4 x i32 ()*> [[WIDE_MASKED_LOAD5]], zeroinitializer
-; AVX2-NEXT:    [[TMP43:%.*]] = icmp eq <4 x i32 ()*> [[WIDE_MASKED_LOAD6]], zeroinitializer
-; AVX2-NEXT:    [[TMP44:%.*]] = getelementptr double, double* [[OUT:%.*]], i64 [[TMP0]]
-; AVX2-NEXT:    [[TMP45:%.*]] = getelementptr double, double* [[OUT]], i64 [[TMP1]]
-; AVX2-NEXT:    [[TMP46:%.*]] = getelementptr double, double* [[OUT]], i64 [[TMP2]]
-; AVX2-NEXT:    [[TMP47:%.*]] = getelementptr double, double* [[OUT]], i64 [[TMP3]]
-; AVX2-NEXT:    [[TMP48:%.*]] = xor <4 x i1> [[TMP40]], <i1 true, i1 true, i1 true, i1 true>
-; AVX2-NEXT:    [[TMP49:%.*]] = xor <4 x i1> [[TMP41]], <i1 true, i1 true, i1 true, i1 true>
-; AVX2-NEXT:    [[TMP50:%.*]] = xor <4 x i1> [[TMP42]], <i1 true, i1 true, i1 true, i1 true>
-; AVX2-NEXT:    [[TMP51:%.*]] = xor <4 x i1> [[TMP43]], <i1 true, i1 true, i1 true, i1 true>
-; AVX2-NEXT:    [[TMP52:%.*]] = select <4 x i1> [[TMP28]], <4 x i1> [[TMP48]], <4 x i1> zeroinitializer
-; AVX2-NEXT:    [[TMP53:%.*]] = select <4 x i1> [[TMP29]], <4 x i1> [[TMP49]], <4 x i1> zeroinitializer
-; AVX2-NEXT:    [[TMP54:%.*]] = select <4 x i1> [[TMP30]], <4 x i1> [[TMP50]], <4 x i1> zeroinitializer
-; AVX2-NEXT:    [[TMP55:%.*]] = select <4 x i1> [[TMP31]], <4 x i1> [[TMP51]], <4 x i1> zeroinitializer
-; AVX2-NEXT:    [[TMP56:%.*]] = getelementptr double, double* [[TMP44]], i32 0
-; AVX2-NEXT:    [[TMP57:%.*]] = bitcast double* [[TMP56]] to <4 x double>*
-; AVX2-NEXT:    call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP57]], i32 8, <4 x i1> [[TMP52]])
-; AVX2-NEXT:    [[TMP58:%.*]] = getelementptr double, double* [[TMP44]], i32 4
-; AVX2-NEXT:    [[TMP59:%.*]] = bitcast double* [[TMP58]] to <4 x double>*
-; AVX2-NEXT:    call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP59]], i32 8, <4 x i1> [[TMP53]])
-; AVX2-NEXT:    [[TMP60:%.*]] = getelementptr double, double* [[TMP44]], i32 8
-; AVX2-NEXT:    [[TMP61:%.*]] = bitcast double* [[TMP60]] to <4 x double>*
-; AVX2-NEXT:    call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP61]], i32 8, <4 x i1> [[TMP54]])
-; AVX2-NEXT:    [[TMP62:%.*]] = getelementptr double, double* [[TMP44]], i32 12
-; AVX2-NEXT:    [[TMP63:%.*]] = bitcast double* [[TMP62]] to <4 x double>*
-; AVX2-NEXT:    call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP63]], i32 8, <4 x i1> [[TMP55]])
+; AVX2-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER:%.*]], i64 [[TMP0]]
+; AVX2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER]], i64 [[TMP1]]
+; AVX2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER]], i64 [[TMP2]]
+; AVX2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER]], i64 [[TMP3]]
+; AVX2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
+; AVX2-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP8]], align 1
+; AVX2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 4
+; AVX2-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP9]], align 1
+; AVX2-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 8
+; AVX2-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP10]], align 1
+; AVX2-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 12
+; AVX2-NEXT:    [[WIDE_LOAD3:%.*]] = load <4 x i8>, ptr [[TMP11]], align 1
+; AVX2-NEXT:    [[TMP12:%.*]] = and <4 x i8> [[WIDE_LOAD]], <i8 1, i8 1, i8 1, i8 1>
+; AVX2-NEXT:    [[TMP13:%.*]] = and <4 x i8> [[WIDE_LOAD1]], <i8 1, i8 1, i8 1, i8 1>
+; AVX2-NEXT:    [[TMP14:%.*]] = and <4 x i8> [[WIDE_LOAD2]], <i8 1, i8 1, i8 1, i8 1>
+; AVX2-NEXT:    [[TMP15:%.*]] = and <4 x i8> [[WIDE_LOAD3]], <i8 1, i8 1, i8 1, i8 1>
+; AVX2-NEXT:    [[TMP16:%.*]] = icmp eq <4 x i8> [[TMP12]], zeroinitializer
+; AVX2-NEXT:    [[TMP17:%.*]] = icmp eq <4 x i8> [[TMP13]], zeroinitializer
+; AVX2-NEXT:    [[TMP18:%.*]] = icmp eq <4 x i8> [[TMP14]], zeroinitializer
+; AVX2-NEXT:    [[TMP19:%.*]] = icmp eq <4 x i8> [[TMP15]], zeroinitializer
+; AVX2-NEXT:    [[TMP20:%.*]] = getelementptr ptr, ptr [[IN:%.*]], i64 [[TMP0]]
+; AVX2-NEXT:    [[TMP21:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[TMP1]]
+; AVX2-NEXT:    [[TMP22:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[TMP2]]
+; AVX2-NEXT:    [[TMP23:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[TMP3]]
+; AVX2-NEXT:    [[TMP24:%.*]] = xor <4 x i1> [[TMP16]], <i1 true, i1 true, i1 true, i1 true>
+; AVX2-NEXT:    [[TMP25:%.*]] = xor <4 x i1> [[TMP17]], <i1 true, i1 true, i1 true, i1 true>
+; AVX2-NEXT:    [[TMP26:%.*]] = xor <4 x i1> [[TMP18]], <i1 true, i1 true, i1 true, i1 true>
+; AVX2-NEXT:    [[TMP27:%.*]] = xor <4 x i1> [[TMP19]], <i1 true, i1 true, i1 true, i1 true>
+; AVX2-NEXT:    [[TMP28:%.*]] = getelementptr ptr, ptr [[TMP20]], i32 0
+; AVX2-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP28]], i32 8, <4 x i1> [[TMP24]], <4 x ptr> poison)
+; AVX2-NEXT:    [[TMP29:%.*]] = getelementptr ptr, ptr [[TMP20]], i32 4
+; AVX2-NEXT:    [[WIDE_MASKED_LOAD4:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP29]], i32 8, <4 x i1> [[TMP25]], <4 x ptr> poison)
+; AVX2-NEXT:    [[TMP30:%.*]] = getelementptr ptr, ptr [[TMP20]], i32 8
+; AVX2-NEXT:    [[WIDE_MASKED_LOAD5:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP30]], i32 8, <4 x i1> [[TMP26]], <4 x ptr> poison)
+; AVX2-NEXT:    [[TMP31:%.*]] = getelementptr ptr, ptr [[TMP20]], i32 12
+; AVX2-NEXT:    [[WIDE_MASKED_LOAD6:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP31]], i32 8, <4 x i1> [[TMP27]], <4 x ptr> poison)
+; AVX2-NEXT:    [[TMP32:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD]], zeroinitializer
+; AVX2-NEXT:    [[TMP33:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD4]], zeroinitializer
+; AVX2-NEXT:    [[TMP34:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD5]], zeroinitializer
+; AVX2-NEXT:    [[TMP35:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD6]], zeroinitializer
+; AVX2-NEXT:    [[TMP36:%.*]] = getelementptr double, ptr [[OUT:%.*]], i64 [[TMP0]]
+; AVX2-NEXT:    [[TMP37:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP1]]
+; AVX2-NEXT:    [[TMP38:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP2]]
+; AVX2-NEXT:    [[TMP39:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP3]]
+; AVX2-NEXT:    [[TMP40:%.*]] = xor <4 x i1> [[TMP32]], <i1 true, i1 true, i1 true, i1 true>
+; AVX2-NEXT:    [[TMP41:%.*]] = xor <4 x i1> [[TMP33]], <i1 true, i1 true, i1 true, i1 true>
+; AVX2-NEXT:    [[TMP42:%.*]] = xor <4 x i1> [[TMP34]], <i1 true, i1 true, i1 true, i1 true>
+; AVX2-NEXT:    [[TMP43:%.*]] = xor <4 x i1> [[TMP35]], <i1 true, i1 true, i1 true, i1 true>
+; AVX2-NEXT:    [[TMP44:%.*]] = select <4 x i1> [[TMP24]], <4 x i1> [[TMP40]], <4 x i1> zeroinitializer
+; AVX2-NEXT:    [[TMP45:%.*]] = select <4 x i1> [[TMP25]], <4 x i1> [[TMP41]], <4 x i1> zeroinitializer
+; AVX2-NEXT:    [[TMP46:%.*]] = select <4 x i1> [[TMP26]], <4 x i1> [[TMP42]], <4 x i1> zeroinitializer
+; AVX2-NEXT:    [[TMP47:%.*]] = select <4 x i1> [[TMP27]], <4 x i1> [[TMP43]], <4 x i1> zeroinitializer
+; AVX2-NEXT:    [[TMP48:%.*]] = getelementptr double, ptr [[TMP36]], i32 0
+; AVX2-NEXT:    call void @llvm.masked.store.v4f64.p0(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, ptr [[TMP48]], i32 8, <4 x i1> [[TMP44]])
+; AVX2-NEXT:    [[TMP49:%.*]] = getelementptr double, ptr [[TMP36]], i32 4
+; AVX2-NEXT:    call void @llvm.masked.store.v4f64.p0(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, ptr [[TMP49]], i32 8, <4 x i1> [[TMP45]])
+; AVX2-NEXT:    [[TMP50:%.*]] = getelementptr double, ptr [[TMP36]], i32 8
+; AVX2-NEXT:    call void @llvm.masked.store.v4f64.p0(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, ptr [[TMP50]], i32 8, <4 x i1> [[TMP46]])
+; AVX2-NEXT:    [[TMP51:%.*]] = getelementptr double, ptr [[TMP36]], i32 12
+; AVX2-NEXT:    call void @llvm.masked.store.v4f64.p0(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, ptr [[TMP51]], i32 8, <4 x i1> [[TMP47]])
 ; AVX2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; AVX2-NEXT:    [[TMP64:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; AVX2-NEXT:    br i1 [[TMP64]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]]
+; AVX2-NEXT:    [[TMP52:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; AVX2-NEXT:    br i1 [[TMP52]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]]
 ; AVX2:       middle.block:
 ; AVX2-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
 ; AVX2-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
@@ -2450,19 +2222,19 @@ define void @foo8(double* noalias nocapture %out, i32 ()** noalias nocapture rea
 ; AVX2-NEXT:    br label [[FOR_BODY:%.*]]
 ; AVX2:       for.body:
 ; AVX2-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
-; AVX2-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[INDVARS_IV]]
-; AVX2-NEXT:    [[TMP65:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
-; AVX2-NEXT:    [[TMP66:%.*]] = and i8 [[TMP65]], 1
-; AVX2-NEXT:    [[TOBOOL:%.*]] = icmp eq i8 [[TMP66]], 0
+; AVX2-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER]], i64 [[INDVARS_IV]]
+; AVX2-NEXT:    [[TMP53:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; AVX2-NEXT:    [[TMP54:%.*]] = and i8 [[TMP53]], 1
+; AVX2-NEXT:    [[TOBOOL:%.*]] = icmp eq i8 [[TMP54]], 0
 ; AVX2-NEXT:    br i1 [[TOBOOL]], label [[FOR_INC]], label [[LAND_LHS_TRUE:%.*]]
 ; AVX2:       land.lhs.true:
-; AVX2-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN]], i64 [[INDVARS_IV]]
-; AVX2-NEXT:    [[TMP67:%.*]] = load i32 ()*, i32 ()** [[ARRAYIDX2]], align 8
-; AVX2-NEXT:    [[CMP3:%.*]] = icmp eq i32 ()* [[TMP67]], null
+; AVX2-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds ptr, ptr [[IN]], i64 [[INDVARS_IV]]
+; AVX2-NEXT:    [[TMP55:%.*]] = load ptr, ptr [[ARRAYIDX2]], align 8
+; AVX2-NEXT:    [[CMP3:%.*]] = icmp eq ptr [[TMP55]], null
 ; AVX2-NEXT:    br i1 [[CMP3]], label [[FOR_INC]], label [[IF_THEN:%.*]]
 ; AVX2:       if.then:
-; AVX2-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV]]
-; AVX2-NEXT:    store double 5.000000e-01, double* [[ARRAYIDX5]], align 8
+; AVX2-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds double, ptr [[OUT]], i64 [[INDVARS_IV]]
+; AVX2-NEXT:    store double 5.000000e-01, ptr [[ARRAYIDX5]], align 8
 ; AVX2-NEXT:    br label [[FOR_INC]]
 ; AVX2:       for.inc:
 ; AVX2-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
@@ -2491,81 +2263,69 @@ define void @foo8(double* noalias nocapture %out, i32 ()** noalias nocapture rea
 ; AVX512-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 8
 ; AVX512-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 16
 ; AVX512-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 24
-; AVX512-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER:%.*]], i64 [[TMP0]]
-; AVX512-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[TMP1]]
-; AVX512-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[TMP2]]
-; AVX512-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[TMP3]]
-; AVX512-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 0
-; AVX512-NEXT:    [[TMP9:%.*]] = bitcast i8* [[TMP8]] to <8 x i8>*
-; AVX512-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i8>, <8 x i8>* [[TMP9]], align 1
-; AVX512-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 8
-; AVX512-NEXT:    [[TMP11:%.*]] = bitcast i8* [[TMP10]] to <8 x i8>*
-; AVX512-NEXT:    [[WIDE_LOAD1:%.*]] = load <8 x i8>, <8 x i8>* [[TMP11]], align 1
-; AVX512-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 16
-; AVX512-NEXT:    [[TMP13:%.*]] = bitcast i8* [[TMP12]] to <8 x i8>*
-; AVX512-NEXT:    [[WIDE_LOAD2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP13]], align 1
-; AVX512-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 24
-; AVX512-NEXT:    [[TMP15:%.*]] = bitcast i8* [[TMP14]] to <8 x i8>*
-; AVX512-NEXT:    [[WIDE_LOAD3:%.*]] = load <8 x i8>, <8 x i8>* [[TMP15]], align 1
-; AVX512-NEXT:    [[TMP16:%.*]] = and <8 x i8> [[WIDE_LOAD]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
-; AVX512-NEXT:    [[TMP17:%.*]] = and <8 x i8> [[WIDE_LOAD1]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
-; AVX512-NEXT:    [[TMP18:%.*]] = and <8 x i8> [[WIDE_LOAD2]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
-; AVX512-NEXT:    [[TMP19:%.*]] = and <8 x i8> [[WIDE_LOAD3]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
-; AVX512-NEXT:    [[TMP20:%.*]] = icmp eq <8 x i8> [[TMP16]], zeroinitializer
-; AVX512-NEXT:    [[TMP21:%.*]] = icmp eq <8 x i8> [[TMP17]], zeroinitializer
-; AVX512-NEXT:    [[TMP22:%.*]] = icmp eq <8 x i8> [[TMP18]], zeroinitializer
-; AVX512-NEXT:    [[TMP23:%.*]] = icmp eq <8 x i8> [[TMP19]], zeroinitializer
-; AVX512-NEXT:    [[TMP24:%.*]] = getelementptr i32 ()*, i32 ()** [[IN:%.*]], i64 [[TMP0]]
-; AVX512-NEXT:    [[TMP25:%.*]] = getelementptr i32 ()*, i32 ()** [[IN]], i64 [[TMP1]]
-; AVX512-NEXT:    [[TMP26:%.*]] = getelementptr i32 ()*, i32 ()** [[IN]], i64 [[TMP2]]
-; AVX512-NEXT:    [[TMP27:%.*]] = getelementptr i32 ()*, i32 ()** [[IN]], i64 [[TMP3]]
-; AVX512-NEXT:    [[TMP28:%.*]] = xor <8 x i1> [[TMP20]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
-; AVX512-NEXT:    [[TMP29:%.*]] = xor <8 x i1> [[TMP21]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
-; AVX512-NEXT:    [[TMP30:%.*]] = xor <8 x i1> [[TMP22]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
-; AVX512-NEXT:    [[TMP31:%.*]] = xor <8 x i1> [[TMP23]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
-; AVX512-NEXT:    [[TMP32:%.*]] = getelementptr i32 ()*, i32 ()** [[TMP24]], i32 0
-; AVX512-NEXT:    [[TMP33:%.*]] = bitcast i32 ()** [[TMP32]] to <8 x i32 ()*>*
-; AVX512-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f(<8 x i32 ()*>* [[TMP33]], i32 8, <8 x i1> [[TMP28]], <8 x i32 ()*> poison)
-; AVX512-NEXT:    [[TMP34:%.*]] = getelementptr i32 ()*, i32 ()** [[TMP24]], i32 8
-; AVX512-NEXT:    [[TMP35:%.*]] = bitcast i32 ()** [[TMP34]] to <8 x i32 ()*>*
-; AVX512-NEXT:    [[WIDE_MASKED_LOAD4:%.*]] = call <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f(<8 x i32 ()*>* [[TMP35]], i32 8, <8 x i1> [[TMP29]], <8 x i32 ()*> poison)
-; AVX512-NEXT:    [[TMP36:%.*]] = getelementptr i32 ()*, i32 ()** [[TMP24]], i32 16
-; AVX512-NEXT:    [[TMP37:%.*]] = bitcast i32 ()** [[TMP36]] to <8 x i32 ()*>*
-; AVX512-NEXT:    [[WIDE_MASKED_LOAD5:%.*]] = call <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f(<8 x i32 ()*>* [[TMP37]], i32 8, <8 x i1> [[TMP30]], <8 x i32 ()*> poison)
-; AVX512-NEXT:    [[TMP38:%.*]] = getelementptr i32 ()*, i32 ()** [[TMP24]], i32 24
-; AVX512-NEXT:    [[TMP39:%.*]] = bitcast i32 ()** [[TMP38]] to <8 x i32 ()*>*
-; AVX512-NEXT:    [[WIDE_MASKED_LOAD6:%.*]] = call <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f(<8 x i32 ()*>* [[TMP39]], i32 8, <8 x i1> [[TMP31]], <8 x i32 ()*> poison)
-; AVX512-NEXT:    [[TMP40:%.*]] = icmp eq <8 x i32 ()*> [[WIDE_MASKED_LOAD]], zeroinitializer
-; AVX512-NEXT:    [[TMP41:%.*]] = icmp eq <8 x i32 ()*> [[WIDE_MASKED_LOAD4]], zeroinitializer
-; AVX512-NEXT:    [[TMP42:%.*]] = icmp eq <8 x i32 ()*> [[WIDE_MASKED_LOAD5]], zeroinitializer
-; AVX512-NEXT:    [[TMP43:%.*]] = icmp eq <8 x i32 ()*> [[WIDE_MASKED_LOAD6]], zeroinitializer
-; AVX512-NEXT:    [[TMP44:%.*]] = getelementptr double, double* [[OUT:%.*]], i64 [[TMP0]]
-; AVX512-NEXT:    [[TMP45:%.*]] = getelementptr double, double* [[OUT]], i64 [[TMP1]]
-; AVX512-NEXT:    [[TMP46:%.*]] = getelementptr double, double* [[OUT]], i64 [[TMP2]]
-; AVX512-NEXT:    [[TMP47:%.*]] = getelementptr double, double* [[OUT]], i64 [[TMP3]]
-; AVX512-NEXT:    [[TMP48:%.*]] = xor <8 x i1> [[TMP40]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
-; AVX512-NEXT:    [[TMP49:%.*]] = xor <8 x i1> [[TMP41]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
-; AVX512-NEXT:    [[TMP50:%.*]] = xor <8 x i1> [[TMP42]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
-; AVX512-NEXT:    [[TMP51:%.*]] = xor <8 x i1> [[TMP43]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
-; AVX512-NEXT:    [[TMP52:%.*]] = select <8 x i1> [[TMP28]], <8 x i1> [[TMP48]], <8 x i1> zeroinitializer
-; AVX512-NEXT:    [[TMP53:%.*]] = select <8 x i1> [[TMP29]], <8 x i1> [[TMP49]], <8 x i1> zeroinitializer
-; AVX512-NEXT:    [[TMP54:%.*]] = select <8 x i1> [[TMP30]], <8 x i1> [[TMP50]], <8 x i1> zeroinitializer
-; AVX512-NEXT:    [[TMP55:%.*]] = select <8 x i1> [[TMP31]], <8 x i1> [[TMP51]], <8 x i1> zeroinitializer
-; AVX512-NEXT:    [[TMP56:%.*]] = getelementptr double, double* [[TMP44]], i32 0
-; AVX512-NEXT:    [[TMP57:%.*]] = bitcast double* [[TMP56]] to <8 x double>*
-; AVX512-NEXT:    call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <8 x double>* [[TMP57]], i32 8, <8 x i1> [[TMP52]])
-; AVX512-NEXT:    [[TMP58:%.*]] = getelementptr double, double* [[TMP44]], i32 8
-; AVX512-NEXT:    [[TMP59:%.*]] = bitcast double* [[TMP58]] to <8 x double>*
-; AVX512-NEXT:    call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <8 x double>* [[TMP59]], i32 8, <8 x i1> [[TMP53]])
-; AVX512-NEXT:    [[TMP60:%.*]] = getelementptr double, double* [[TMP44]], i32 16
-; AVX512-NEXT:    [[TMP61:%.*]] = bitcast double* [[TMP60]] to <8 x double>*
-; AVX512-NEXT:    call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <8 x double>* [[TMP61]], i32 8, <8 x i1> [[TMP54]])
-; AVX512-NEXT:    [[TMP62:%.*]] = getelementptr double, double* [[TMP44]], i32 24
-; AVX512-NEXT:    [[TMP63:%.*]] = bitcast double* [[TMP62]] to <8 x double>*
-; AVX512-NEXT:    call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <8 x double>* [[TMP63]], i32 8, <8 x i1> [[TMP55]])
+; AVX512-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER:%.*]], i64 [[TMP0]]
+; AVX512-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER]], i64 [[TMP1]]
+; AVX512-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER]], i64 [[TMP2]]
+; AVX512-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER]], i64 [[TMP3]]
+; AVX512-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
+; AVX512-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP8]], align 1
+; AVX512-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 8
+; AVX512-NEXT:    [[WIDE_LOAD1:%.*]] = load <8 x i8>, ptr [[TMP9]], align 1
+; AVX512-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 16
+; AVX512-NEXT:    [[WIDE_LOAD2:%.*]] = load <8 x i8>, ptr [[TMP10]], align 1
+; AVX512-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 24
+; AVX512-NEXT:    [[WIDE_LOAD3:%.*]] = load <8 x i8>, ptr [[TMP11]], align 1
+; AVX512-NEXT:    [[TMP12:%.*]] = and <8 x i8> [[WIDE_LOAD]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+; AVX512-NEXT:    [[TMP13:%.*]] = and <8 x i8> [[WIDE_LOAD1]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+; AVX512-NEXT:    [[TMP14:%.*]] = and <8 x i8> [[WIDE_LOAD2]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+; AVX512-NEXT:    [[TMP15:%.*]] = and <8 x i8> [[WIDE_LOAD3]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+; AVX512-NEXT:    [[TMP16:%.*]] = icmp eq <8 x i8> [[TMP12]], zeroinitializer
+; AVX512-NEXT:    [[TMP17:%.*]] = icmp eq <8 x i8> [[TMP13]], zeroinitializer
+; AVX512-NEXT:    [[TMP18:%.*]] = icmp eq <8 x i8> [[TMP14]], zeroinitializer
+; AVX512-NEXT:    [[TMP19:%.*]] = icmp eq <8 x i8> [[TMP15]], zeroinitializer
+; AVX512-NEXT:    [[TMP20:%.*]] = getelementptr ptr, ptr [[IN:%.*]], i64 [[TMP0]]
+; AVX512-NEXT:    [[TMP21:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[TMP1]]
+; AVX512-NEXT:    [[TMP22:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[TMP2]]
+; AVX512-NEXT:    [[TMP23:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[TMP3]]
+; AVX512-NEXT:    [[TMP24:%.*]] = xor <8 x i1> [[TMP16]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
+; AVX512-NEXT:    [[TMP25:%.*]] = xor <8 x i1> [[TMP17]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
+; AVX512-NEXT:    [[TMP26:%.*]] = xor <8 x i1> [[TMP18]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
+; AVX512-NEXT:    [[TMP27:%.*]] = xor <8 x i1> [[TMP19]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
+; AVX512-NEXT:    [[TMP28:%.*]] = getelementptr ptr, ptr [[TMP20]], i32 0
+; AVX512-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <8 x ptr> @llvm.masked.load.v8p0.p0(ptr [[TMP28]], i32 8, <8 x i1> [[TMP24]], <8 x ptr> poison)
+; AVX512-NEXT:    [[TMP29:%.*]] = getelementptr ptr, ptr [[TMP20]], i32 8
+; AVX512-NEXT:    [[WIDE_MASKED_LOAD4:%.*]] = call <8 x ptr> @llvm.masked.load.v8p0.p0(ptr [[TMP29]], i32 8, <8 x i1> [[TMP25]], <8 x ptr> poison)
+; AVX512-NEXT:    [[TMP30:%.*]] = getelementptr ptr, ptr [[TMP20]], i32 16
+; AVX512-NEXT:    [[WIDE_MASKED_LOAD5:%.*]] = call <8 x ptr> @llvm.masked.load.v8p0.p0(ptr [[TMP30]], i32 8, <8 x i1> [[TMP26]], <8 x ptr> poison)
+; AVX512-NEXT:    [[TMP31:%.*]] = getelementptr ptr, ptr [[TMP20]], i32 24
+; AVX512-NEXT:    [[WIDE_MASKED_LOAD6:%.*]] = call <8 x ptr> @llvm.masked.load.v8p0.p0(ptr [[TMP31]], i32 8, <8 x i1> [[TMP27]], <8 x ptr> poison)
+; AVX512-NEXT:    [[TMP32:%.*]] = icmp eq <8 x ptr> [[WIDE_MASKED_LOAD]], zeroinitializer
+; AVX512-NEXT:    [[TMP33:%.*]] = icmp eq <8 x ptr> [[WIDE_MASKED_LOAD4]], zeroinitializer
+; AVX512-NEXT:    [[TMP34:%.*]] = icmp eq <8 x ptr> [[WIDE_MASKED_LOAD5]], zeroinitializer
+; AVX512-NEXT:    [[TMP35:%.*]] = icmp eq <8 x ptr> [[WIDE_MASKED_LOAD6]], zeroinitializer
+; AVX512-NEXT:    [[TMP36:%.*]] = getelementptr double, ptr [[OUT:%.*]], i64 [[TMP0]]
+; AVX512-NEXT:    [[TMP37:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP1]]
+; AVX512-NEXT:    [[TMP38:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP2]]
+; AVX512-NEXT:    [[TMP39:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP3]]
+; AVX512-NEXT:    [[TMP40:%.*]] = xor <8 x i1> [[TMP32]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
+; AVX512-NEXT:    [[TMP41:%.*]] = xor <8 x i1> [[TMP33]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
+; AVX512-NEXT:    [[TMP42:%.*]] = xor <8 x i1> [[TMP34]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
+; AVX512-NEXT:    [[TMP43:%.*]] = xor <8 x i1> [[TMP35]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
+; AVX512-NEXT:    [[TMP44:%.*]] = select <8 x i1> [[TMP24]], <8 x i1> [[TMP40]], <8 x i1> zeroinitializer
+; AVX512-NEXT:    [[TMP45:%.*]] = select <8 x i1> [[TMP25]], <8 x i1> [[TMP41]], <8 x i1> zeroinitializer
+; AVX512-NEXT:    [[TMP46:%.*]] = select <8 x i1> [[TMP26]], <8 x i1> [[TMP42]], <8 x i1> zeroinitializer
+; AVX512-NEXT:    [[TMP47:%.*]] = select <8 x i1> [[TMP27]], <8 x i1> [[TMP43]], <8 x i1> zeroinitializer
+; AVX512-NEXT:    [[TMP48:%.*]] = getelementptr double, ptr [[TMP36]], i32 0
+; AVX512-NEXT:    call void @llvm.masked.store.v8f64.p0(<8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, ptr [[TMP48]], i32 8, <8 x i1> [[TMP44]])
+; AVX512-NEXT:    [[TMP49:%.*]] = getelementptr double, ptr [[TMP36]], i32 8
+; AVX512-NEXT:    call void @llvm.masked.store.v8f64.p0(<8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, ptr [[TMP49]], i32 8, <8 x i1> [[TMP45]])
+; AVX512-NEXT:    [[TMP50:%.*]] = getelementptr double, ptr [[TMP36]], i32 16
+; AVX512-NEXT:    call void @llvm.masked.store.v8f64.p0(<8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, ptr [[TMP50]], i32 8, <8 x i1> [[TMP46]])
+; AVX512-NEXT:    [[TMP51:%.*]] = getelementptr double, ptr [[TMP36]], i32 24
+; AVX512-NEXT:    call void @llvm.masked.store.v8f64.p0(<8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, ptr [[TMP51]], i32 8, <8 x i1> [[TMP47]])
 ; AVX512-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
-; AVX512-NEXT:    [[TMP64:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; AVX512-NEXT:    br i1 [[TMP64]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP43:![0-9]+]]
+; AVX512-NEXT:    [[TMP52:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; AVX512-NEXT:    br i1 [[TMP52]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP43:![0-9]+]]
 ; AVX512:       middle.block:
 ; AVX512-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
 ; AVX512-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
@@ -2574,19 +2334,19 @@ define void @foo8(double* noalias nocapture %out, i32 ()** noalias nocapture rea
 ; AVX512-NEXT:    br label [[FOR_BODY:%.*]]
 ; AVX512:       for.body:
 ; AVX512-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
-; AVX512-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[INDVARS_IV]]
-; AVX512-NEXT:    [[TMP65:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
-; AVX512-NEXT:    [[TMP66:%.*]] = and i8 [[TMP65]], 1
-; AVX512-NEXT:    [[TOBOOL:%.*]] = icmp eq i8 [[TMP66]], 0
+; AVX512-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER]], i64 [[INDVARS_IV]]
+; AVX512-NEXT:    [[TMP53:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; AVX512-NEXT:    [[TMP54:%.*]] = and i8 [[TMP53]], 1
+; AVX512-NEXT:    [[TOBOOL:%.*]] = icmp eq i8 [[TMP54]], 0
 ; AVX512-NEXT:    br i1 [[TOBOOL]], label [[FOR_INC]], label [[LAND_LHS_TRUE:%.*]]
 ; AVX512:       land.lhs.true:
-; AVX512-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN]], i64 [[INDVARS_IV]]
-; AVX512-NEXT:    [[TMP67:%.*]] = load i32 ()*, i32 ()** [[ARRAYIDX2]], align 8
-; AVX512-NEXT:    [[CMP3:%.*]] = icmp eq i32 ()* [[TMP67]], null
+; AVX512-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds ptr, ptr [[IN]], i64 [[INDVARS_IV]]
+; AVX512-NEXT:    [[TMP55:%.*]] = load ptr, ptr [[ARRAYIDX2]], align 8
+; AVX512-NEXT:    [[CMP3:%.*]] = icmp eq ptr [[TMP55]], null
 ; AVX512-NEXT:    br i1 [[CMP3]], label [[FOR_INC]], label [[IF_THEN:%.*]]
 ; AVX512:       if.then:
-; AVX512-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV]]
-; AVX512-NEXT:    store double 5.000000e-01, double* [[ARRAYIDX5]], align 8
+; AVX512-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds double, ptr [[OUT]], i64 [[INDVARS_IV]]
+; AVX512-NEXT:    store double 5.000000e-01, ptr [[ARRAYIDX5]], align 8
 ; AVX512-NEXT:    br label [[FOR_INC]]
 ; AVX512:       for.inc:
 ; AVX512-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
@@ -2607,21 +2367,21 @@ for.body.preheader:                               ; preds = %entry
 
 for.body:                                         ; preds = %for.inc, %for.body.preheader
   %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
-  %arrayidx = getelementptr inbounds i8, i8* %trigger, i64 %indvars.iv
-  %0 = load i8, i8* %arrayidx, align 1
+  %arrayidx = getelementptr inbounds i8, ptr %trigger, i64 %indvars.iv
+  %0 = load i8, ptr %arrayidx, align 1
   %1 = and i8 %0, 1
   %tobool = icmp eq i8 %1, 0
   br i1 %tobool, label %for.inc, label %land.lhs.true
 
 land.lhs.true:                                    ; preds = %for.body
-  %arrayidx2 = getelementptr inbounds i32 ()*, i32 ()** %in, i64 %indvars.iv
-  %2 = load i32 ()*, i32 ()** %arrayidx2, align 8
-  %cmp3 = icmp eq i32 ()* %2, null
+  %arrayidx2 = getelementptr inbounds ptr, ptr %in, i64 %indvars.iv
+  %2 = load ptr, ptr %arrayidx2, align 8
+  %cmp3 = icmp eq ptr %2, null
   br i1 %cmp3, label %for.inc, label %if.then
 
 if.then:                                          ; preds = %land.lhs.true
-  %arrayidx5 = getelementptr inbounds double, double* %out, i64 %indvars.iv
-  store double 5.000000e-01, double* %arrayidx5, align 8
+  %arrayidx5 = getelementptr inbounds double, ptr %out, i64 %indvars.iv
+  store double 5.000000e-01, ptr %arrayidx5, align 8
   br label %for.inc
 
 for.inc:                                          ; preds = %land.lhs.true, %for.body, %if.then

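The rewrite applied throughout these diffs is mechanical: with opaque pointers,
every bitcast to a vector-pointer type feeding a load or store disappears, and
pointer-parameter intrinsic mangling shortens (e.g. @llvm.masked.store.v8f64.p0v8f64
becomes @llvm.masked.store.v8f64.p0). A minimal sketch of the before/after
pattern, assuming a hypothetical i32* value %p:

  ; typed pointers (before)
  %cast = bitcast i32* %p to <4 x i32>*
  %v = load <4 x i32>, <4 x i32>* %cast, align 4

  ; opaque pointers (after): the bitcast is gone, which is why the regenerated
  ; CHECK lines renumber every subsequent [[TMPn]] value
  %v = load <4 x i32>, ptr %p, align 4
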
diff --git a/llvm/test/Transforms/LoopVectorize/X86/metadata-enable.ll b/llvm/test/Transforms/LoopVectorize/X86/metadata-enable.ll
index 5a4bda6680e5..6a4ed2cd056a 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/metadata-enable.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/metadata-enable.ll
@@ -16,1580 +16,1164 @@
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
-define i32 @enabled(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32 %N) {
+define i32 @enabled(ptr noalias nocapture %a, ptr noalias nocapture readonly %b, i32 %N) {
 ; O1-LABEL: @enabled(
 ; O1-NEXT:  entry:
 ; O1-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[N:%.*]], i64 0
 ; O1-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; O1-NEXT:    [[TMP0:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>*
-; O1-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
-; O1-NEXT:    [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
-; O1-NEXT:    [[TMP2:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>*
-; O1-NEXT:    store <4 x i32> [[TMP1]], <4 x i32>* [[TMP2]], align 4
-; O1-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 4
-; O1-NEXT:    [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>*
-; O1-NEXT:    [[WIDE_LOAD_1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4
-; O1-NEXT:    [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT]]
-; O1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 4
-; O1-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
-; O1-NEXT:    store <4 x i32> [[TMP5]], <4 x i32>* [[TMP7]], align 4
-; O1-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 8
-; O1-NEXT:    [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>*
-; O1-NEXT:    [[WIDE_LOAD_2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4
-; O1-NEXT:    [[TMP10:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT]]
-; O1-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 8
-; O1-NEXT:    [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>*
-; O1-NEXT:    store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4
-; O1-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 12
-; O1-NEXT:    [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <4 x i32>*
-; O1-NEXT:    [[WIDE_LOAD_3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP14]], align 4
-; O1-NEXT:    [[TMP15:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT]]
-; O1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 12
-; O1-NEXT:    [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <4 x i32>*
-; O1-NEXT:    store <4 x i32> [[TMP15]], <4 x i32>* [[TMP17]], align 4
-; O1-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 16
-; O1-NEXT:    [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>*
-; O1-NEXT:    [[WIDE_LOAD_4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP19]], align 4
-; O1-NEXT:    [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT]]
-; O1-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 16
-; O1-NEXT:    [[TMP22:%.*]] = bitcast i32* [[TMP21]] to <4 x i32>*
-; O1-NEXT:    store <4 x i32> [[TMP20]], <4 x i32>* [[TMP22]], align 4
-; O1-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 20
-; O1-NEXT:    [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <4 x i32>*
-; O1-NEXT:    [[WIDE_LOAD_5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP24]], align 4
-; O1-NEXT:    [[TMP25:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT]]
-; O1-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 20
-; O1-NEXT:    [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>*
-; O1-NEXT:    store <4 x i32> [[TMP25]], <4 x i32>* [[TMP27]], align 4
-; O1-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 24
-; O1-NEXT:    [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>*
-; O1-NEXT:    [[WIDE_LOAD_6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP29]], align 4
-; O1-NEXT:    [[TMP30:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT]]
-; O1-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 24
-; O1-NEXT:    [[TMP32:%.*]] = bitcast i32* [[TMP31]] to <4 x i32>*
-; O1-NEXT:    store <4 x i32> [[TMP30]], <4 x i32>* [[TMP32]], align 4
-; O1-NEXT:    [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 28
-; O1-NEXT:    [[TMP34:%.*]] = bitcast i32* [[TMP33]] to <4 x i32>*
-; O1-NEXT:    [[WIDE_LOAD_7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP34]], align 4
-; O1-NEXT:    [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT]]
-; O1-NEXT:    [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 28
-; O1-NEXT:    [[TMP37:%.*]] = bitcast i32* [[TMP36]] to <4 x i32>*
-; O1-NEXT:    store <4 x i32> [[TMP35]], <4 x i32>* [[TMP37]], align 4
-; O1-NEXT:    [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 32
-; O1-NEXT:    [[TMP39:%.*]] = bitcast i32* [[TMP38]] to <4 x i32>*
-; O1-NEXT:    [[WIDE_LOAD_8:%.*]] = load <4 x i32>, <4 x i32>* [[TMP39]], align 4
-; O1-NEXT:    [[TMP40:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT]]
-; O1-NEXT:    [[TMP41:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 32
-; O1-NEXT:    [[TMP42:%.*]] = bitcast i32* [[TMP41]] to <4 x i32>*
-; O1-NEXT:    store <4 x i32> [[TMP40]], <4 x i32>* [[TMP42]], align 4
-; O1-NEXT:    [[TMP43:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 36
-; O1-NEXT:    [[TMP44:%.*]] = bitcast i32* [[TMP43]] to <4 x i32>*
-; O1-NEXT:    [[WIDE_LOAD_9:%.*]] = load <4 x i32>, <4 x i32>* [[TMP44]], align 4
-; O1-NEXT:    [[TMP45:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT]]
-; O1-NEXT:    [[TMP46:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 36
-; O1-NEXT:    [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <4 x i32>*
-; O1-NEXT:    store <4 x i32> [[TMP45]], <4 x i32>* [[TMP47]], align 4
-; O1-NEXT:    [[TMP48:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 40
-; O1-NEXT:    [[TMP49:%.*]] = bitcast i32* [[TMP48]] to <4 x i32>*
-; O1-NEXT:    [[WIDE_LOAD_10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP49]], align 4
-; O1-NEXT:    [[TMP50:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT]]
-; O1-NEXT:    [[TMP51:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 40
-; O1-NEXT:    [[TMP52:%.*]] = bitcast i32* [[TMP51]] to <4 x i32>*
-; O1-NEXT:    store <4 x i32> [[TMP50]], <4 x i32>* [[TMP52]], align 4
-; O1-NEXT:    [[TMP53:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 44
-; O1-NEXT:    [[TMP54:%.*]] = bitcast i32* [[TMP53]] to <4 x i32>*
-; O1-NEXT:    [[WIDE_LOAD_11:%.*]] = load <4 x i32>, <4 x i32>* [[TMP54]], align 4
-; O1-NEXT:    [[TMP55:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT]]
-; O1-NEXT:    [[TMP56:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 44
-; O1-NEXT:    [[TMP57:%.*]] = bitcast i32* [[TMP56]] to <4 x i32>*
-; O1-NEXT:    store <4 x i32> [[TMP55]], <4 x i32>* [[TMP57]], align 4
-; O1-NEXT:    [[TMP58:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 48
-; O1-NEXT:    [[TMP59:%.*]] = bitcast i32* [[TMP58]] to <4 x i32>*
-; O1-NEXT:    [[WIDE_LOAD_12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP59]], align 4
-; O1-NEXT:    [[TMP60:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT]]
-; O1-NEXT:    [[TMP61:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 48
-; O1-NEXT:    [[TMP62:%.*]] = bitcast i32* [[TMP61]] to <4 x i32>*
-; O1-NEXT:    store <4 x i32> [[TMP60]], <4 x i32>* [[TMP62]], align 4
-; O1-NEXT:    [[TMP63:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 52
-; O1-NEXT:    [[TMP64:%.*]] = bitcast i32* [[TMP63]] to <4 x i32>*
-; O1-NEXT:    [[WIDE_LOAD_13:%.*]] = load <4 x i32>, <4 x i32>* [[TMP64]], align 4
-; O1-NEXT:    [[TMP65:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT]]
-; O1-NEXT:    [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 52
-; O1-NEXT:    [[TMP67:%.*]] = bitcast i32* [[TMP66]] to <4 x i32>*
-; O1-NEXT:    store <4 x i32> [[TMP65]], <4 x i32>* [[TMP67]], align 4
-; O1-NEXT:    [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 56
-; O1-NEXT:    [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>*
-; O1-NEXT:    [[WIDE_LOAD_14:%.*]] = load <4 x i32>, <4 x i32>* [[TMP69]], align 4
-; O1-NEXT:    [[TMP70:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT]]
-; O1-NEXT:    [[TMP71:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 56
-; O1-NEXT:    [[TMP72:%.*]] = bitcast i32* [[TMP71]] to <4 x i32>*
-; O1-NEXT:    store <4 x i32> [[TMP70]], <4 x i32>* [[TMP72]], align 4
-; O1-NEXT:    [[TMP73:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 60
-; O1-NEXT:    [[TMP74:%.*]] = bitcast i32* [[TMP73]] to <4 x i32>*
-; O1-NEXT:    [[WIDE_LOAD_15:%.*]] = load <4 x i32>, <4 x i32>* [[TMP74]], align 4
-; O1-NEXT:    [[TMP75:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT]]
-; O1-NEXT:    [[TMP76:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 60
-; O1-NEXT:    [[TMP77:%.*]] = bitcast i32* [[TMP76]] to <4 x i32>*
-; O1-NEXT:    store <4 x i32> [[TMP75]], <4 x i32>* [[TMP77]], align 4
-; O1-NEXT:    [[TMP78:%.*]] = load i32, i32* [[A]], align 4
-; O1-NEXT:    ret i32 [[TMP78]]
+; O1-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[B:%.*]], align 4
+; O1-NEXT:    [[TMP0:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
+; O1-NEXT:    store <4 x i32> [[TMP0]], ptr [[A:%.*]], align 4
+; O1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 4
+; O1-NEXT:    [[WIDE_LOAD_1:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4
+; O1-NEXT:    [[TMP2:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT]]
+; O1-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 4
+; O1-NEXT:    store <4 x i32> [[TMP2]], ptr [[TMP3]], align 4
+; O1-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 8
+; O1-NEXT:    [[WIDE_LOAD_2:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4
+; O1-NEXT:    [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT]]
+; O1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 8
+; O1-NEXT:    store <4 x i32> [[TMP5]], ptr [[TMP6]], align 4
+; O1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 12
+; O1-NEXT:    [[WIDE_LOAD_3:%.*]] = load <4 x i32>, ptr [[TMP7]], align 4
+; O1-NEXT:    [[TMP8:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT]]
+; O1-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 12
+; O1-NEXT:    store <4 x i32> [[TMP8]], ptr [[TMP9]], align 4
+; O1-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 16
+; O1-NEXT:    [[WIDE_LOAD_4:%.*]] = load <4 x i32>, ptr [[TMP10]], align 4
+; O1-NEXT:    [[TMP11:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT]]
+; O1-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 16
+; O1-NEXT:    store <4 x i32> [[TMP11]], ptr [[TMP12]], align 4
+; O1-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 20
+; O1-NEXT:    [[WIDE_LOAD_5:%.*]] = load <4 x i32>, ptr [[TMP13]], align 4
+; O1-NEXT:    [[TMP14:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT]]
+; O1-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 20
+; O1-NEXT:    store <4 x i32> [[TMP14]], ptr [[TMP15]], align 4
+; O1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 24
+; O1-NEXT:    [[WIDE_LOAD_6:%.*]] = load <4 x i32>, ptr [[TMP16]], align 4
+; O1-NEXT:    [[TMP17:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT]]
+; O1-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 24
+; O1-NEXT:    store <4 x i32> [[TMP17]], ptr [[TMP18]], align 4
+; O1-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 28
+; O1-NEXT:    [[WIDE_LOAD_7:%.*]] = load <4 x i32>, ptr [[TMP19]], align 4
+; O1-NEXT:    [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT]]
+; O1-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 28
+; O1-NEXT:    store <4 x i32> [[TMP20]], ptr [[TMP21]], align 4
+; O1-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 32
+; O1-NEXT:    [[WIDE_LOAD_8:%.*]] = load <4 x i32>, ptr [[TMP22]], align 4
+; O1-NEXT:    [[TMP23:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT]]
+; O1-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 32
+; O1-NEXT:    store <4 x i32> [[TMP23]], ptr [[TMP24]], align 4
+; O1-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 36
+; O1-NEXT:    [[WIDE_LOAD_9:%.*]] = load <4 x i32>, ptr [[TMP25]], align 4
+; O1-NEXT:    [[TMP26:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT]]
+; O1-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 36
+; O1-NEXT:    store <4 x i32> [[TMP26]], ptr [[TMP27]], align 4
+; O1-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 40
+; O1-NEXT:    [[WIDE_LOAD_10:%.*]] = load <4 x i32>, ptr [[TMP28]], align 4
+; O1-NEXT:    [[TMP29:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT]]
+; O1-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 40
+; O1-NEXT:    store <4 x i32> [[TMP29]], ptr [[TMP30]], align 4
+; O1-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 44
+; O1-NEXT:    [[WIDE_LOAD_11:%.*]] = load <4 x i32>, ptr [[TMP31]], align 4
+; O1-NEXT:    [[TMP32:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT]]
+; O1-NEXT:    [[TMP33:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 44
+; O1-NEXT:    store <4 x i32> [[TMP32]], ptr [[TMP33]], align 4
+; O1-NEXT:    [[TMP34:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 48
+; O1-NEXT:    [[WIDE_LOAD_12:%.*]] = load <4 x i32>, ptr [[TMP34]], align 4
+; O1-NEXT:    [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT]]
+; O1-NEXT:    [[TMP36:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 48
+; O1-NEXT:    store <4 x i32> [[TMP35]], ptr [[TMP36]], align 4
+; O1-NEXT:    [[TMP37:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 52
+; O1-NEXT:    [[WIDE_LOAD_13:%.*]] = load <4 x i32>, ptr [[TMP37]], align 4
+; O1-NEXT:    [[TMP38:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT]]
+; O1-NEXT:    [[TMP39:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 52
+; O1-NEXT:    store <4 x i32> [[TMP38]], ptr [[TMP39]], align 4
+; O1-NEXT:    [[TMP40:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 56
+; O1-NEXT:    [[WIDE_LOAD_14:%.*]] = load <4 x i32>, ptr [[TMP40]], align 4
+; O1-NEXT:    [[TMP41:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT]]
+; O1-NEXT:    [[TMP42:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 56
+; O1-NEXT:    store <4 x i32> [[TMP41]], ptr [[TMP42]], align 4
+; O1-NEXT:    [[TMP43:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 60
+; O1-NEXT:    [[WIDE_LOAD_15:%.*]] = load <4 x i32>, ptr [[TMP43]], align 4
+; O1-NEXT:    [[TMP44:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT]]
+; O1-NEXT:    [[TMP45:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 60
+; O1-NEXT:    store <4 x i32> [[TMP44]], ptr [[TMP45]], align 4
+; O1-NEXT:    [[TMP46:%.*]] = load i32, ptr [[A]], align 4
+; O1-NEXT:    ret i32 [[TMP46]]
 ;
 ; O2-LABEL: @enabled(
 ; O2-NEXT:  entry:
 ; O2-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[N:%.*]], i64 0
 ; O2-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; O2-NEXT:    [[TMP0:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>*
-; O2-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
-; O2-NEXT:    [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
-; O2-NEXT:    [[TMP2:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>*
-; O2-NEXT:    store <4 x i32> [[TMP1]], <4 x i32>* [[TMP2]], align 4
-; O2-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 4
-; O2-NEXT:    [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>*
-; O2-NEXT:    [[WIDE_LOAD_1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4
-; O2-NEXT:    [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT]]
-; O2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 4
-; O2-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
-; O2-NEXT:    store <4 x i32> [[TMP5]], <4 x i32>* [[TMP7]], align 4
-; O2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 8
-; O2-NEXT:    [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>*
-; O2-NEXT:    [[WIDE_LOAD_2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4
-; O2-NEXT:    [[TMP10:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT]]
-; O2-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 8
-; O2-NEXT:    [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>*
-; O2-NEXT:    store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4
-; O2-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 12
-; O2-NEXT:    [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <4 x i32>*
-; O2-NEXT:    [[WIDE_LOAD_3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP14]], align 4
-; O2-NEXT:    [[TMP15:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT]]
-; O2-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 12
-; O2-NEXT:    [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <4 x i32>*
-; O2-NEXT:    store <4 x i32> [[TMP15]], <4 x i32>* [[TMP17]], align 4
-; O2-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 16
-; O2-NEXT:    [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>*
-; O2-NEXT:    [[WIDE_LOAD_4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP19]], align 4
-; O2-NEXT:    [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT]]
-; O2-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 16
-; O2-NEXT:    [[TMP22:%.*]] = bitcast i32* [[TMP21]] to <4 x i32>*
-; O2-NEXT:    store <4 x i32> [[TMP20]], <4 x i32>* [[TMP22]], align 4
-; O2-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 20
-; O2-NEXT:    [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <4 x i32>*
-; O2-NEXT:    [[WIDE_LOAD_5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP24]], align 4
-; O2-NEXT:    [[TMP25:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT]]
-; O2-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 20
-; O2-NEXT:    [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>*
-; O2-NEXT:    store <4 x i32> [[TMP25]], <4 x i32>* [[TMP27]], align 4
-; O2-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 24
-; O2-NEXT:    [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>*
-; O2-NEXT:    [[WIDE_LOAD_6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP29]], align 4
-; O2-NEXT:    [[TMP30:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT]]
-; O2-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 24
-; O2-NEXT:    [[TMP32:%.*]] = bitcast i32* [[TMP31]] to <4 x i32>*
-; O2-NEXT:    store <4 x i32> [[TMP30]], <4 x i32>* [[TMP32]], align 4
-; O2-NEXT:    [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 28
-; O2-NEXT:    [[TMP34:%.*]] = bitcast i32* [[TMP33]] to <4 x i32>*
-; O2-NEXT:    [[WIDE_LOAD_7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP34]], align 4
-; O2-NEXT:    [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT]]
-; O2-NEXT:    [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 28
-; O2-NEXT:    [[TMP37:%.*]] = bitcast i32* [[TMP36]] to <4 x i32>*
-; O2-NEXT:    store <4 x i32> [[TMP35]], <4 x i32>* [[TMP37]], align 4
-; O2-NEXT:    [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 32
-; O2-NEXT:    [[TMP39:%.*]] = bitcast i32* [[TMP38]] to <4 x i32>*
-; O2-NEXT:    [[WIDE_LOAD_8:%.*]] = load <4 x i32>, <4 x i32>* [[TMP39]], align 4
-; O2-NEXT:    [[TMP40:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT]]
-; O2-NEXT:    [[TMP41:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 32
-; O2-NEXT:    [[TMP42:%.*]] = bitcast i32* [[TMP41]] to <4 x i32>*
-; O2-NEXT:    store <4 x i32> [[TMP40]], <4 x i32>* [[TMP42]], align 4
-; O2-NEXT:    [[TMP43:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 36
-; O2-NEXT:    [[TMP44:%.*]] = bitcast i32* [[TMP43]] to <4 x i32>*
-; O2-NEXT:    [[WIDE_LOAD_9:%.*]] = load <4 x i32>, <4 x i32>* [[TMP44]], align 4
-; O2-NEXT:    [[TMP45:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT]]
-; O2-NEXT:    [[TMP46:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 36
-; O2-NEXT:    [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <4 x i32>*
-; O2-NEXT:    store <4 x i32> [[TMP45]], <4 x i32>* [[TMP47]], align 4
-; O2-NEXT:    [[TMP48:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 40
-; O2-NEXT:    [[TMP49:%.*]] = bitcast i32* [[TMP48]] to <4 x i32>*
-; O2-NEXT:    [[WIDE_LOAD_10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP49]], align 4
-; O2-NEXT:    [[TMP50:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT]]
-; O2-NEXT:    [[TMP51:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 40
-; O2-NEXT:    [[TMP52:%.*]] = bitcast i32* [[TMP51]] to <4 x i32>*
-; O2-NEXT:    store <4 x i32> [[TMP50]], <4 x i32>* [[TMP52]], align 4
-; O2-NEXT:    [[TMP53:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 44
-; O2-NEXT:    [[TMP54:%.*]] = bitcast i32* [[TMP53]] to <4 x i32>*
-; O2-NEXT:    [[WIDE_LOAD_11:%.*]] = load <4 x i32>, <4 x i32>* [[TMP54]], align 4
-; O2-NEXT:    [[TMP55:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT]]
-; O2-NEXT:    [[TMP56:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 44
-; O2-NEXT:    [[TMP57:%.*]] = bitcast i32* [[TMP56]] to <4 x i32>*
-; O2-NEXT:    store <4 x i32> [[TMP55]], <4 x i32>* [[TMP57]], align 4
-; O2-NEXT:    [[TMP58:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 48
-; O2-NEXT:    [[TMP59:%.*]] = bitcast i32* [[TMP58]] to <4 x i32>*
-; O2-NEXT:    [[WIDE_LOAD_12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP59]], align 4
-; O2-NEXT:    [[TMP60:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT]]
-; O2-NEXT:    [[TMP61:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 48
-; O2-NEXT:    [[TMP62:%.*]] = bitcast i32* [[TMP61]] to <4 x i32>*
-; O2-NEXT:    store <4 x i32> [[TMP60]], <4 x i32>* [[TMP62]], align 4
-; O2-NEXT:    [[TMP63:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 52
-; O2-NEXT:    [[TMP64:%.*]] = bitcast i32* [[TMP63]] to <4 x i32>*
-; O2-NEXT:    [[WIDE_LOAD_13:%.*]] = load <4 x i32>, <4 x i32>* [[TMP64]], align 4
-; O2-NEXT:    [[TMP65:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT]]
-; O2-NEXT:    [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 52
-; O2-NEXT:    [[TMP67:%.*]] = bitcast i32* [[TMP66]] to <4 x i32>*
-; O2-NEXT:    store <4 x i32> [[TMP65]], <4 x i32>* [[TMP67]], align 4
-; O2-NEXT:    [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 56
-; O2-NEXT:    [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>*
-; O2-NEXT:    [[WIDE_LOAD_14:%.*]] = load <4 x i32>, <4 x i32>* [[TMP69]], align 4
-; O2-NEXT:    [[TMP70:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT]]
-; O2-NEXT:    [[TMP71:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 56
-; O2-NEXT:    [[TMP72:%.*]] = bitcast i32* [[TMP71]] to <4 x i32>*
-; O2-NEXT:    store <4 x i32> [[TMP70]], <4 x i32>* [[TMP72]], align 4
-; O2-NEXT:    [[TMP73:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 60
-; O2-NEXT:    [[TMP74:%.*]] = bitcast i32* [[TMP73]] to <4 x i32>*
-; O2-NEXT:    [[WIDE_LOAD_15:%.*]] = load <4 x i32>, <4 x i32>* [[TMP74]], align 4
-; O2-NEXT:    [[TMP75:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT]]
-; O2-NEXT:    [[TMP76:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 60
-; O2-NEXT:    [[TMP77:%.*]] = bitcast i32* [[TMP76]] to <4 x i32>*
-; O2-NEXT:    store <4 x i32> [[TMP75]], <4 x i32>* [[TMP77]], align 4
-; O2-NEXT:    [[TMP78:%.*]] = load i32, i32* [[A]], align 4
-; O2-NEXT:    ret i32 [[TMP78]]
+; O2-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[B:%.*]], align 4
+; O2-NEXT:    [[TMP0:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
+; O2-NEXT:    store <4 x i32> [[TMP0]], ptr [[A:%.*]], align 4
+; O2-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 4
+; O2-NEXT:    [[WIDE_LOAD_1:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4
+; O2-NEXT:    [[TMP2:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT]]
+; O2-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 4
+; O2-NEXT:    store <4 x i32> [[TMP2]], ptr [[TMP3]], align 4
+; O2-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 8
+; O2-NEXT:    [[WIDE_LOAD_2:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4
+; O2-NEXT:    [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT]]
+; O2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 8
+; O2-NEXT:    store <4 x i32> [[TMP5]], ptr [[TMP6]], align 4
+; O2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 12
+; O2-NEXT:    [[WIDE_LOAD_3:%.*]] = load <4 x i32>, ptr [[TMP7]], align 4
+; O2-NEXT:    [[TMP8:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT]]
+; O2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 12
+; O2-NEXT:    store <4 x i32> [[TMP8]], ptr [[TMP9]], align 4
+; O2-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 16
+; O2-NEXT:    [[WIDE_LOAD_4:%.*]] = load <4 x i32>, ptr [[TMP10]], align 4
+; O2-NEXT:    [[TMP11:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT]]
+; O2-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 16
+; O2-NEXT:    store <4 x i32> [[TMP11]], ptr [[TMP12]], align 4
+; O2-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 20
+; O2-NEXT:    [[WIDE_LOAD_5:%.*]] = load <4 x i32>, ptr [[TMP13]], align 4
+; O2-NEXT:    [[TMP14:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT]]
+; O2-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 20
+; O2-NEXT:    store <4 x i32> [[TMP14]], ptr [[TMP15]], align 4
+; O2-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 24
+; O2-NEXT:    [[WIDE_LOAD_6:%.*]] = load <4 x i32>, ptr [[TMP16]], align 4
+; O2-NEXT:    [[TMP17:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT]]
+; O2-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 24
+; O2-NEXT:    store <4 x i32> [[TMP17]], ptr [[TMP18]], align 4
+; O2-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 28
+; O2-NEXT:    [[WIDE_LOAD_7:%.*]] = load <4 x i32>, ptr [[TMP19]], align 4
+; O2-NEXT:    [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT]]
+; O2-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 28
+; O2-NEXT:    store <4 x i32> [[TMP20]], ptr [[TMP21]], align 4
+; O2-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 32
+; O2-NEXT:    [[WIDE_LOAD_8:%.*]] = load <4 x i32>, ptr [[TMP22]], align 4
+; O2-NEXT:    [[TMP23:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT]]
+; O2-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 32
+; O2-NEXT:    store <4 x i32> [[TMP23]], ptr [[TMP24]], align 4
+; O2-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 36
+; O2-NEXT:    [[WIDE_LOAD_9:%.*]] = load <4 x i32>, ptr [[TMP25]], align 4
+; O2-NEXT:    [[TMP26:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT]]
+; O2-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 36
+; O2-NEXT:    store <4 x i32> [[TMP26]], ptr [[TMP27]], align 4
+; O2-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 40
+; O2-NEXT:    [[WIDE_LOAD_10:%.*]] = load <4 x i32>, ptr [[TMP28]], align 4
+; O2-NEXT:    [[TMP29:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT]]
+; O2-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 40
+; O2-NEXT:    store <4 x i32> [[TMP29]], ptr [[TMP30]], align 4
+; O2-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 44
+; O2-NEXT:    [[WIDE_LOAD_11:%.*]] = load <4 x i32>, ptr [[TMP31]], align 4
+; O2-NEXT:    [[TMP32:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT]]
+; O2-NEXT:    [[TMP33:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 44
+; O2-NEXT:    store <4 x i32> [[TMP32]], ptr [[TMP33]], align 4
+; O2-NEXT:    [[TMP34:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 48
+; O2-NEXT:    [[WIDE_LOAD_12:%.*]] = load <4 x i32>, ptr [[TMP34]], align 4
+; O2-NEXT:    [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT]]
+; O2-NEXT:    [[TMP36:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 48
+; O2-NEXT:    store <4 x i32> [[TMP35]], ptr [[TMP36]], align 4
+; O2-NEXT:    [[TMP37:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 52
+; O2-NEXT:    [[WIDE_LOAD_13:%.*]] = load <4 x i32>, ptr [[TMP37]], align 4
+; O2-NEXT:    [[TMP38:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT]]
+; O2-NEXT:    [[TMP39:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 52
+; O2-NEXT:    store <4 x i32> [[TMP38]], ptr [[TMP39]], align 4
+; O2-NEXT:    [[TMP40:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 56
+; O2-NEXT:    [[WIDE_LOAD_14:%.*]] = load <4 x i32>, ptr [[TMP40]], align 4
+; O2-NEXT:    [[TMP41:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT]]
+; O2-NEXT:    [[TMP42:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 56
+; O2-NEXT:    store <4 x i32> [[TMP41]], ptr [[TMP42]], align 4
+; O2-NEXT:    [[TMP43:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 60
+; O2-NEXT:    [[WIDE_LOAD_15:%.*]] = load <4 x i32>, ptr [[TMP43]], align 4
+; O2-NEXT:    [[TMP44:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT]]
+; O2-NEXT:    [[TMP45:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 60
+; O2-NEXT:    store <4 x i32> [[TMP44]], ptr [[TMP45]], align 4
+; O2-NEXT:    [[TMP46:%.*]] = load i32, ptr [[A]], align 4
+; O2-NEXT:    ret i32 [[TMP46]]
 ;
 ; O3-LABEL: @enabled(
 ; O3-NEXT:  entry:
 ; O3-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[N:%.*]], i64 0
 ; O3-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; O3-NEXT:    [[TMP0:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>*
-; O3-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
-; O3-NEXT:    [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
-; O3-NEXT:    [[TMP2:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>*
-; O3-NEXT:    store <4 x i32> [[TMP1]], <4 x i32>* [[TMP2]], align 4
-; O3-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 4
-; O3-NEXT:    [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>*
-; O3-NEXT:    [[WIDE_LOAD_1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4
-; O3-NEXT:    [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT]]
-; O3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 4
-; O3-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
-; O3-NEXT:    store <4 x i32> [[TMP5]], <4 x i32>* [[TMP7]], align 4
-; O3-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 8
-; O3-NEXT:    [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>*
-; O3-NEXT:    [[WIDE_LOAD_2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4
-; O3-NEXT:    [[TMP10:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT]]
-; O3-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 8
-; O3-NEXT:    [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>*
-; O3-NEXT:    store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4
-; O3-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 12
-; O3-NEXT:    [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <4 x i32>*
-; O3-NEXT:    [[WIDE_LOAD_3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP14]], align 4
-; O3-NEXT:    [[TMP15:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT]]
-; O3-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 12
-; O3-NEXT:    [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <4 x i32>*
-; O3-NEXT:    store <4 x i32> [[TMP15]], <4 x i32>* [[TMP17]], align 4
-; O3-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 16
-; O3-NEXT:    [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>*
-; O3-NEXT:    [[WIDE_LOAD_4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP19]], align 4
-; O3-NEXT:    [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT]]
-; O3-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 16
-; O3-NEXT:    [[TMP22:%.*]] = bitcast i32* [[TMP21]] to <4 x i32>*
-; O3-NEXT:    store <4 x i32> [[TMP20]], <4 x i32>* [[TMP22]], align 4
-; O3-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 20
-; O3-NEXT:    [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <4 x i32>*
-; O3-NEXT:    [[WIDE_LOAD_5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP24]], align 4
-; O3-NEXT:    [[TMP25:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT]]
-; O3-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 20
-; O3-NEXT:    [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>*
-; O3-NEXT:    store <4 x i32> [[TMP25]], <4 x i32>* [[TMP27]], align 4
-; O3-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 24
-; O3-NEXT:    [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>*
-; O3-NEXT:    [[WIDE_LOAD_6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP29]], align 4
-; O3-NEXT:    [[TMP30:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT]]
-; O3-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 24
-; O3-NEXT:    [[TMP32:%.*]] = bitcast i32* [[TMP31]] to <4 x i32>*
-; O3-NEXT:    store <4 x i32> [[TMP30]], <4 x i32>* [[TMP32]], align 4
-; O3-NEXT:    [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 28
-; O3-NEXT:    [[TMP34:%.*]] = bitcast i32* [[TMP33]] to <4 x i32>*
-; O3-NEXT:    [[WIDE_LOAD_7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP34]], align 4
-; O3-NEXT:    [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT]]
-; O3-NEXT:    [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 28
-; O3-NEXT:    [[TMP37:%.*]] = bitcast i32* [[TMP36]] to <4 x i32>*
-; O3-NEXT:    store <4 x i32> [[TMP35]], <4 x i32>* [[TMP37]], align 4
-; O3-NEXT:    [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 32
-; O3-NEXT:    [[TMP39:%.*]] = bitcast i32* [[TMP38]] to <4 x i32>*
-; O3-NEXT:    [[WIDE_LOAD_8:%.*]] = load <4 x i32>, <4 x i32>* [[TMP39]], align 4
-; O3-NEXT:    [[TMP40:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT]]
-; O3-NEXT:    [[TMP41:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 32
-; O3-NEXT:    [[TMP42:%.*]] = bitcast i32* [[TMP41]] to <4 x i32>*
-; O3-NEXT:    store <4 x i32> [[TMP40]], <4 x i32>* [[TMP42]], align 4
-; O3-NEXT:    [[TMP43:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 36
-; O3-NEXT:    [[TMP44:%.*]] = bitcast i32* [[TMP43]] to <4 x i32>*
-; O3-NEXT:    [[WIDE_LOAD_9:%.*]] = load <4 x i32>, <4 x i32>* [[TMP44]], align 4
-; O3-NEXT:    [[TMP45:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT]]
-; O3-NEXT:    [[TMP46:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 36
-; O3-NEXT:    [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <4 x i32>*
-; O3-NEXT:    store <4 x i32> [[TMP45]], <4 x i32>* [[TMP47]], align 4
-; O3-NEXT:    [[TMP48:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 40
-; O3-NEXT:    [[TMP49:%.*]] = bitcast i32* [[TMP48]] to <4 x i32>*
-; O3-NEXT:    [[WIDE_LOAD_10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP49]], align 4
-; O3-NEXT:    [[TMP50:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT]]
-; O3-NEXT:    [[TMP51:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 40
-; O3-NEXT:    [[TMP52:%.*]] = bitcast i32* [[TMP51]] to <4 x i32>*
-; O3-NEXT:    store <4 x i32> [[TMP50]], <4 x i32>* [[TMP52]], align 4
-; O3-NEXT:    [[TMP53:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 44
-; O3-NEXT:    [[TMP54:%.*]] = bitcast i32* [[TMP53]] to <4 x i32>*
-; O3-NEXT:    [[WIDE_LOAD_11:%.*]] = load <4 x i32>, <4 x i32>* [[TMP54]], align 4
-; O3-NEXT:    [[TMP55:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT]]
-; O3-NEXT:    [[TMP56:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 44
-; O3-NEXT:    [[TMP57:%.*]] = bitcast i32* [[TMP56]] to <4 x i32>*
-; O3-NEXT:    store <4 x i32> [[TMP55]], <4 x i32>* [[TMP57]], align 4
-; O3-NEXT:    [[TMP58:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 48
-; O3-NEXT:    [[TMP59:%.*]] = bitcast i32* [[TMP58]] to <4 x i32>*
-; O3-NEXT:    [[WIDE_LOAD_12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP59]], align 4
-; O3-NEXT:    [[TMP60:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT]]
-; O3-NEXT:    [[TMP61:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 48
-; O3-NEXT:    [[TMP62:%.*]] = bitcast i32* [[TMP61]] to <4 x i32>*
-; O3-NEXT:    store <4 x i32> [[TMP60]], <4 x i32>* [[TMP62]], align 4
-; O3-NEXT:    [[TMP63:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 52
-; O3-NEXT:    [[TMP64:%.*]] = bitcast i32* [[TMP63]] to <4 x i32>*
-; O3-NEXT:    [[WIDE_LOAD_13:%.*]] = load <4 x i32>, <4 x i32>* [[TMP64]], align 4
-; O3-NEXT:    [[TMP65:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT]]
-; O3-NEXT:    [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 52
-; O3-NEXT:    [[TMP67:%.*]] = bitcast i32* [[TMP66]] to <4 x i32>*
-; O3-NEXT:    store <4 x i32> [[TMP65]], <4 x i32>* [[TMP67]], align 4
-; O3-NEXT:    [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 56
-; O3-NEXT:    [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>*
-; O3-NEXT:    [[WIDE_LOAD_14:%.*]] = load <4 x i32>, <4 x i32>* [[TMP69]], align 4
-; O3-NEXT:    [[TMP70:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT]]
-; O3-NEXT:    [[TMP71:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 56
-; O3-NEXT:    [[TMP72:%.*]] = bitcast i32* [[TMP71]] to <4 x i32>*
-; O3-NEXT:    store <4 x i32> [[TMP70]], <4 x i32>* [[TMP72]], align 4
-; O3-NEXT:    [[TMP73:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 60
-; O3-NEXT:    [[TMP74:%.*]] = bitcast i32* [[TMP73]] to <4 x i32>*
-; O3-NEXT:    [[WIDE_LOAD_15:%.*]] = load <4 x i32>, <4 x i32>* [[TMP74]], align 4
-; O3-NEXT:    [[TMP75:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT]]
-; O3-NEXT:    [[TMP76:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 60
-; O3-NEXT:    [[TMP77:%.*]] = bitcast i32* [[TMP76]] to <4 x i32>*
-; O3-NEXT:    store <4 x i32> [[TMP75]], <4 x i32>* [[TMP77]], align 4
-; O3-NEXT:    [[TMP78:%.*]] = load i32, i32* [[A]], align 4
-; O3-NEXT:    ret i32 [[TMP78]]
+; O3-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[B:%.*]], align 4
+; O3-NEXT:    [[TMP0:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
+; O3-NEXT:    store <4 x i32> [[TMP0]], ptr [[A:%.*]], align 4
+; O3-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 4
+; O3-NEXT:    [[WIDE_LOAD_1:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4
+; O3-NEXT:    [[TMP2:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT]]
+; O3-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 4
+; O3-NEXT:    store <4 x i32> [[TMP2]], ptr [[TMP3]], align 4
+; O3-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 8
+; O3-NEXT:    [[WIDE_LOAD_2:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4
+; O3-NEXT:    [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT]]
+; O3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 8
+; O3-NEXT:    store <4 x i32> [[TMP5]], ptr [[TMP6]], align 4
+; O3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 12
+; O3-NEXT:    [[WIDE_LOAD_3:%.*]] = load <4 x i32>, ptr [[TMP7]], align 4
+; O3-NEXT:    [[TMP8:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT]]
+; O3-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 12
+; O3-NEXT:    store <4 x i32> [[TMP8]], ptr [[TMP9]], align 4
+; O3-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 16
+; O3-NEXT:    [[WIDE_LOAD_4:%.*]] = load <4 x i32>, ptr [[TMP10]], align 4
+; O3-NEXT:    [[TMP11:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT]]
+; O3-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 16
+; O3-NEXT:    store <4 x i32> [[TMP11]], ptr [[TMP12]], align 4
+; O3-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 20
+; O3-NEXT:    [[WIDE_LOAD_5:%.*]] = load <4 x i32>, ptr [[TMP13]], align 4
+; O3-NEXT:    [[TMP14:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT]]
+; O3-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 20
+; O3-NEXT:    store <4 x i32> [[TMP14]], ptr [[TMP15]], align 4
+; O3-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 24
+; O3-NEXT:    [[WIDE_LOAD_6:%.*]] = load <4 x i32>, ptr [[TMP16]], align 4
+; O3-NEXT:    [[TMP17:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT]]
+; O3-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 24
+; O3-NEXT:    store <4 x i32> [[TMP17]], ptr [[TMP18]], align 4
+; O3-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 28
+; O3-NEXT:    [[WIDE_LOAD_7:%.*]] = load <4 x i32>, ptr [[TMP19]], align 4
+; O3-NEXT:    [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT]]
+; O3-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 28
+; O3-NEXT:    store <4 x i32> [[TMP20]], ptr [[TMP21]], align 4
+; O3-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 32
+; O3-NEXT:    [[WIDE_LOAD_8:%.*]] = load <4 x i32>, ptr [[TMP22]], align 4
+; O3-NEXT:    [[TMP23:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT]]
+; O3-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 32
+; O3-NEXT:    store <4 x i32> [[TMP23]], ptr [[TMP24]], align 4
+; O3-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 36
+; O3-NEXT:    [[WIDE_LOAD_9:%.*]] = load <4 x i32>, ptr [[TMP25]], align 4
+; O3-NEXT:    [[TMP26:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT]]
+; O3-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 36
+; O3-NEXT:    store <4 x i32> [[TMP26]], ptr [[TMP27]], align 4
+; O3-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 40
+; O3-NEXT:    [[WIDE_LOAD_10:%.*]] = load <4 x i32>, ptr [[TMP28]], align 4
+; O3-NEXT:    [[TMP29:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT]]
+; O3-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 40
+; O3-NEXT:    store <4 x i32> [[TMP29]], ptr [[TMP30]], align 4
+; O3-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 44
+; O3-NEXT:    [[WIDE_LOAD_11:%.*]] = load <4 x i32>, ptr [[TMP31]], align 4
+; O3-NEXT:    [[TMP32:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT]]
+; O3-NEXT:    [[TMP33:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 44
+; O3-NEXT:    store <4 x i32> [[TMP32]], ptr [[TMP33]], align 4
+; O3-NEXT:    [[TMP34:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 48
+; O3-NEXT:    [[WIDE_LOAD_12:%.*]] = load <4 x i32>, ptr [[TMP34]], align 4
+; O3-NEXT:    [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT]]
+; O3-NEXT:    [[TMP36:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 48
+; O3-NEXT:    store <4 x i32> [[TMP35]], ptr [[TMP36]], align 4
+; O3-NEXT:    [[TMP37:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 52
+; O3-NEXT:    [[WIDE_LOAD_13:%.*]] = load <4 x i32>, ptr [[TMP37]], align 4
+; O3-NEXT:    [[TMP38:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT]]
+; O3-NEXT:    [[TMP39:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 52
+; O3-NEXT:    store <4 x i32> [[TMP38]], ptr [[TMP39]], align 4
+; O3-NEXT:    [[TMP40:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 56
+; O3-NEXT:    [[WIDE_LOAD_14:%.*]] = load <4 x i32>, ptr [[TMP40]], align 4
+; O3-NEXT:    [[TMP41:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT]]
+; O3-NEXT:    [[TMP42:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 56
+; O3-NEXT:    store <4 x i32> [[TMP41]], ptr [[TMP42]], align 4
+; O3-NEXT:    [[TMP43:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 60
+; O3-NEXT:    [[WIDE_LOAD_15:%.*]] = load <4 x i32>, ptr [[TMP43]], align 4
+; O3-NEXT:    [[TMP44:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT]]
+; O3-NEXT:    [[TMP45:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 60
+; O3-NEXT:    store <4 x i32> [[TMP44]], ptr [[TMP45]], align 4
+; O3-NEXT:    [[TMP46:%.*]] = load i32, ptr [[A]], align 4
+; O3-NEXT:    ret i32 [[TMP46]]
 ;
 ; O3DEFAULT-LABEL: @enabled(
 ; O3DEFAULT-NEXT:  entry:
 ; O3DEFAULT-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[N:%.*]], i64 0
 ; O3DEFAULT-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; O3DEFAULT-NEXT:    [[TMP0:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>*
-; O3DEFAULT-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
-; O3DEFAULT-NEXT:    [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
-; O3DEFAULT-NEXT:    [[TMP2:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>*
-; O3DEFAULT-NEXT:    store <4 x i32> [[TMP1]], <4 x i32>* [[TMP2]], align 4
-; O3DEFAULT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 4
-; O3DEFAULT-NEXT:    [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>*
-; O3DEFAULT-NEXT:    [[WIDE_LOAD_1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4
-; O3DEFAULT-NEXT:    [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT]]
-; O3DEFAULT-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 4
-; O3DEFAULT-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
-; O3DEFAULT-NEXT:    store <4 x i32> [[TMP5]], <4 x i32>* [[TMP7]], align 4
-; O3DEFAULT-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 8
-; O3DEFAULT-NEXT:    [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>*
-; O3DEFAULT-NEXT:    [[WIDE_LOAD_2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4
-; O3DEFAULT-NEXT:    [[TMP10:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT]]
-; O3DEFAULT-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 8
-; O3DEFAULT-NEXT:    [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>*
-; O3DEFAULT-NEXT:    store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4
-; O3DEFAULT-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 12
-; O3DEFAULT-NEXT:    [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <4 x i32>*
-; O3DEFAULT-NEXT:    [[WIDE_LOAD_3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP14]], align 4
-; O3DEFAULT-NEXT:    [[TMP15:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT]]
-; O3DEFAULT-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 12
-; O3DEFAULT-NEXT:    [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <4 x i32>*
-; O3DEFAULT-NEXT:    store <4 x i32> [[TMP15]], <4 x i32>* [[TMP17]], align 4
-; O3DEFAULT-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 16
-; O3DEFAULT-NEXT:    [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>*
-; O3DEFAULT-NEXT:    [[WIDE_LOAD_4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP19]], align 4
-; O3DEFAULT-NEXT:    [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT]]
-; O3DEFAULT-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 16
-; O3DEFAULT-NEXT:    [[TMP22:%.*]] = bitcast i32* [[TMP21]] to <4 x i32>*
-; O3DEFAULT-NEXT:    store <4 x i32> [[TMP20]], <4 x i32>* [[TMP22]], align 4
-; O3DEFAULT-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 20
-; O3DEFAULT-NEXT:    [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <4 x i32>*
-; O3DEFAULT-NEXT:    [[WIDE_LOAD_5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP24]], align 4
-; O3DEFAULT-NEXT:    [[TMP25:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT]]
-; O3DEFAULT-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 20
-; O3DEFAULT-NEXT:    [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>*
-; O3DEFAULT-NEXT:    store <4 x i32> [[TMP25]], <4 x i32>* [[TMP27]], align 4
-; O3DEFAULT-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 24
-; O3DEFAULT-NEXT:    [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>*
-; O3DEFAULT-NEXT:    [[WIDE_LOAD_6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP29]], align 4
-; O3DEFAULT-NEXT:    [[TMP30:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT]]
-; O3DEFAULT-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 24
-; O3DEFAULT-NEXT:    [[TMP32:%.*]] = bitcast i32* [[TMP31]] to <4 x i32>*
-; O3DEFAULT-NEXT:    store <4 x i32> [[TMP30]], <4 x i32>* [[TMP32]], align 4
-; O3DEFAULT-NEXT:    [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 28
-; O3DEFAULT-NEXT:    [[TMP34:%.*]] = bitcast i32* [[TMP33]] to <4 x i32>*
-; O3DEFAULT-NEXT:    [[WIDE_LOAD_7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP34]], align 4
-; O3DEFAULT-NEXT:    [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT]]
-; O3DEFAULT-NEXT:    [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 28
-; O3DEFAULT-NEXT:    [[TMP37:%.*]] = bitcast i32* [[TMP36]] to <4 x i32>*
-; O3DEFAULT-NEXT:    store <4 x i32> [[TMP35]], <4 x i32>* [[TMP37]], align 4
-; O3DEFAULT-NEXT:    [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 32
-; O3DEFAULT-NEXT:    [[TMP39:%.*]] = bitcast i32* [[TMP38]] to <4 x i32>*
-; O3DEFAULT-NEXT:    [[WIDE_LOAD_8:%.*]] = load <4 x i32>, <4 x i32>* [[TMP39]], align 4
-; O3DEFAULT-NEXT:    [[TMP40:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT]]
-; O3DEFAULT-NEXT:    [[TMP41:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 32
-; O3DEFAULT-NEXT:    [[TMP42:%.*]] = bitcast i32* [[TMP41]] to <4 x i32>*
-; O3DEFAULT-NEXT:    store <4 x i32> [[TMP40]], <4 x i32>* [[TMP42]], align 4
-; O3DEFAULT-NEXT:    [[TMP43:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 36
-; O3DEFAULT-NEXT:    [[TMP44:%.*]] = bitcast i32* [[TMP43]] to <4 x i32>*
-; O3DEFAULT-NEXT:    [[WIDE_LOAD_9:%.*]] = load <4 x i32>, <4 x i32>* [[TMP44]], align 4
-; O3DEFAULT-NEXT:    [[TMP45:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT]]
-; O3DEFAULT-NEXT:    [[TMP46:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 36
-; O3DEFAULT-NEXT:    [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <4 x i32>*
-; O3DEFAULT-NEXT:    store <4 x i32> [[TMP45]], <4 x i32>* [[TMP47]], align 4
-; O3DEFAULT-NEXT:    [[TMP48:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 40
-; O3DEFAULT-NEXT:    [[TMP49:%.*]] = bitcast i32* [[TMP48]] to <4 x i32>*
-; O3DEFAULT-NEXT:    [[WIDE_LOAD_10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP49]], align 4
-; O3DEFAULT-NEXT:    [[TMP50:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT]]
-; O3DEFAULT-NEXT:    [[TMP51:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 40
-; O3DEFAULT-NEXT:    [[TMP52:%.*]] = bitcast i32* [[TMP51]] to <4 x i32>*
-; O3DEFAULT-NEXT:    store <4 x i32> [[TMP50]], <4 x i32>* [[TMP52]], align 4
-; O3DEFAULT-NEXT:    [[TMP53:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 44
-; O3DEFAULT-NEXT:    [[TMP54:%.*]] = bitcast i32* [[TMP53]] to <4 x i32>*
-; O3DEFAULT-NEXT:    [[WIDE_LOAD_11:%.*]] = load <4 x i32>, <4 x i32>* [[TMP54]], align 4
-; O3DEFAULT-NEXT:    [[TMP55:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT]]
-; O3DEFAULT-NEXT:    [[TMP56:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 44
-; O3DEFAULT-NEXT:    [[TMP57:%.*]] = bitcast i32* [[TMP56]] to <4 x i32>*
-; O3DEFAULT-NEXT:    store <4 x i32> [[TMP55]], <4 x i32>* [[TMP57]], align 4
-; O3DEFAULT-NEXT:    [[TMP58:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 48
-; O3DEFAULT-NEXT:    [[TMP59:%.*]] = bitcast i32* [[TMP58]] to <4 x i32>*
-; O3DEFAULT-NEXT:    [[WIDE_LOAD_12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP59]], align 4
-; O3DEFAULT-NEXT:    [[TMP60:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT]]
-; O3DEFAULT-NEXT:    [[TMP61:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 48
-; O3DEFAULT-NEXT:    [[TMP62:%.*]] = bitcast i32* [[TMP61]] to <4 x i32>*
-; O3DEFAULT-NEXT:    store <4 x i32> [[TMP60]], <4 x i32>* [[TMP62]], align 4
-; O3DEFAULT-NEXT:    [[TMP63:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 52
-; O3DEFAULT-NEXT:    [[TMP64:%.*]] = bitcast i32* [[TMP63]] to <4 x i32>*
-; O3DEFAULT-NEXT:    [[WIDE_LOAD_13:%.*]] = load <4 x i32>, <4 x i32>* [[TMP64]], align 4
-; O3DEFAULT-NEXT:    [[TMP65:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT]]
-; O3DEFAULT-NEXT:    [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 52
-; O3DEFAULT-NEXT:    [[TMP67:%.*]] = bitcast i32* [[TMP66]] to <4 x i32>*
-; O3DEFAULT-NEXT:    store <4 x i32> [[TMP65]], <4 x i32>* [[TMP67]], align 4
-; O3DEFAULT-NEXT:    [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 56
-; O3DEFAULT-NEXT:    [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>*
-; O3DEFAULT-NEXT:    [[WIDE_LOAD_14:%.*]] = load <4 x i32>, <4 x i32>* [[TMP69]], align 4
-; O3DEFAULT-NEXT:    [[TMP70:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT]]
-; O3DEFAULT-NEXT:    [[TMP71:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 56
-; O3DEFAULT-NEXT:    [[TMP72:%.*]] = bitcast i32* [[TMP71]] to <4 x i32>*
-; O3DEFAULT-NEXT:    store <4 x i32> [[TMP70]], <4 x i32>* [[TMP72]], align 4
-; O3DEFAULT-NEXT:    [[TMP73:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 60
-; O3DEFAULT-NEXT:    [[TMP74:%.*]] = bitcast i32* [[TMP73]] to <4 x i32>*
-; O3DEFAULT-NEXT:    [[WIDE_LOAD_15:%.*]] = load <4 x i32>, <4 x i32>* [[TMP74]], align 4
-; O3DEFAULT-NEXT:    [[TMP75:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT]]
-; O3DEFAULT-NEXT:    [[TMP76:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 60
-; O3DEFAULT-NEXT:    [[TMP77:%.*]] = bitcast i32* [[TMP76]] to <4 x i32>*
-; O3DEFAULT-NEXT:    store <4 x i32> [[TMP75]], <4 x i32>* [[TMP77]], align 4
-; O3DEFAULT-NEXT:    [[TMP78:%.*]] = load i32, i32* [[A]], align 4
-; O3DEFAULT-NEXT:    ret i32 [[TMP78]]
+; O3DEFAULT-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[B:%.*]], align 4
+; O3DEFAULT-NEXT:    [[TMP0:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
+; O3DEFAULT-NEXT:    store <4 x i32> [[TMP0]], ptr [[A:%.*]], align 4
+; O3DEFAULT-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 4
+; O3DEFAULT-NEXT:    [[WIDE_LOAD_1:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4
+; O3DEFAULT-NEXT:    [[TMP2:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT]]
+; O3DEFAULT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 4
+; O3DEFAULT-NEXT:    store <4 x i32> [[TMP2]], ptr [[TMP3]], align 4
+; O3DEFAULT-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 8
+; O3DEFAULT-NEXT:    [[WIDE_LOAD_2:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4
+; O3DEFAULT-NEXT:    [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT]]
+; O3DEFAULT-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 8
+; O3DEFAULT-NEXT:    store <4 x i32> [[TMP5]], ptr [[TMP6]], align 4
+; O3DEFAULT-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 12
+; O3DEFAULT-NEXT:    [[WIDE_LOAD_3:%.*]] = load <4 x i32>, ptr [[TMP7]], align 4
+; O3DEFAULT-NEXT:    [[TMP8:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT]]
+; O3DEFAULT-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 12
+; O3DEFAULT-NEXT:    store <4 x i32> [[TMP8]], ptr [[TMP9]], align 4
+; O3DEFAULT-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 16
+; O3DEFAULT-NEXT:    [[WIDE_LOAD_4:%.*]] = load <4 x i32>, ptr [[TMP10]], align 4
+; O3DEFAULT-NEXT:    [[TMP11:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT]]
+; O3DEFAULT-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 16
+; O3DEFAULT-NEXT:    store <4 x i32> [[TMP11]], ptr [[TMP12]], align 4
+; O3DEFAULT-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 20
+; O3DEFAULT-NEXT:    [[WIDE_LOAD_5:%.*]] = load <4 x i32>, ptr [[TMP13]], align 4
+; O3DEFAULT-NEXT:    [[TMP14:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT]]
+; O3DEFAULT-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 20
+; O3DEFAULT-NEXT:    store <4 x i32> [[TMP14]], ptr [[TMP15]], align 4
+; O3DEFAULT-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 24
+; O3DEFAULT-NEXT:    [[WIDE_LOAD_6:%.*]] = load <4 x i32>, ptr [[TMP16]], align 4
+; O3DEFAULT-NEXT:    [[TMP17:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT]]
+; O3DEFAULT-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 24
+; O3DEFAULT-NEXT:    store <4 x i32> [[TMP17]], ptr [[TMP18]], align 4
+; O3DEFAULT-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 28
+; O3DEFAULT-NEXT:    [[WIDE_LOAD_7:%.*]] = load <4 x i32>, ptr [[TMP19]], align 4
+; O3DEFAULT-NEXT:    [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT]]
+; O3DEFAULT-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 28
+; O3DEFAULT-NEXT:    store <4 x i32> [[TMP20]], ptr [[TMP21]], align 4
+; O3DEFAULT-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 32
+; O3DEFAULT-NEXT:    [[WIDE_LOAD_8:%.*]] = load <4 x i32>, ptr [[TMP22]], align 4
+; O3DEFAULT-NEXT:    [[TMP23:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT]]
+; O3DEFAULT-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 32
+; O3DEFAULT-NEXT:    store <4 x i32> [[TMP23]], ptr [[TMP24]], align 4
+; O3DEFAULT-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 36
+; O3DEFAULT-NEXT:    [[WIDE_LOAD_9:%.*]] = load <4 x i32>, ptr [[TMP25]], align 4
+; O3DEFAULT-NEXT:    [[TMP26:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT]]
+; O3DEFAULT-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 36
+; O3DEFAULT-NEXT:    store <4 x i32> [[TMP26]], ptr [[TMP27]], align 4
+; O3DEFAULT-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 40
+; O3DEFAULT-NEXT:    [[WIDE_LOAD_10:%.*]] = load <4 x i32>, ptr [[TMP28]], align 4
+; O3DEFAULT-NEXT:    [[TMP29:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT]]
+; O3DEFAULT-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 40
+; O3DEFAULT-NEXT:    store <4 x i32> [[TMP29]], ptr [[TMP30]], align 4
+; O3DEFAULT-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 44
+; O3DEFAULT-NEXT:    [[WIDE_LOAD_11:%.*]] = load <4 x i32>, ptr [[TMP31]], align 4
+; O3DEFAULT-NEXT:    [[TMP32:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT]]
+; O3DEFAULT-NEXT:    [[TMP33:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 44
+; O3DEFAULT-NEXT:    store <4 x i32> [[TMP32]], ptr [[TMP33]], align 4
+; O3DEFAULT-NEXT:    [[TMP34:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 48
+; O3DEFAULT-NEXT:    [[WIDE_LOAD_12:%.*]] = load <4 x i32>, ptr [[TMP34]], align 4
+; O3DEFAULT-NEXT:    [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT]]
+; O3DEFAULT-NEXT:    [[TMP36:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 48
+; O3DEFAULT-NEXT:    store <4 x i32> [[TMP35]], ptr [[TMP36]], align 4
+; O3DEFAULT-NEXT:    [[TMP37:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 52
+; O3DEFAULT-NEXT:    [[WIDE_LOAD_13:%.*]] = load <4 x i32>, ptr [[TMP37]], align 4
+; O3DEFAULT-NEXT:    [[TMP38:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT]]
+; O3DEFAULT-NEXT:    [[TMP39:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 52
+; O3DEFAULT-NEXT:    store <4 x i32> [[TMP38]], ptr [[TMP39]], align 4
+; O3DEFAULT-NEXT:    [[TMP40:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 56
+; O3DEFAULT-NEXT:    [[WIDE_LOAD_14:%.*]] = load <4 x i32>, ptr [[TMP40]], align 4
+; O3DEFAULT-NEXT:    [[TMP41:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT]]
+; O3DEFAULT-NEXT:    [[TMP42:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 56
+; O3DEFAULT-NEXT:    store <4 x i32> [[TMP41]], ptr [[TMP42]], align 4
+; O3DEFAULT-NEXT:    [[TMP43:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 60
+; O3DEFAULT-NEXT:    [[WIDE_LOAD_15:%.*]] = load <4 x i32>, ptr [[TMP43]], align 4
+; O3DEFAULT-NEXT:    [[TMP44:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT]]
+; O3DEFAULT-NEXT:    [[TMP45:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 60
+; O3DEFAULT-NEXT:    store <4 x i32> [[TMP44]], ptr [[TMP45]], align 4
+; O3DEFAULT-NEXT:    [[TMP46:%.*]] = load i32, ptr [[A]], align 4
+; O3DEFAULT-NEXT:    ret i32 [[TMP46]]
 ;
 ; Os-LABEL: @enabled(
 ; Os-NEXT:  entry:
 ; Os-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[N:%.*]], i64 0
 ; Os-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; Os-NEXT:    [[TMP0:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>*
-; Os-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
-; Os-NEXT:    [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
-; Os-NEXT:    [[TMP2:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>*
-; Os-NEXT:    store <4 x i32> [[TMP1]], <4 x i32>* [[TMP2]], align 4
-; Os-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 4
-; Os-NEXT:    [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>*
-; Os-NEXT:    [[WIDE_LOAD_1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4
-; Os-NEXT:    [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT]]
-; Os-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 4
-; Os-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
-; Os-NEXT:    store <4 x i32> [[TMP5]], <4 x i32>* [[TMP7]], align 4
-; Os-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 8
-; Os-NEXT:    [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>*
-; Os-NEXT:    [[WIDE_LOAD_2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4
-; Os-NEXT:    [[TMP10:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT]]
-; Os-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 8
-; Os-NEXT:    [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>*
-; Os-NEXT:    store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4
-; Os-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 12
-; Os-NEXT:    [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <4 x i32>*
-; Os-NEXT:    [[WIDE_LOAD_3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP14]], align 4
-; Os-NEXT:    [[TMP15:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT]]
-; Os-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 12
-; Os-NEXT:    [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <4 x i32>*
-; Os-NEXT:    store <4 x i32> [[TMP15]], <4 x i32>* [[TMP17]], align 4
-; Os-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 16
-; Os-NEXT:    [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>*
-; Os-NEXT:    [[WIDE_LOAD_4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP19]], align 4
-; Os-NEXT:    [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT]]
-; Os-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 16
-; Os-NEXT:    [[TMP22:%.*]] = bitcast i32* [[TMP21]] to <4 x i32>*
-; Os-NEXT:    store <4 x i32> [[TMP20]], <4 x i32>* [[TMP22]], align 4
-; Os-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 20
-; Os-NEXT:    [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <4 x i32>*
-; Os-NEXT:    [[WIDE_LOAD_5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP24]], align 4
-; Os-NEXT:    [[TMP25:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT]]
-; Os-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 20
-; Os-NEXT:    [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>*
-; Os-NEXT:    store <4 x i32> [[TMP25]], <4 x i32>* [[TMP27]], align 4
-; Os-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 24
-; Os-NEXT:    [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>*
-; Os-NEXT:    [[WIDE_LOAD_6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP29]], align 4
-; Os-NEXT:    [[TMP30:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT]]
-; Os-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 24
-; Os-NEXT:    [[TMP32:%.*]] = bitcast i32* [[TMP31]] to <4 x i32>*
-; Os-NEXT:    store <4 x i32> [[TMP30]], <4 x i32>* [[TMP32]], align 4
-; Os-NEXT:    [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 28
-; Os-NEXT:    [[TMP34:%.*]] = bitcast i32* [[TMP33]] to <4 x i32>*
-; Os-NEXT:    [[WIDE_LOAD_7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP34]], align 4
-; Os-NEXT:    [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT]]
-; Os-NEXT:    [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 28
-; Os-NEXT:    [[TMP37:%.*]] = bitcast i32* [[TMP36]] to <4 x i32>*
-; Os-NEXT:    store <4 x i32> [[TMP35]], <4 x i32>* [[TMP37]], align 4
-; Os-NEXT:    [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 32
-; Os-NEXT:    [[TMP39:%.*]] = bitcast i32* [[TMP38]] to <4 x i32>*
-; Os-NEXT:    [[WIDE_LOAD_8:%.*]] = load <4 x i32>, <4 x i32>* [[TMP39]], align 4
-; Os-NEXT:    [[TMP40:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT]]
-; Os-NEXT:    [[TMP41:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 32
-; Os-NEXT:    [[TMP42:%.*]] = bitcast i32* [[TMP41]] to <4 x i32>*
-; Os-NEXT:    store <4 x i32> [[TMP40]], <4 x i32>* [[TMP42]], align 4
-; Os-NEXT:    [[TMP43:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 36
-; Os-NEXT:    [[TMP44:%.*]] = bitcast i32* [[TMP43]] to <4 x i32>*
-; Os-NEXT:    [[WIDE_LOAD_9:%.*]] = load <4 x i32>, <4 x i32>* [[TMP44]], align 4
-; Os-NEXT:    [[TMP45:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT]]
-; Os-NEXT:    [[TMP46:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 36
-; Os-NEXT:    [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <4 x i32>*
-; Os-NEXT:    store <4 x i32> [[TMP45]], <4 x i32>* [[TMP47]], align 4
-; Os-NEXT:    [[TMP48:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 40
-; Os-NEXT:    [[TMP49:%.*]] = bitcast i32* [[TMP48]] to <4 x i32>*
-; Os-NEXT:    [[WIDE_LOAD_10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP49]], align 4
-; Os-NEXT:    [[TMP50:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT]]
-; Os-NEXT:    [[TMP51:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 40
-; Os-NEXT:    [[TMP52:%.*]] = bitcast i32* [[TMP51]] to <4 x i32>*
-; Os-NEXT:    store <4 x i32> [[TMP50]], <4 x i32>* [[TMP52]], align 4
-; Os-NEXT:    [[TMP53:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 44
-; Os-NEXT:    [[TMP54:%.*]] = bitcast i32* [[TMP53]] to <4 x i32>*
-; Os-NEXT:    [[WIDE_LOAD_11:%.*]] = load <4 x i32>, <4 x i32>* [[TMP54]], align 4
-; Os-NEXT:    [[TMP55:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT]]
-; Os-NEXT:    [[TMP56:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 44
-; Os-NEXT:    [[TMP57:%.*]] = bitcast i32* [[TMP56]] to <4 x i32>*
-; Os-NEXT:    store <4 x i32> [[TMP55]], <4 x i32>* [[TMP57]], align 4
-; Os-NEXT:    [[TMP58:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 48
-; Os-NEXT:    [[TMP59:%.*]] = bitcast i32* [[TMP58]] to <4 x i32>*
-; Os-NEXT:    [[WIDE_LOAD_12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP59]], align 4
-; Os-NEXT:    [[TMP60:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT]]
-; Os-NEXT:    [[TMP61:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 48
-; Os-NEXT:    [[TMP62:%.*]] = bitcast i32* [[TMP61]] to <4 x i32>*
-; Os-NEXT:    store <4 x i32> [[TMP60]], <4 x i32>* [[TMP62]], align 4
-; Os-NEXT:    [[TMP63:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 52
-; Os-NEXT:    [[TMP64:%.*]] = bitcast i32* [[TMP63]] to <4 x i32>*
-; Os-NEXT:    [[WIDE_LOAD_13:%.*]] = load <4 x i32>, <4 x i32>* [[TMP64]], align 4
-; Os-NEXT:    [[TMP65:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT]]
-; Os-NEXT:    [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 52
-; Os-NEXT:    [[TMP67:%.*]] = bitcast i32* [[TMP66]] to <4 x i32>*
-; Os-NEXT:    store <4 x i32> [[TMP65]], <4 x i32>* [[TMP67]], align 4
-; Os-NEXT:    [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 56
-; Os-NEXT:    [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>*
-; Os-NEXT:    [[WIDE_LOAD_14:%.*]] = load <4 x i32>, <4 x i32>* [[TMP69]], align 4
-; Os-NEXT:    [[TMP70:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT]]
-; Os-NEXT:    [[TMP71:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 56
-; Os-NEXT:    [[TMP72:%.*]] = bitcast i32* [[TMP71]] to <4 x i32>*
-; Os-NEXT:    store <4 x i32> [[TMP70]], <4 x i32>* [[TMP72]], align 4
-; Os-NEXT:    [[TMP73:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 60
-; Os-NEXT:    [[TMP74:%.*]] = bitcast i32* [[TMP73]] to <4 x i32>*
-; Os-NEXT:    [[WIDE_LOAD_15:%.*]] = load <4 x i32>, <4 x i32>* [[TMP74]], align 4
-; Os-NEXT:    [[TMP75:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT]]
-; Os-NEXT:    [[TMP76:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 60
-; Os-NEXT:    [[TMP77:%.*]] = bitcast i32* [[TMP76]] to <4 x i32>*
-; Os-NEXT:    store <4 x i32> [[TMP75]], <4 x i32>* [[TMP77]], align 4
-; Os-NEXT:    [[TMP78:%.*]] = load i32, i32* [[A]], align 4
-; Os-NEXT:    ret i32 [[TMP78]]
+; Os-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[B:%.*]], align 4
+; Os-NEXT:    [[TMP0:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
+; Os-NEXT:    store <4 x i32> [[TMP0]], ptr [[A:%.*]], align 4
+; Os-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 4
+; Os-NEXT:    [[WIDE_LOAD_1:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4
+; Os-NEXT:    [[TMP2:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT]]
+; Os-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 4
+; Os-NEXT:    store <4 x i32> [[TMP2]], ptr [[TMP3]], align 4
+; Os-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 8
+; Os-NEXT:    [[WIDE_LOAD_2:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4
+; Os-NEXT:    [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT]]
+; Os-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 8
+; Os-NEXT:    store <4 x i32> [[TMP5]], ptr [[TMP6]], align 4
+; Os-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 12
+; Os-NEXT:    [[WIDE_LOAD_3:%.*]] = load <4 x i32>, ptr [[TMP7]], align 4
+; Os-NEXT:    [[TMP8:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT]]
+; Os-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 12
+; Os-NEXT:    store <4 x i32> [[TMP8]], ptr [[TMP9]], align 4
+; Os-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 16
+; Os-NEXT:    [[WIDE_LOAD_4:%.*]] = load <4 x i32>, ptr [[TMP10]], align 4
+; Os-NEXT:    [[TMP11:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT]]
+; Os-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 16
+; Os-NEXT:    store <4 x i32> [[TMP11]], ptr [[TMP12]], align 4
+; Os-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 20
+; Os-NEXT:    [[WIDE_LOAD_5:%.*]] = load <4 x i32>, ptr [[TMP13]], align 4
+; Os-NEXT:    [[TMP14:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT]]
+; Os-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 20
+; Os-NEXT:    store <4 x i32> [[TMP14]], ptr [[TMP15]], align 4
+; Os-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 24
+; Os-NEXT:    [[WIDE_LOAD_6:%.*]] = load <4 x i32>, ptr [[TMP16]], align 4
+; Os-NEXT:    [[TMP17:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT]]
+; Os-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 24
+; Os-NEXT:    store <4 x i32> [[TMP17]], ptr [[TMP18]], align 4
+; Os-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 28
+; Os-NEXT:    [[WIDE_LOAD_7:%.*]] = load <4 x i32>, ptr [[TMP19]], align 4
+; Os-NEXT:    [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT]]
+; Os-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 28
+; Os-NEXT:    store <4 x i32> [[TMP20]], ptr [[TMP21]], align 4
+; Os-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 32
+; Os-NEXT:    [[WIDE_LOAD_8:%.*]] = load <4 x i32>, ptr [[TMP22]], align 4
+; Os-NEXT:    [[TMP23:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT]]
+; Os-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 32
+; Os-NEXT:    store <4 x i32> [[TMP23]], ptr [[TMP24]], align 4
+; Os-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 36
+; Os-NEXT:    [[WIDE_LOAD_9:%.*]] = load <4 x i32>, ptr [[TMP25]], align 4
+; Os-NEXT:    [[TMP26:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT]]
+; Os-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 36
+; Os-NEXT:    store <4 x i32> [[TMP26]], ptr [[TMP27]], align 4
+; Os-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 40
+; Os-NEXT:    [[WIDE_LOAD_10:%.*]] = load <4 x i32>, ptr [[TMP28]], align 4
+; Os-NEXT:    [[TMP29:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT]]
+; Os-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 40
+; Os-NEXT:    store <4 x i32> [[TMP29]], ptr [[TMP30]], align 4
+; Os-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 44
+; Os-NEXT:    [[WIDE_LOAD_11:%.*]] = load <4 x i32>, ptr [[TMP31]], align 4
+; Os-NEXT:    [[TMP32:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT]]
+; Os-NEXT:    [[TMP33:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 44
+; Os-NEXT:    store <4 x i32> [[TMP32]], ptr [[TMP33]], align 4
+; Os-NEXT:    [[TMP34:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 48
+; Os-NEXT:    [[WIDE_LOAD_12:%.*]] = load <4 x i32>, ptr [[TMP34]], align 4
+; Os-NEXT:    [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT]]
+; Os-NEXT:    [[TMP36:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 48
+; Os-NEXT:    store <4 x i32> [[TMP35]], ptr [[TMP36]], align 4
+; Os-NEXT:    [[TMP37:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 52
+; Os-NEXT:    [[WIDE_LOAD_13:%.*]] = load <4 x i32>, ptr [[TMP37]], align 4
+; Os-NEXT:    [[TMP38:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT]]
+; Os-NEXT:    [[TMP39:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 52
+; Os-NEXT:    store <4 x i32> [[TMP38]], ptr [[TMP39]], align 4
+; Os-NEXT:    [[TMP40:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 56
+; Os-NEXT:    [[WIDE_LOAD_14:%.*]] = load <4 x i32>, ptr [[TMP40]], align 4
+; Os-NEXT:    [[TMP41:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT]]
+; Os-NEXT:    [[TMP42:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 56
+; Os-NEXT:    store <4 x i32> [[TMP41]], ptr [[TMP42]], align 4
+; Os-NEXT:    [[TMP43:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 60
+; Os-NEXT:    [[WIDE_LOAD_15:%.*]] = load <4 x i32>, ptr [[TMP43]], align 4
+; Os-NEXT:    [[TMP44:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT]]
+; Os-NEXT:    [[TMP45:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 60
+; Os-NEXT:    store <4 x i32> [[TMP44]], ptr [[TMP45]], align 4
+; Os-NEXT:    [[TMP46:%.*]] = load i32, ptr [[A]], align 4
+; Os-NEXT:    ret i32 [[TMP46]]
 ;
 ; Oz-LABEL: @enabled(
 ; Oz-NEXT:  entry:
 ; Oz-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[N:%.*]], i64 0
 ; Oz-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; Oz-NEXT:    [[TMP0:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>*
-; Oz-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
-; Oz-NEXT:    [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
-; Oz-NEXT:    [[TMP2:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>*
-; Oz-NEXT:    store <4 x i32> [[TMP1]], <4 x i32>* [[TMP2]], align 4
-; Oz-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 4
-; Oz-NEXT:    [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>*
-; Oz-NEXT:    [[WIDE_LOAD_1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4
-; Oz-NEXT:    [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT]]
-; Oz-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 4
-; Oz-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
-; Oz-NEXT:    store <4 x i32> [[TMP5]], <4 x i32>* [[TMP7]], align 4
-; Oz-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 8
-; Oz-NEXT:    [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>*
-; Oz-NEXT:    [[WIDE_LOAD_2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4
-; Oz-NEXT:    [[TMP10:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT]]
-; Oz-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 8
-; Oz-NEXT:    [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>*
-; Oz-NEXT:    store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4
-; Oz-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 12
-; Oz-NEXT:    [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <4 x i32>*
-; Oz-NEXT:    [[WIDE_LOAD_3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP14]], align 4
-; Oz-NEXT:    [[TMP15:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT]]
-; Oz-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 12
-; Oz-NEXT:    [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <4 x i32>*
-; Oz-NEXT:    store <4 x i32> [[TMP15]], <4 x i32>* [[TMP17]], align 4
-; Oz-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 16
-; Oz-NEXT:    [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>*
-; Oz-NEXT:    [[WIDE_LOAD_4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP19]], align 4
-; Oz-NEXT:    [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT]]
-; Oz-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 16
-; Oz-NEXT:    [[TMP22:%.*]] = bitcast i32* [[TMP21]] to <4 x i32>*
-; Oz-NEXT:    store <4 x i32> [[TMP20]], <4 x i32>* [[TMP22]], align 4
-; Oz-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 20
-; Oz-NEXT:    [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <4 x i32>*
-; Oz-NEXT:    [[WIDE_LOAD_5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP24]], align 4
-; Oz-NEXT:    [[TMP25:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT]]
-; Oz-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 20
-; Oz-NEXT:    [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>*
-; Oz-NEXT:    store <4 x i32> [[TMP25]], <4 x i32>* [[TMP27]], align 4
-; Oz-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 24
-; Oz-NEXT:    [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>*
-; Oz-NEXT:    [[WIDE_LOAD_6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP29]], align 4
-; Oz-NEXT:    [[TMP30:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT]]
-; Oz-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 24
-; Oz-NEXT:    [[TMP32:%.*]] = bitcast i32* [[TMP31]] to <4 x i32>*
-; Oz-NEXT:    store <4 x i32> [[TMP30]], <4 x i32>* [[TMP32]], align 4
-; Oz-NEXT:    [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 28
-; Oz-NEXT:    [[TMP34:%.*]] = bitcast i32* [[TMP33]] to <4 x i32>*
-; Oz-NEXT:    [[WIDE_LOAD_7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP34]], align 4
-; Oz-NEXT:    [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT]]
-; Oz-NEXT:    [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 28
-; Oz-NEXT:    [[TMP37:%.*]] = bitcast i32* [[TMP36]] to <4 x i32>*
-; Oz-NEXT:    store <4 x i32> [[TMP35]], <4 x i32>* [[TMP37]], align 4
-; Oz-NEXT:    [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 32
-; Oz-NEXT:    [[TMP39:%.*]] = bitcast i32* [[TMP38]] to <4 x i32>*
-; Oz-NEXT:    [[WIDE_LOAD_8:%.*]] = load <4 x i32>, <4 x i32>* [[TMP39]], align 4
-; Oz-NEXT:    [[TMP40:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT]]
-; Oz-NEXT:    [[TMP41:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 32
-; Oz-NEXT:    [[TMP42:%.*]] = bitcast i32* [[TMP41]] to <4 x i32>*
-; Oz-NEXT:    store <4 x i32> [[TMP40]], <4 x i32>* [[TMP42]], align 4
-; Oz-NEXT:    [[TMP43:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 36
-; Oz-NEXT:    [[TMP44:%.*]] = bitcast i32* [[TMP43]] to <4 x i32>*
-; Oz-NEXT:    [[WIDE_LOAD_9:%.*]] = load <4 x i32>, <4 x i32>* [[TMP44]], align 4
-; Oz-NEXT:    [[TMP45:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT]]
-; Oz-NEXT:    [[TMP46:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 36
-; Oz-NEXT:    [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <4 x i32>*
-; Oz-NEXT:    store <4 x i32> [[TMP45]], <4 x i32>* [[TMP47]], align 4
-; Oz-NEXT:    [[TMP48:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 40
-; Oz-NEXT:    [[TMP49:%.*]] = bitcast i32* [[TMP48]] to <4 x i32>*
-; Oz-NEXT:    [[WIDE_LOAD_10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP49]], align 4
-; Oz-NEXT:    [[TMP50:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT]]
-; Oz-NEXT:    [[TMP51:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 40
-; Oz-NEXT:    [[TMP52:%.*]] = bitcast i32* [[TMP51]] to <4 x i32>*
-; Oz-NEXT:    store <4 x i32> [[TMP50]], <4 x i32>* [[TMP52]], align 4
-; Oz-NEXT:    [[TMP53:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 44
-; Oz-NEXT:    [[TMP54:%.*]] = bitcast i32* [[TMP53]] to <4 x i32>*
-; Oz-NEXT:    [[WIDE_LOAD_11:%.*]] = load <4 x i32>, <4 x i32>* [[TMP54]], align 4
-; Oz-NEXT:    [[TMP55:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT]]
-; Oz-NEXT:    [[TMP56:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 44
-; Oz-NEXT:    [[TMP57:%.*]] = bitcast i32* [[TMP56]] to <4 x i32>*
-; Oz-NEXT:    store <4 x i32> [[TMP55]], <4 x i32>* [[TMP57]], align 4
-; Oz-NEXT:    [[TMP58:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 48
-; Oz-NEXT:    [[TMP59:%.*]] = bitcast i32* [[TMP58]] to <4 x i32>*
-; Oz-NEXT:    [[WIDE_LOAD_12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP59]], align 4
-; Oz-NEXT:    [[TMP60:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT]]
-; Oz-NEXT:    [[TMP61:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 48
-; Oz-NEXT:    [[TMP62:%.*]] = bitcast i32* [[TMP61]] to <4 x i32>*
-; Oz-NEXT:    store <4 x i32> [[TMP60]], <4 x i32>* [[TMP62]], align 4
-; Oz-NEXT:    [[TMP63:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 52
-; Oz-NEXT:    [[TMP64:%.*]] = bitcast i32* [[TMP63]] to <4 x i32>*
-; Oz-NEXT:    [[WIDE_LOAD_13:%.*]] = load <4 x i32>, <4 x i32>* [[TMP64]], align 4
-; Oz-NEXT:    [[TMP65:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT]]
-; Oz-NEXT:    [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 52
-; Oz-NEXT:    [[TMP67:%.*]] = bitcast i32* [[TMP66]] to <4 x i32>*
-; Oz-NEXT:    store <4 x i32> [[TMP65]], <4 x i32>* [[TMP67]], align 4
-; Oz-NEXT:    [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 56
-; Oz-NEXT:    [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>*
-; Oz-NEXT:    [[WIDE_LOAD_14:%.*]] = load <4 x i32>, <4 x i32>* [[TMP69]], align 4
-; Oz-NEXT:    [[TMP70:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT]]
-; Oz-NEXT:    [[TMP71:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 56
-; Oz-NEXT:    [[TMP72:%.*]] = bitcast i32* [[TMP71]] to <4 x i32>*
-; Oz-NEXT:    store <4 x i32> [[TMP70]], <4 x i32>* [[TMP72]], align 4
-; Oz-NEXT:    [[TMP73:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 60
-; Oz-NEXT:    [[TMP74:%.*]] = bitcast i32* [[TMP73]] to <4 x i32>*
-; Oz-NEXT:    [[WIDE_LOAD_15:%.*]] = load <4 x i32>, <4 x i32>* [[TMP74]], align 4
-; Oz-NEXT:    [[TMP75:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT]]
-; Oz-NEXT:    [[TMP76:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 60
-; Oz-NEXT:    [[TMP77:%.*]] = bitcast i32* [[TMP76]] to <4 x i32>*
-; Oz-NEXT:    store <4 x i32> [[TMP75]], <4 x i32>* [[TMP77]], align 4
-; Oz-NEXT:    [[TMP78:%.*]] = load i32, i32* [[A]], align 4
-; Oz-NEXT:    ret i32 [[TMP78]]
+; Oz-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[B:%.*]], align 4
+; Oz-NEXT:    [[TMP0:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
+; Oz-NEXT:    store <4 x i32> [[TMP0]], ptr [[A:%.*]], align 4
+; Oz-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 4
+; Oz-NEXT:    [[WIDE_LOAD_1:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4
+; Oz-NEXT:    [[TMP2:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT]]
+; Oz-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 4
+; Oz-NEXT:    store <4 x i32> [[TMP2]], ptr [[TMP3]], align 4
+; Oz-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 8
+; Oz-NEXT:    [[WIDE_LOAD_2:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4
+; Oz-NEXT:    [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT]]
+; Oz-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 8
+; Oz-NEXT:    store <4 x i32> [[TMP5]], ptr [[TMP6]], align 4
+; Oz-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 12
+; Oz-NEXT:    [[WIDE_LOAD_3:%.*]] = load <4 x i32>, ptr [[TMP7]], align 4
+; Oz-NEXT:    [[TMP8:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT]]
+; Oz-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 12
+; Oz-NEXT:    store <4 x i32> [[TMP8]], ptr [[TMP9]], align 4
+; Oz-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 16
+; Oz-NEXT:    [[WIDE_LOAD_4:%.*]] = load <4 x i32>, ptr [[TMP10]], align 4
+; Oz-NEXT:    [[TMP11:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT]]
+; Oz-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 16
+; Oz-NEXT:    store <4 x i32> [[TMP11]], ptr [[TMP12]], align 4
+; Oz-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 20
+; Oz-NEXT:    [[WIDE_LOAD_5:%.*]] = load <4 x i32>, ptr [[TMP13]], align 4
+; Oz-NEXT:    [[TMP14:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT]]
+; Oz-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 20
+; Oz-NEXT:    store <4 x i32> [[TMP14]], ptr [[TMP15]], align 4
+; Oz-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 24
+; Oz-NEXT:    [[WIDE_LOAD_6:%.*]] = load <4 x i32>, ptr [[TMP16]], align 4
+; Oz-NEXT:    [[TMP17:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT]]
+; Oz-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 24
+; Oz-NEXT:    store <4 x i32> [[TMP17]], ptr [[TMP18]], align 4
+; Oz-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 28
+; Oz-NEXT:    [[WIDE_LOAD_7:%.*]] = load <4 x i32>, ptr [[TMP19]], align 4
+; Oz-NEXT:    [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT]]
+; Oz-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 28
+; Oz-NEXT:    store <4 x i32> [[TMP20]], ptr [[TMP21]], align 4
+; Oz-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 32
+; Oz-NEXT:    [[WIDE_LOAD_8:%.*]] = load <4 x i32>, ptr [[TMP22]], align 4
+; Oz-NEXT:    [[TMP23:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT]]
+; Oz-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 32
+; Oz-NEXT:    store <4 x i32> [[TMP23]], ptr [[TMP24]], align 4
+; Oz-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 36
+; Oz-NEXT:    [[WIDE_LOAD_9:%.*]] = load <4 x i32>, ptr [[TMP25]], align 4
+; Oz-NEXT:    [[TMP26:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT]]
+; Oz-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 36
+; Oz-NEXT:    store <4 x i32> [[TMP26]], ptr [[TMP27]], align 4
+; Oz-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 40
+; Oz-NEXT:    [[WIDE_LOAD_10:%.*]] = load <4 x i32>, ptr [[TMP28]], align 4
+; Oz-NEXT:    [[TMP29:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT]]
+; Oz-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 40
+; Oz-NEXT:    store <4 x i32> [[TMP29]], ptr [[TMP30]], align 4
+; Oz-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 44
+; Oz-NEXT:    [[WIDE_LOAD_11:%.*]] = load <4 x i32>, ptr [[TMP31]], align 4
+; Oz-NEXT:    [[TMP32:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT]]
+; Oz-NEXT:    [[TMP33:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 44
+; Oz-NEXT:    store <4 x i32> [[TMP32]], ptr [[TMP33]], align 4
+; Oz-NEXT:    [[TMP34:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 48
+; Oz-NEXT:    [[WIDE_LOAD_12:%.*]] = load <4 x i32>, ptr [[TMP34]], align 4
+; Oz-NEXT:    [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT]]
+; Oz-NEXT:    [[TMP36:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 48
+; Oz-NEXT:    store <4 x i32> [[TMP35]], ptr [[TMP36]], align 4
+; Oz-NEXT:    [[TMP37:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 52
+; Oz-NEXT:    [[WIDE_LOAD_13:%.*]] = load <4 x i32>, ptr [[TMP37]], align 4
+; Oz-NEXT:    [[TMP38:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT]]
+; Oz-NEXT:    [[TMP39:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 52
+; Oz-NEXT:    store <4 x i32> [[TMP38]], ptr [[TMP39]], align 4
+; Oz-NEXT:    [[TMP40:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 56
+; Oz-NEXT:    [[WIDE_LOAD_14:%.*]] = load <4 x i32>, ptr [[TMP40]], align 4
+; Oz-NEXT:    [[TMP41:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT]]
+; Oz-NEXT:    [[TMP42:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 56
+; Oz-NEXT:    store <4 x i32> [[TMP41]], ptr [[TMP42]], align 4
+; Oz-NEXT:    [[TMP43:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 60
+; Oz-NEXT:    [[WIDE_LOAD_15:%.*]] = load <4 x i32>, ptr [[TMP43]], align 4
+; Oz-NEXT:    [[TMP44:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT]]
+; Oz-NEXT:    [[TMP45:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 60
+; Oz-NEXT:    store <4 x i32> [[TMP44]], ptr [[TMP45]], align 4
+; Oz-NEXT:    [[TMP46:%.*]] = load i32, ptr [[A]], align 4
+; Oz-NEXT:    ret i32 [[TMP46]]
 ;
 ; O1VEC2-LABEL: @enabled(
 ; O1VEC2-NEXT:  entry:
 ; O1VEC2-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[N:%.*]], i64 0
 ; O1VEC2-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; O1VEC2-NEXT:    [[TMP0:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>*
-; O1VEC2-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
-; O1VEC2-NEXT:    [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
-; O1VEC2-NEXT:    [[TMP2:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>*
-; O1VEC2-NEXT:    store <4 x i32> [[TMP1]], <4 x i32>* [[TMP2]], align 4
-; O1VEC2-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 4
-; O1VEC2-NEXT:    [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>*
-; O1VEC2-NEXT:    [[WIDE_LOAD_1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4
-; O1VEC2-NEXT:    [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT]]
-; O1VEC2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 4
-; O1VEC2-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
-; O1VEC2-NEXT:    store <4 x i32> [[TMP5]], <4 x i32>* [[TMP7]], align 4
-; O1VEC2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 8
-; O1VEC2-NEXT:    [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>*
-; O1VEC2-NEXT:    [[WIDE_LOAD_2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4
-; O1VEC2-NEXT:    [[TMP10:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT]]
-; O1VEC2-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 8
-; O1VEC2-NEXT:    [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>*
-; O1VEC2-NEXT:    store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4
-; O1VEC2-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 12
-; O1VEC2-NEXT:    [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <4 x i32>*
-; O1VEC2-NEXT:    [[WIDE_LOAD_3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP14]], align 4
-; O1VEC2-NEXT:    [[TMP15:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT]]
-; O1VEC2-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 12
-; O1VEC2-NEXT:    [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <4 x i32>*
-; O1VEC2-NEXT:    store <4 x i32> [[TMP15]], <4 x i32>* [[TMP17]], align 4
-; O1VEC2-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 16
-; O1VEC2-NEXT:    [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>*
-; O1VEC2-NEXT:    [[WIDE_LOAD_4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP19]], align 4
-; O1VEC2-NEXT:    [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT]]
-; O1VEC2-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 16
-; O1VEC2-NEXT:    [[TMP22:%.*]] = bitcast i32* [[TMP21]] to <4 x i32>*
-; O1VEC2-NEXT:    store <4 x i32> [[TMP20]], <4 x i32>* [[TMP22]], align 4
-; O1VEC2-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 20
-; O1VEC2-NEXT:    [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <4 x i32>*
-; O1VEC2-NEXT:    [[WIDE_LOAD_5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP24]], align 4
-; O1VEC2-NEXT:    [[TMP25:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT]]
-; O1VEC2-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 20
-; O1VEC2-NEXT:    [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>*
-; O1VEC2-NEXT:    store <4 x i32> [[TMP25]], <4 x i32>* [[TMP27]], align 4
-; O1VEC2-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 24
-; O1VEC2-NEXT:    [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>*
-; O1VEC2-NEXT:    [[WIDE_LOAD_6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP29]], align 4
-; O1VEC2-NEXT:    [[TMP30:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT]]
-; O1VEC2-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 24
-; O1VEC2-NEXT:    [[TMP32:%.*]] = bitcast i32* [[TMP31]] to <4 x i32>*
-; O1VEC2-NEXT:    store <4 x i32> [[TMP30]], <4 x i32>* [[TMP32]], align 4
-; O1VEC2-NEXT:    [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 28
-; O1VEC2-NEXT:    [[TMP34:%.*]] = bitcast i32* [[TMP33]] to <4 x i32>*
-; O1VEC2-NEXT:    [[WIDE_LOAD_7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP34]], align 4
-; O1VEC2-NEXT:    [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT]]
-; O1VEC2-NEXT:    [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 28
-; O1VEC2-NEXT:    [[TMP37:%.*]] = bitcast i32* [[TMP36]] to <4 x i32>*
-; O1VEC2-NEXT:    store <4 x i32> [[TMP35]], <4 x i32>* [[TMP37]], align 4
-; O1VEC2-NEXT:    [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 32
-; O1VEC2-NEXT:    [[TMP39:%.*]] = bitcast i32* [[TMP38]] to <4 x i32>*
-; O1VEC2-NEXT:    [[WIDE_LOAD_8:%.*]] = load <4 x i32>, <4 x i32>* [[TMP39]], align 4
-; O1VEC2-NEXT:    [[TMP40:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT]]
-; O1VEC2-NEXT:    [[TMP41:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 32
-; O1VEC2-NEXT:    [[TMP42:%.*]] = bitcast i32* [[TMP41]] to <4 x i32>*
-; O1VEC2-NEXT:    store <4 x i32> [[TMP40]], <4 x i32>* [[TMP42]], align 4
-; O1VEC2-NEXT:    [[TMP43:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 36
-; O1VEC2-NEXT:    [[TMP44:%.*]] = bitcast i32* [[TMP43]] to <4 x i32>*
-; O1VEC2-NEXT:    [[WIDE_LOAD_9:%.*]] = load <4 x i32>, <4 x i32>* [[TMP44]], align 4
-; O1VEC2-NEXT:    [[TMP45:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT]]
-; O1VEC2-NEXT:    [[TMP46:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 36
-; O1VEC2-NEXT:    [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <4 x i32>*
-; O1VEC2-NEXT:    store <4 x i32> [[TMP45]], <4 x i32>* [[TMP47]], align 4
-; O1VEC2-NEXT:    [[TMP48:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 40
-; O1VEC2-NEXT:    [[TMP49:%.*]] = bitcast i32* [[TMP48]] to <4 x i32>*
-; O1VEC2-NEXT:    [[WIDE_LOAD_10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP49]], align 4
-; O1VEC2-NEXT:    [[TMP50:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT]]
-; O1VEC2-NEXT:    [[TMP51:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 40
-; O1VEC2-NEXT:    [[TMP52:%.*]] = bitcast i32* [[TMP51]] to <4 x i32>*
-; O1VEC2-NEXT:    store <4 x i32> [[TMP50]], <4 x i32>* [[TMP52]], align 4
-; O1VEC2-NEXT:    [[TMP53:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 44
-; O1VEC2-NEXT:    [[TMP54:%.*]] = bitcast i32* [[TMP53]] to <4 x i32>*
-; O1VEC2-NEXT:    [[WIDE_LOAD_11:%.*]] = load <4 x i32>, <4 x i32>* [[TMP54]], align 4
-; O1VEC2-NEXT:    [[TMP55:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT]]
-; O1VEC2-NEXT:    [[TMP56:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 44
-; O1VEC2-NEXT:    [[TMP57:%.*]] = bitcast i32* [[TMP56]] to <4 x i32>*
-; O1VEC2-NEXT:    store <4 x i32> [[TMP55]], <4 x i32>* [[TMP57]], align 4
-; O1VEC2-NEXT:    [[TMP58:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 48
-; O1VEC2-NEXT:    [[TMP59:%.*]] = bitcast i32* [[TMP58]] to <4 x i32>*
-; O1VEC2-NEXT:    [[WIDE_LOAD_12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP59]], align 4
-; O1VEC2-NEXT:    [[TMP60:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT]]
-; O1VEC2-NEXT:    [[TMP61:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 48
-; O1VEC2-NEXT:    [[TMP62:%.*]] = bitcast i32* [[TMP61]] to <4 x i32>*
-; O1VEC2-NEXT:    store <4 x i32> [[TMP60]], <4 x i32>* [[TMP62]], align 4
-; O1VEC2-NEXT:    [[TMP63:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 52
-; O1VEC2-NEXT:    [[TMP64:%.*]] = bitcast i32* [[TMP63]] to <4 x i32>*
-; O1VEC2-NEXT:    [[WIDE_LOAD_13:%.*]] = load <4 x i32>, <4 x i32>* [[TMP64]], align 4
-; O1VEC2-NEXT:    [[TMP65:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT]]
-; O1VEC2-NEXT:    [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 52
-; O1VEC2-NEXT:    [[TMP67:%.*]] = bitcast i32* [[TMP66]] to <4 x i32>*
-; O1VEC2-NEXT:    store <4 x i32> [[TMP65]], <4 x i32>* [[TMP67]], align 4
-; O1VEC2-NEXT:    [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 56
-; O1VEC2-NEXT:    [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>*
-; O1VEC2-NEXT:    [[WIDE_LOAD_14:%.*]] = load <4 x i32>, <4 x i32>* [[TMP69]], align 4
-; O1VEC2-NEXT:    [[TMP70:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT]]
-; O1VEC2-NEXT:    [[TMP71:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 56
-; O1VEC2-NEXT:    [[TMP72:%.*]] = bitcast i32* [[TMP71]] to <4 x i32>*
-; O1VEC2-NEXT:    store <4 x i32> [[TMP70]], <4 x i32>* [[TMP72]], align 4
-; O1VEC2-NEXT:    [[TMP73:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 60
-; O1VEC2-NEXT:    [[TMP74:%.*]] = bitcast i32* [[TMP73]] to <4 x i32>*
-; O1VEC2-NEXT:    [[WIDE_LOAD_15:%.*]] = load <4 x i32>, <4 x i32>* [[TMP74]], align 4
-; O1VEC2-NEXT:    [[TMP75:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT]]
-; O1VEC2-NEXT:    [[TMP76:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 60
-; O1VEC2-NEXT:    [[TMP77:%.*]] = bitcast i32* [[TMP76]] to <4 x i32>*
-; O1VEC2-NEXT:    store <4 x i32> [[TMP75]], <4 x i32>* [[TMP77]], align 4
-; O1VEC2-NEXT:    [[TMP78:%.*]] = load i32, i32* [[A]], align 4
-; O1VEC2-NEXT:    ret i32 [[TMP78]]
+; O1VEC2-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[B:%.*]], align 4
+; O1VEC2-NEXT:    [[TMP0:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
+; O1VEC2-NEXT:    store <4 x i32> [[TMP0]], ptr [[A:%.*]], align 4
+; O1VEC2-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 4
+; O1VEC2-NEXT:    [[WIDE_LOAD_1:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4
+; O1VEC2-NEXT:    [[TMP2:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT]]
+; O1VEC2-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 4
+; O1VEC2-NEXT:    store <4 x i32> [[TMP2]], ptr [[TMP3]], align 4
+; O1VEC2-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 8
+; O1VEC2-NEXT:    [[WIDE_LOAD_2:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4
+; O1VEC2-NEXT:    [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT]]
+; O1VEC2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 8
+; O1VEC2-NEXT:    store <4 x i32> [[TMP5]], ptr [[TMP6]], align 4
+; O1VEC2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 12
+; O1VEC2-NEXT:    [[WIDE_LOAD_3:%.*]] = load <4 x i32>, ptr [[TMP7]], align 4
+; O1VEC2-NEXT:    [[TMP8:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT]]
+; O1VEC2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 12
+; O1VEC2-NEXT:    store <4 x i32> [[TMP8]], ptr [[TMP9]], align 4
+; O1VEC2-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 16
+; O1VEC2-NEXT:    [[WIDE_LOAD_4:%.*]] = load <4 x i32>, ptr [[TMP10]], align 4
+; O1VEC2-NEXT:    [[TMP11:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT]]
+; O1VEC2-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 16
+; O1VEC2-NEXT:    store <4 x i32> [[TMP11]], ptr [[TMP12]], align 4
+; O1VEC2-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 20
+; O1VEC2-NEXT:    [[WIDE_LOAD_5:%.*]] = load <4 x i32>, ptr [[TMP13]], align 4
+; O1VEC2-NEXT:    [[TMP14:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT]]
+; O1VEC2-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 20
+; O1VEC2-NEXT:    store <4 x i32> [[TMP14]], ptr [[TMP15]], align 4
+; O1VEC2-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 24
+; O1VEC2-NEXT:    [[WIDE_LOAD_6:%.*]] = load <4 x i32>, ptr [[TMP16]], align 4
+; O1VEC2-NEXT:    [[TMP17:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT]]
+; O1VEC2-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 24
+; O1VEC2-NEXT:    store <4 x i32> [[TMP17]], ptr [[TMP18]], align 4
+; O1VEC2-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 28
+; O1VEC2-NEXT:    [[WIDE_LOAD_7:%.*]] = load <4 x i32>, ptr [[TMP19]], align 4
+; O1VEC2-NEXT:    [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT]]
+; O1VEC2-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 28
+; O1VEC2-NEXT:    store <4 x i32> [[TMP20]], ptr [[TMP21]], align 4
+; O1VEC2-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 32
+; O1VEC2-NEXT:    [[WIDE_LOAD_8:%.*]] = load <4 x i32>, ptr [[TMP22]], align 4
+; O1VEC2-NEXT:    [[TMP23:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT]]
+; O1VEC2-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 32
+; O1VEC2-NEXT:    store <4 x i32> [[TMP23]], ptr [[TMP24]], align 4
+; O1VEC2-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 36
+; O1VEC2-NEXT:    [[WIDE_LOAD_9:%.*]] = load <4 x i32>, ptr [[TMP25]], align 4
+; O1VEC2-NEXT:    [[TMP26:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT]]
+; O1VEC2-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 36
+; O1VEC2-NEXT:    store <4 x i32> [[TMP26]], ptr [[TMP27]], align 4
+; O1VEC2-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 40
+; O1VEC2-NEXT:    [[WIDE_LOAD_10:%.*]] = load <4 x i32>, ptr [[TMP28]], align 4
+; O1VEC2-NEXT:    [[TMP29:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT]]
+; O1VEC2-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 40
+; O1VEC2-NEXT:    store <4 x i32> [[TMP29]], ptr [[TMP30]], align 4
+; O1VEC2-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 44
+; O1VEC2-NEXT:    [[WIDE_LOAD_11:%.*]] = load <4 x i32>, ptr [[TMP31]], align 4
+; O1VEC2-NEXT:    [[TMP32:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT]]
+; O1VEC2-NEXT:    [[TMP33:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 44
+; O1VEC2-NEXT:    store <4 x i32> [[TMP32]], ptr [[TMP33]], align 4
+; O1VEC2-NEXT:    [[TMP34:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 48
+; O1VEC2-NEXT:    [[WIDE_LOAD_12:%.*]] = load <4 x i32>, ptr [[TMP34]], align 4
+; O1VEC2-NEXT:    [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT]]
+; O1VEC2-NEXT:    [[TMP36:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 48
+; O1VEC2-NEXT:    store <4 x i32> [[TMP35]], ptr [[TMP36]], align 4
+; O1VEC2-NEXT:    [[TMP37:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 52
+; O1VEC2-NEXT:    [[WIDE_LOAD_13:%.*]] = load <4 x i32>, ptr [[TMP37]], align 4
+; O1VEC2-NEXT:    [[TMP38:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT]]
+; O1VEC2-NEXT:    [[TMP39:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 52
+; O1VEC2-NEXT:    store <4 x i32> [[TMP38]], ptr [[TMP39]], align 4
+; O1VEC2-NEXT:    [[TMP40:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 56
+; O1VEC2-NEXT:    [[WIDE_LOAD_14:%.*]] = load <4 x i32>, ptr [[TMP40]], align 4
+; O1VEC2-NEXT:    [[TMP41:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT]]
+; O1VEC2-NEXT:    [[TMP42:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 56
+; O1VEC2-NEXT:    store <4 x i32> [[TMP41]], ptr [[TMP42]], align 4
+; O1VEC2-NEXT:    [[TMP43:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 60
+; O1VEC2-NEXT:    [[WIDE_LOAD_15:%.*]] = load <4 x i32>, ptr [[TMP43]], align 4
+; O1VEC2-NEXT:    [[TMP44:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT]]
+; O1VEC2-NEXT:    [[TMP45:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 60
+; O1VEC2-NEXT:    store <4 x i32> [[TMP44]], ptr [[TMP45]], align 4
+; O1VEC2-NEXT:    [[TMP46:%.*]] = load i32, ptr [[A]], align 4
+; O1VEC2-NEXT:    ret i32 [[TMP46]]
 ;
 ; OzVEC2-LABEL: @enabled(
 ; OzVEC2-NEXT:  entry:
 ; OzVEC2-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[N:%.*]], i64 0
 ; OzVEC2-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; OzVEC2-NEXT:    [[TMP0:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>*
-; OzVEC2-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
-; OzVEC2-NEXT:    [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
-; OzVEC2-NEXT:    [[TMP2:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>*
-; OzVEC2-NEXT:    store <4 x i32> [[TMP1]], <4 x i32>* [[TMP2]], align 4
-; OzVEC2-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 4
-; OzVEC2-NEXT:    [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>*
-; OzVEC2-NEXT:    [[WIDE_LOAD_1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4
-; OzVEC2-NEXT:    [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT]]
-; OzVEC2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 4
-; OzVEC2-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
-; OzVEC2-NEXT:    store <4 x i32> [[TMP5]], <4 x i32>* [[TMP7]], align 4
-; OzVEC2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 8
-; OzVEC2-NEXT:    [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>*
-; OzVEC2-NEXT:    [[WIDE_LOAD_2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4
-; OzVEC2-NEXT:    [[TMP10:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT]]
-; OzVEC2-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 8
-; OzVEC2-NEXT:    [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>*
-; OzVEC2-NEXT:    store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4
-; OzVEC2-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 12
-; OzVEC2-NEXT:    [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <4 x i32>*
-; OzVEC2-NEXT:    [[WIDE_LOAD_3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP14]], align 4
-; OzVEC2-NEXT:    [[TMP15:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT]]
-; OzVEC2-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 12
-; OzVEC2-NEXT:    [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <4 x i32>*
-; OzVEC2-NEXT:    store <4 x i32> [[TMP15]], <4 x i32>* [[TMP17]], align 4
-; OzVEC2-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 16
-; OzVEC2-NEXT:    [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>*
-; OzVEC2-NEXT:    [[WIDE_LOAD_4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP19]], align 4
-; OzVEC2-NEXT:    [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT]]
-; OzVEC2-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 16
-; OzVEC2-NEXT:    [[TMP22:%.*]] = bitcast i32* [[TMP21]] to <4 x i32>*
-; OzVEC2-NEXT:    store <4 x i32> [[TMP20]], <4 x i32>* [[TMP22]], align 4
-; OzVEC2-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 20
-; OzVEC2-NEXT:    [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <4 x i32>*
-; OzVEC2-NEXT:    [[WIDE_LOAD_5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP24]], align 4
-; OzVEC2-NEXT:    [[TMP25:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT]]
-; OzVEC2-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 20
-; OzVEC2-NEXT:    [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>*
-; OzVEC2-NEXT:    store <4 x i32> [[TMP25]], <4 x i32>* [[TMP27]], align 4
-; OzVEC2-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 24
-; OzVEC2-NEXT:    [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>*
-; OzVEC2-NEXT:    [[WIDE_LOAD_6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP29]], align 4
-; OzVEC2-NEXT:    [[TMP30:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT]]
-; OzVEC2-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 24
-; OzVEC2-NEXT:    [[TMP32:%.*]] = bitcast i32* [[TMP31]] to <4 x i32>*
-; OzVEC2-NEXT:    store <4 x i32> [[TMP30]], <4 x i32>* [[TMP32]], align 4
-; OzVEC2-NEXT:    [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 28
-; OzVEC2-NEXT:    [[TMP34:%.*]] = bitcast i32* [[TMP33]] to <4 x i32>*
-; OzVEC2-NEXT:    [[WIDE_LOAD_7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP34]], align 4
-; OzVEC2-NEXT:    [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT]]
-; OzVEC2-NEXT:    [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 28
-; OzVEC2-NEXT:    [[TMP37:%.*]] = bitcast i32* [[TMP36]] to <4 x i32>*
-; OzVEC2-NEXT:    store <4 x i32> [[TMP35]], <4 x i32>* [[TMP37]], align 4
-; OzVEC2-NEXT:    [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 32
-; OzVEC2-NEXT:    [[TMP39:%.*]] = bitcast i32* [[TMP38]] to <4 x i32>*
-; OzVEC2-NEXT:    [[WIDE_LOAD_8:%.*]] = load <4 x i32>, <4 x i32>* [[TMP39]], align 4
-; OzVEC2-NEXT:    [[TMP40:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT]]
-; OzVEC2-NEXT:    [[TMP41:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 32
-; OzVEC2-NEXT:    [[TMP42:%.*]] = bitcast i32* [[TMP41]] to <4 x i32>*
-; OzVEC2-NEXT:    store <4 x i32> [[TMP40]], <4 x i32>* [[TMP42]], align 4
-; OzVEC2-NEXT:    [[TMP43:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 36
-; OzVEC2-NEXT:    [[TMP44:%.*]] = bitcast i32* [[TMP43]] to <4 x i32>*
-; OzVEC2-NEXT:    [[WIDE_LOAD_9:%.*]] = load <4 x i32>, <4 x i32>* [[TMP44]], align 4
-; OzVEC2-NEXT:    [[TMP45:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT]]
-; OzVEC2-NEXT:    [[TMP46:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 36
-; OzVEC2-NEXT:    [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <4 x i32>*
-; OzVEC2-NEXT:    store <4 x i32> [[TMP45]], <4 x i32>* [[TMP47]], align 4
-; OzVEC2-NEXT:    [[TMP48:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 40
-; OzVEC2-NEXT:    [[TMP49:%.*]] = bitcast i32* [[TMP48]] to <4 x i32>*
-; OzVEC2-NEXT:    [[WIDE_LOAD_10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP49]], align 4
-; OzVEC2-NEXT:    [[TMP50:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT]]
-; OzVEC2-NEXT:    [[TMP51:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 40
-; OzVEC2-NEXT:    [[TMP52:%.*]] = bitcast i32* [[TMP51]] to <4 x i32>*
-; OzVEC2-NEXT:    store <4 x i32> [[TMP50]], <4 x i32>* [[TMP52]], align 4
-; OzVEC2-NEXT:    [[TMP53:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 44
-; OzVEC2-NEXT:    [[TMP54:%.*]] = bitcast i32* [[TMP53]] to <4 x i32>*
-; OzVEC2-NEXT:    [[WIDE_LOAD_11:%.*]] = load <4 x i32>, <4 x i32>* [[TMP54]], align 4
-; OzVEC2-NEXT:    [[TMP55:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT]]
-; OzVEC2-NEXT:    [[TMP56:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 44
-; OzVEC2-NEXT:    [[TMP57:%.*]] = bitcast i32* [[TMP56]] to <4 x i32>*
-; OzVEC2-NEXT:    store <4 x i32> [[TMP55]], <4 x i32>* [[TMP57]], align 4
-; OzVEC2-NEXT:    [[TMP58:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 48
-; OzVEC2-NEXT:    [[TMP59:%.*]] = bitcast i32* [[TMP58]] to <4 x i32>*
-; OzVEC2-NEXT:    [[WIDE_LOAD_12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP59]], align 4
-; OzVEC2-NEXT:    [[TMP60:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT]]
-; OzVEC2-NEXT:    [[TMP61:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 48
-; OzVEC2-NEXT:    [[TMP62:%.*]] = bitcast i32* [[TMP61]] to <4 x i32>*
-; OzVEC2-NEXT:    store <4 x i32> [[TMP60]], <4 x i32>* [[TMP62]], align 4
-; OzVEC2-NEXT:    [[TMP63:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 52
-; OzVEC2-NEXT:    [[TMP64:%.*]] = bitcast i32* [[TMP63]] to <4 x i32>*
-; OzVEC2-NEXT:    [[WIDE_LOAD_13:%.*]] = load <4 x i32>, <4 x i32>* [[TMP64]], align 4
-; OzVEC2-NEXT:    [[TMP65:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT]]
-; OzVEC2-NEXT:    [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 52
-; OzVEC2-NEXT:    [[TMP67:%.*]] = bitcast i32* [[TMP66]] to <4 x i32>*
-; OzVEC2-NEXT:    store <4 x i32> [[TMP65]], <4 x i32>* [[TMP67]], align 4
-; OzVEC2-NEXT:    [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 56
-; OzVEC2-NEXT:    [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>*
-; OzVEC2-NEXT:    [[WIDE_LOAD_14:%.*]] = load <4 x i32>, <4 x i32>* [[TMP69]], align 4
-; OzVEC2-NEXT:    [[TMP70:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT]]
-; OzVEC2-NEXT:    [[TMP71:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 56
-; OzVEC2-NEXT:    [[TMP72:%.*]] = bitcast i32* [[TMP71]] to <4 x i32>*
-; OzVEC2-NEXT:    store <4 x i32> [[TMP70]], <4 x i32>* [[TMP72]], align 4
-; OzVEC2-NEXT:    [[TMP73:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 60
-; OzVEC2-NEXT:    [[TMP74:%.*]] = bitcast i32* [[TMP73]] to <4 x i32>*
-; OzVEC2-NEXT:    [[WIDE_LOAD_15:%.*]] = load <4 x i32>, <4 x i32>* [[TMP74]], align 4
-; OzVEC2-NEXT:    [[TMP75:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT]]
-; OzVEC2-NEXT:    [[TMP76:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 60
-; OzVEC2-NEXT:    [[TMP77:%.*]] = bitcast i32* [[TMP76]] to <4 x i32>*
-; OzVEC2-NEXT:    store <4 x i32> [[TMP75]], <4 x i32>* [[TMP77]], align 4
-; OzVEC2-NEXT:    [[TMP78:%.*]] = load i32, i32* [[A]], align 4
-; OzVEC2-NEXT:    ret i32 [[TMP78]]
+; OzVEC2-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[B:%.*]], align 4
+; OzVEC2-NEXT:    [[TMP0:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
+; OzVEC2-NEXT:    store <4 x i32> [[TMP0]], ptr [[A:%.*]], align 4
+; OzVEC2-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 4
+; OzVEC2-NEXT:    [[WIDE_LOAD_1:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4
+; OzVEC2-NEXT:    [[TMP2:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT]]
+; OzVEC2-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 4
+; OzVEC2-NEXT:    store <4 x i32> [[TMP2]], ptr [[TMP3]], align 4
+; OzVEC2-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 8
+; OzVEC2-NEXT:    [[WIDE_LOAD_2:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4
+; OzVEC2-NEXT:    [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT]]
+; OzVEC2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 8
+; OzVEC2-NEXT:    store <4 x i32> [[TMP5]], ptr [[TMP6]], align 4
+; OzVEC2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 12
+; OzVEC2-NEXT:    [[WIDE_LOAD_3:%.*]] = load <4 x i32>, ptr [[TMP7]], align 4
+; OzVEC2-NEXT:    [[TMP8:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT]]
+; OzVEC2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 12
+; OzVEC2-NEXT:    store <4 x i32> [[TMP8]], ptr [[TMP9]], align 4
+; OzVEC2-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 16
+; OzVEC2-NEXT:    [[WIDE_LOAD_4:%.*]] = load <4 x i32>, ptr [[TMP10]], align 4
+; OzVEC2-NEXT:    [[TMP11:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT]]
+; OzVEC2-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 16
+; OzVEC2-NEXT:    store <4 x i32> [[TMP11]], ptr [[TMP12]], align 4
+; OzVEC2-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 20
+; OzVEC2-NEXT:    [[WIDE_LOAD_5:%.*]] = load <4 x i32>, ptr [[TMP13]], align 4
+; OzVEC2-NEXT:    [[TMP14:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT]]
+; OzVEC2-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 20
+; OzVEC2-NEXT:    store <4 x i32> [[TMP14]], ptr [[TMP15]], align 4
+; OzVEC2-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 24
+; OzVEC2-NEXT:    [[WIDE_LOAD_6:%.*]] = load <4 x i32>, ptr [[TMP16]], align 4
+; OzVEC2-NEXT:    [[TMP17:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT]]
+; OzVEC2-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 24
+; OzVEC2-NEXT:    store <4 x i32> [[TMP17]], ptr [[TMP18]], align 4
+; OzVEC2-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 28
+; OzVEC2-NEXT:    [[WIDE_LOAD_7:%.*]] = load <4 x i32>, ptr [[TMP19]], align 4
+; OzVEC2-NEXT:    [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT]]
+; OzVEC2-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 28
+; OzVEC2-NEXT:    store <4 x i32> [[TMP20]], ptr [[TMP21]], align 4
+; OzVEC2-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 32
+; OzVEC2-NEXT:    [[WIDE_LOAD_8:%.*]] = load <4 x i32>, ptr [[TMP22]], align 4
+; OzVEC2-NEXT:    [[TMP23:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT]]
+; OzVEC2-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 32
+; OzVEC2-NEXT:    store <4 x i32> [[TMP23]], ptr [[TMP24]], align 4
+; OzVEC2-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 36
+; OzVEC2-NEXT:    [[WIDE_LOAD_9:%.*]] = load <4 x i32>, ptr [[TMP25]], align 4
+; OzVEC2-NEXT:    [[TMP26:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT]]
+; OzVEC2-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 36
+; OzVEC2-NEXT:    store <4 x i32> [[TMP26]], ptr [[TMP27]], align 4
+; OzVEC2-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 40
+; OzVEC2-NEXT:    [[WIDE_LOAD_10:%.*]] = load <4 x i32>, ptr [[TMP28]], align 4
+; OzVEC2-NEXT:    [[TMP29:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT]]
+; OzVEC2-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 40
+; OzVEC2-NEXT:    store <4 x i32> [[TMP29]], ptr [[TMP30]], align 4
+; OzVEC2-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 44
+; OzVEC2-NEXT:    [[WIDE_LOAD_11:%.*]] = load <4 x i32>, ptr [[TMP31]], align 4
+; OzVEC2-NEXT:    [[TMP32:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT]]
+; OzVEC2-NEXT:    [[TMP33:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 44
+; OzVEC2-NEXT:    store <4 x i32> [[TMP32]], ptr [[TMP33]], align 4
+; OzVEC2-NEXT:    [[TMP34:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 48
+; OzVEC2-NEXT:    [[WIDE_LOAD_12:%.*]] = load <4 x i32>, ptr [[TMP34]], align 4
+; OzVEC2-NEXT:    [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT]]
+; OzVEC2-NEXT:    [[TMP36:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 48
+; OzVEC2-NEXT:    store <4 x i32> [[TMP35]], ptr [[TMP36]], align 4
+; OzVEC2-NEXT:    [[TMP37:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 52
+; OzVEC2-NEXT:    [[WIDE_LOAD_13:%.*]] = load <4 x i32>, ptr [[TMP37]], align 4
+; OzVEC2-NEXT:    [[TMP38:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT]]
+; OzVEC2-NEXT:    [[TMP39:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 52
+; OzVEC2-NEXT:    store <4 x i32> [[TMP38]], ptr [[TMP39]], align 4
+; OzVEC2-NEXT:    [[TMP40:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 56
+; OzVEC2-NEXT:    [[WIDE_LOAD_14:%.*]] = load <4 x i32>, ptr [[TMP40]], align 4
+; OzVEC2-NEXT:    [[TMP41:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT]]
+; OzVEC2-NEXT:    [[TMP42:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 56
+; OzVEC2-NEXT:    store <4 x i32> [[TMP41]], ptr [[TMP42]], align 4
+; OzVEC2-NEXT:    [[TMP43:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 60
+; OzVEC2-NEXT:    [[WIDE_LOAD_15:%.*]] = load <4 x i32>, ptr [[TMP43]], align 4
+; OzVEC2-NEXT:    [[TMP44:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT]]
+; OzVEC2-NEXT:    [[TMP45:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 60
+; OzVEC2-NEXT:    store <4 x i32> [[TMP44]], ptr [[TMP45]], align 4
+; OzVEC2-NEXT:    [[TMP46:%.*]] = load i32, ptr [[A]], align 4
+; OzVEC2-NEXT:    ret i32 [[TMP46]]
 ;
 ; O3DIS-LABEL: @enabled(
 ; O3DIS-NEXT:  entry:
 ; O3DIS-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[N:%.*]], i64 0
 ; O3DIS-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; O3DIS-NEXT:    [[TMP0:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>*
-; O3DIS-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
-; O3DIS-NEXT:    [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
-; O3DIS-NEXT:    [[TMP2:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>*
-; O3DIS-NEXT:    store <4 x i32> [[TMP1]], <4 x i32>* [[TMP2]], align 4
-; O3DIS-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 4
-; O3DIS-NEXT:    [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>*
-; O3DIS-NEXT:    [[WIDE_LOAD_1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4
-; O3DIS-NEXT:    [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT]]
-; O3DIS-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 4
-; O3DIS-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
-; O3DIS-NEXT:    store <4 x i32> [[TMP5]], <4 x i32>* [[TMP7]], align 4
-; O3DIS-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 8
-; O3DIS-NEXT:    [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>*
-; O3DIS-NEXT:    [[WIDE_LOAD_2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4
-; O3DIS-NEXT:    [[TMP10:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT]]
-; O3DIS-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 8
-; O3DIS-NEXT:    [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>*
-; O3DIS-NEXT:    store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4
-; O3DIS-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 12
-; O3DIS-NEXT:    [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <4 x i32>*
-; O3DIS-NEXT:    [[WIDE_LOAD_3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP14]], align 4
-; O3DIS-NEXT:    [[TMP15:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT]]
-; O3DIS-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 12
-; O3DIS-NEXT:    [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <4 x i32>*
-; O3DIS-NEXT:    store <4 x i32> [[TMP15]], <4 x i32>* [[TMP17]], align 4
-; O3DIS-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 16
-; O3DIS-NEXT:    [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>*
-; O3DIS-NEXT:    [[WIDE_LOAD_4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP19]], align 4
-; O3DIS-NEXT:    [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT]]
-; O3DIS-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 16
-; O3DIS-NEXT:    [[TMP22:%.*]] = bitcast i32* [[TMP21]] to <4 x i32>*
-; O3DIS-NEXT:    store <4 x i32> [[TMP20]], <4 x i32>* [[TMP22]], align 4
-; O3DIS-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 20
-; O3DIS-NEXT:    [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <4 x i32>*
-; O3DIS-NEXT:    [[WIDE_LOAD_5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP24]], align 4
-; O3DIS-NEXT:    [[TMP25:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT]]
-; O3DIS-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 20
-; O3DIS-NEXT:    [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>*
-; O3DIS-NEXT:    store <4 x i32> [[TMP25]], <4 x i32>* [[TMP27]], align 4
-; O3DIS-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 24
-; O3DIS-NEXT:    [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>*
-; O3DIS-NEXT:    [[WIDE_LOAD_6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP29]], align 4
-; O3DIS-NEXT:    [[TMP30:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT]]
-; O3DIS-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 24
-; O3DIS-NEXT:    [[TMP32:%.*]] = bitcast i32* [[TMP31]] to <4 x i32>*
-; O3DIS-NEXT:    store <4 x i32> [[TMP30]], <4 x i32>* [[TMP32]], align 4
-; O3DIS-NEXT:    [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 28
-; O3DIS-NEXT:    [[TMP34:%.*]] = bitcast i32* [[TMP33]] to <4 x i32>*
-; O3DIS-NEXT:    [[WIDE_LOAD_7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP34]], align 4
-; O3DIS-NEXT:    [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT]]
-; O3DIS-NEXT:    [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 28
-; O3DIS-NEXT:    [[TMP37:%.*]] = bitcast i32* [[TMP36]] to <4 x i32>*
-; O3DIS-NEXT:    store <4 x i32> [[TMP35]], <4 x i32>* [[TMP37]], align 4
-; O3DIS-NEXT:    [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 32
-; O3DIS-NEXT:    [[TMP39:%.*]] = bitcast i32* [[TMP38]] to <4 x i32>*
-; O3DIS-NEXT:    [[WIDE_LOAD_8:%.*]] = load <4 x i32>, <4 x i32>* [[TMP39]], align 4
-; O3DIS-NEXT:    [[TMP40:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT]]
-; O3DIS-NEXT:    [[TMP41:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 32
-; O3DIS-NEXT:    [[TMP42:%.*]] = bitcast i32* [[TMP41]] to <4 x i32>*
-; O3DIS-NEXT:    store <4 x i32> [[TMP40]], <4 x i32>* [[TMP42]], align 4
-; O3DIS-NEXT:    [[TMP43:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 36
-; O3DIS-NEXT:    [[TMP44:%.*]] = bitcast i32* [[TMP43]] to <4 x i32>*
-; O3DIS-NEXT:    [[WIDE_LOAD_9:%.*]] = load <4 x i32>, <4 x i32>* [[TMP44]], align 4
-; O3DIS-NEXT:    [[TMP45:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT]]
-; O3DIS-NEXT:    [[TMP46:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 36
-; O3DIS-NEXT:    [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <4 x i32>*
-; O3DIS-NEXT:    store <4 x i32> [[TMP45]], <4 x i32>* [[TMP47]], align 4
-; O3DIS-NEXT:    [[TMP48:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 40
-; O3DIS-NEXT:    [[TMP49:%.*]] = bitcast i32* [[TMP48]] to <4 x i32>*
-; O3DIS-NEXT:    [[WIDE_LOAD_10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP49]], align 4
-; O3DIS-NEXT:    [[TMP50:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT]]
-; O3DIS-NEXT:    [[TMP51:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 40
-; O3DIS-NEXT:    [[TMP52:%.*]] = bitcast i32* [[TMP51]] to <4 x i32>*
-; O3DIS-NEXT:    store <4 x i32> [[TMP50]], <4 x i32>* [[TMP52]], align 4
-; O3DIS-NEXT:    [[TMP53:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 44
-; O3DIS-NEXT:    [[TMP54:%.*]] = bitcast i32* [[TMP53]] to <4 x i32>*
-; O3DIS-NEXT:    [[WIDE_LOAD_11:%.*]] = load <4 x i32>, <4 x i32>* [[TMP54]], align 4
-; O3DIS-NEXT:    [[TMP55:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT]]
-; O3DIS-NEXT:    [[TMP56:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 44
-; O3DIS-NEXT:    [[TMP57:%.*]] = bitcast i32* [[TMP56]] to <4 x i32>*
-; O3DIS-NEXT:    store <4 x i32> [[TMP55]], <4 x i32>* [[TMP57]], align 4
-; O3DIS-NEXT:    [[TMP58:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 48
-; O3DIS-NEXT:    [[TMP59:%.*]] = bitcast i32* [[TMP58]] to <4 x i32>*
-; O3DIS-NEXT:    [[WIDE_LOAD_12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP59]], align 4
-; O3DIS-NEXT:    [[TMP60:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT]]
-; O3DIS-NEXT:    [[TMP61:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 48
-; O3DIS-NEXT:    [[TMP62:%.*]] = bitcast i32* [[TMP61]] to <4 x i32>*
-; O3DIS-NEXT:    store <4 x i32> [[TMP60]], <4 x i32>* [[TMP62]], align 4
-; O3DIS-NEXT:    [[TMP63:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 52
-; O3DIS-NEXT:    [[TMP64:%.*]] = bitcast i32* [[TMP63]] to <4 x i32>*
-; O3DIS-NEXT:    [[WIDE_LOAD_13:%.*]] = load <4 x i32>, <4 x i32>* [[TMP64]], align 4
-; O3DIS-NEXT:    [[TMP65:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT]]
-; O3DIS-NEXT:    [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 52
-; O3DIS-NEXT:    [[TMP67:%.*]] = bitcast i32* [[TMP66]] to <4 x i32>*
-; O3DIS-NEXT:    store <4 x i32> [[TMP65]], <4 x i32>* [[TMP67]], align 4
-; O3DIS-NEXT:    [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 56
-; O3DIS-NEXT:    [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>*
-; O3DIS-NEXT:    [[WIDE_LOAD_14:%.*]] = load <4 x i32>, <4 x i32>* [[TMP69]], align 4
-; O3DIS-NEXT:    [[TMP70:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT]]
-; O3DIS-NEXT:    [[TMP71:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 56
-; O3DIS-NEXT:    [[TMP72:%.*]] = bitcast i32* [[TMP71]] to <4 x i32>*
-; O3DIS-NEXT:    store <4 x i32> [[TMP70]], <4 x i32>* [[TMP72]], align 4
-; O3DIS-NEXT:    [[TMP73:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 60
-; O3DIS-NEXT:    [[TMP74:%.*]] = bitcast i32* [[TMP73]] to <4 x i32>*
-; O3DIS-NEXT:    [[WIDE_LOAD_15:%.*]] = load <4 x i32>, <4 x i32>* [[TMP74]], align 4
-; O3DIS-NEXT:    [[TMP75:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT]]
-; O3DIS-NEXT:    [[TMP76:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 60
-; O3DIS-NEXT:    [[TMP77:%.*]] = bitcast i32* [[TMP76]] to <4 x i32>*
-; O3DIS-NEXT:    store <4 x i32> [[TMP75]], <4 x i32>* [[TMP77]], align 4
-; O3DIS-NEXT:    [[TMP78:%.*]] = load i32, i32* [[A]], align 4
-; O3DIS-NEXT:    ret i32 [[TMP78]]
+; O3DIS-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[B:%.*]], align 4
+; O3DIS-NEXT:    [[TMP0:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
+; O3DIS-NEXT:    store <4 x i32> [[TMP0]], ptr [[A:%.*]], align 4
+; O3DIS-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 4
+; O3DIS-NEXT:    [[WIDE_LOAD_1:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4
+; O3DIS-NEXT:    [[TMP2:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT]]
+; O3DIS-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 4
+; O3DIS-NEXT:    store <4 x i32> [[TMP2]], ptr [[TMP3]], align 4
+; O3DIS-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 8
+; O3DIS-NEXT:    [[WIDE_LOAD_2:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4
+; O3DIS-NEXT:    [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT]]
+; O3DIS-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 8
+; O3DIS-NEXT:    store <4 x i32> [[TMP5]], ptr [[TMP6]], align 4
+; O3DIS-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 12
+; O3DIS-NEXT:    [[WIDE_LOAD_3:%.*]] = load <4 x i32>, ptr [[TMP7]], align 4
+; O3DIS-NEXT:    [[TMP8:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT]]
+; O3DIS-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 12
+; O3DIS-NEXT:    store <4 x i32> [[TMP8]], ptr [[TMP9]], align 4
+; O3DIS-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 16
+; O3DIS-NEXT:    [[WIDE_LOAD_4:%.*]] = load <4 x i32>, ptr [[TMP10]], align 4
+; O3DIS-NEXT:    [[TMP11:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT]]
+; O3DIS-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 16
+; O3DIS-NEXT:    store <4 x i32> [[TMP11]], ptr [[TMP12]], align 4
+; O3DIS-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 20
+; O3DIS-NEXT:    [[WIDE_LOAD_5:%.*]] = load <4 x i32>, ptr [[TMP13]], align 4
+; O3DIS-NEXT:    [[TMP14:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT]]
+; O3DIS-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 20
+; O3DIS-NEXT:    store <4 x i32> [[TMP14]], ptr [[TMP15]], align 4
+; O3DIS-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 24
+; O3DIS-NEXT:    [[WIDE_LOAD_6:%.*]] = load <4 x i32>, ptr [[TMP16]], align 4
+; O3DIS-NEXT:    [[TMP17:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT]]
+; O3DIS-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 24
+; O3DIS-NEXT:    store <4 x i32> [[TMP17]], ptr [[TMP18]], align 4
+; O3DIS-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 28
+; O3DIS-NEXT:    [[WIDE_LOAD_7:%.*]] = load <4 x i32>, ptr [[TMP19]], align 4
+; O3DIS-NEXT:    [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT]]
+; O3DIS-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 28
+; O3DIS-NEXT:    store <4 x i32> [[TMP20]], ptr [[TMP21]], align 4
+; O3DIS-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 32
+; O3DIS-NEXT:    [[WIDE_LOAD_8:%.*]] = load <4 x i32>, ptr [[TMP22]], align 4
+; O3DIS-NEXT:    [[TMP23:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT]]
+; O3DIS-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 32
+; O3DIS-NEXT:    store <4 x i32> [[TMP23]], ptr [[TMP24]], align 4
+; O3DIS-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 36
+; O3DIS-NEXT:    [[WIDE_LOAD_9:%.*]] = load <4 x i32>, ptr [[TMP25]], align 4
+; O3DIS-NEXT:    [[TMP26:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT]]
+; O3DIS-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 36
+; O3DIS-NEXT:    store <4 x i32> [[TMP26]], ptr [[TMP27]], align 4
+; O3DIS-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 40
+; O3DIS-NEXT:    [[WIDE_LOAD_10:%.*]] = load <4 x i32>, ptr [[TMP28]], align 4
+; O3DIS-NEXT:    [[TMP29:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT]]
+; O3DIS-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 40
+; O3DIS-NEXT:    store <4 x i32> [[TMP29]], ptr [[TMP30]], align 4
+; O3DIS-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 44
+; O3DIS-NEXT:    [[WIDE_LOAD_11:%.*]] = load <4 x i32>, ptr [[TMP31]], align 4
+; O3DIS-NEXT:    [[TMP32:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT]]
+; O3DIS-NEXT:    [[TMP33:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 44
+; O3DIS-NEXT:    store <4 x i32> [[TMP32]], ptr [[TMP33]], align 4
+; O3DIS-NEXT:    [[TMP34:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 48
+; O3DIS-NEXT:    [[WIDE_LOAD_12:%.*]] = load <4 x i32>, ptr [[TMP34]], align 4
+; O3DIS-NEXT:    [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT]]
+; O3DIS-NEXT:    [[TMP36:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 48
+; O3DIS-NEXT:    store <4 x i32> [[TMP35]], ptr [[TMP36]], align 4
+; O3DIS-NEXT:    [[TMP37:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 52
+; O3DIS-NEXT:    [[WIDE_LOAD_13:%.*]] = load <4 x i32>, ptr [[TMP37]], align 4
+; O3DIS-NEXT:    [[TMP38:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT]]
+; O3DIS-NEXT:    [[TMP39:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 52
+; O3DIS-NEXT:    store <4 x i32> [[TMP38]], ptr [[TMP39]], align 4
+; O3DIS-NEXT:    [[TMP40:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 56
+; O3DIS-NEXT:    [[WIDE_LOAD_14:%.*]] = load <4 x i32>, ptr [[TMP40]], align 4
+; O3DIS-NEXT:    [[TMP41:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT]]
+; O3DIS-NEXT:    [[TMP42:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 56
+; O3DIS-NEXT:    store <4 x i32> [[TMP41]], ptr [[TMP42]], align 4
+; O3DIS-NEXT:    [[TMP43:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 60
+; O3DIS-NEXT:    [[WIDE_LOAD_15:%.*]] = load <4 x i32>, ptr [[TMP43]], align 4
+; O3DIS-NEXT:    [[TMP44:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT]]
+; O3DIS-NEXT:    [[TMP45:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 60
+; O3DIS-NEXT:    store <4 x i32> [[TMP44]], ptr [[TMP45]], align 4
+; O3DIS-NEXT:    [[TMP46:%.*]] = load i32, ptr [[A]], align 4
+; O3DIS-NEXT:    ret i32 [[TMP46]]
 ;
 entry:
   br label %for.body
 
 for.body:                                         ; preds = %for.body, %entry
   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
-  %arrayidx = getelementptr inbounds i32, i32* %b, i64 %indvars.iv
-  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx = getelementptr inbounds i32, ptr %b, i64 %indvars.iv
+  %0 = load i32, ptr %arrayidx, align 4
   %add = add nsw i32 %0, %N
-  %arrayidx2 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
-  store i32 %add, i32* %arrayidx2, align 4
+  %arrayidx2 = getelementptr inbounds i32, ptr %a, i64 %indvars.iv
+  store i32 %add, ptr %arrayidx2, align 4
   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
   %exitcond = icmp eq i64 %indvars.iv.next, 64
   br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !0
 
 for.end:                                          ; preds = %for.body
-  %1 = load i32, i32* %a, align 4
+  %1 = load i32, ptr %a, align 4
   ret i32 %1
 }
 
-define i32 @nopragma(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32 %N) {
+define i32 @nopragma(ptr noalias nocapture %a, ptr noalias nocapture readonly %b, i32 %N) {
 ; O1-LABEL: @nopragma(
 ; O1-NEXT:  entry:
 ; O1-NEXT:    br label [[FOR_BODY:%.*]]
 ; O1:       for.body:
 ; O1-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; O1-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDVARS_IV]]
-; O1-NEXT:    [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; O1-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[INDVARS_IV]]
+; O1-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
 ; O1-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP0]], [[N:%.*]]
-; O1-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDVARS_IV]]
-; O1-NEXT:    store i32 [[ADD]], i32* [[ARRAYIDX2]], align 4
+; O1-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDVARS_IV]]
+; O1-NEXT:    store i32 [[ADD]], ptr [[ARRAYIDX2]], align 4
 ; O1-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; O1-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 64
 ; O1-NEXT:    br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]]
 ; O1:       for.end:
-; O1-NEXT:    [[TMP1:%.*]] = load i32, i32* [[A]], align 4
+; O1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[A]], align 4
 ; O1-NEXT:    ret i32 [[TMP1]]
 ;
 ; O2-LABEL: @nopragma(
 ; O2-NEXT:  entry:
 ; O2-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[N:%.*]], i64 0
 ; O2-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; O2-NEXT:    [[TMP0:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>*
-; O2-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
-; O2-NEXT:    [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
-; O2-NEXT:    [[TMP2:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>*
-; O2-NEXT:    store <4 x i32> [[TMP1]], <4 x i32>* [[TMP2]], align 4
-; O2-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 4
-; O2-NEXT:    [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>*
-; O2-NEXT:    [[WIDE_LOAD_1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4
-; O2-NEXT:    [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT]]
-; O2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 4
-; O2-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
-; O2-NEXT:    store <4 x i32> [[TMP5]], <4 x i32>* [[TMP7]], align 4
-; O2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 8
-; O2-NEXT:    [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>*
-; O2-NEXT:    [[WIDE_LOAD_2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4
-; O2-NEXT:    [[TMP10:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT]]
-; O2-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 8
-; O2-NEXT:    [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>*
-; O2-NEXT:    store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4
-; O2-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 12
-; O2-NEXT:    [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <4 x i32>*
-; O2-NEXT:    [[WIDE_LOAD_3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP14]], align 4
-; O2-NEXT:    [[TMP15:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT]]
-; O2-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 12
-; O2-NEXT:    [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <4 x i32>*
-; O2-NEXT:    store <4 x i32> [[TMP15]], <4 x i32>* [[TMP17]], align 4
-; O2-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 16
-; O2-NEXT:    [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>*
-; O2-NEXT:    [[WIDE_LOAD_4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP19]], align 4
-; O2-NEXT:    [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT]]
-; O2-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 16
-; O2-NEXT:    [[TMP22:%.*]] = bitcast i32* [[TMP21]] to <4 x i32>*
-; O2-NEXT:    store <4 x i32> [[TMP20]], <4 x i32>* [[TMP22]], align 4
-; O2-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 20
-; O2-NEXT:    [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <4 x i32>*
-; O2-NEXT:    [[WIDE_LOAD_5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP24]], align 4
-; O2-NEXT:    [[TMP25:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT]]
-; O2-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 20
-; O2-NEXT:    [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>*
-; O2-NEXT:    store <4 x i32> [[TMP25]], <4 x i32>* [[TMP27]], align 4
-; O2-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 24
-; O2-NEXT:    [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>*
-; O2-NEXT:    [[WIDE_LOAD_6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP29]], align 4
-; O2-NEXT:    [[TMP30:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT]]
-; O2-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 24
-; O2-NEXT:    [[TMP32:%.*]] = bitcast i32* [[TMP31]] to <4 x i32>*
-; O2-NEXT:    store <4 x i32> [[TMP30]], <4 x i32>* [[TMP32]], align 4
-; O2-NEXT:    [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 28
-; O2-NEXT:    [[TMP34:%.*]] = bitcast i32* [[TMP33]] to <4 x i32>*
-; O2-NEXT:    [[WIDE_LOAD_7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP34]], align 4
-; O2-NEXT:    [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT]]
-; O2-NEXT:    [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 28
-; O2-NEXT:    [[TMP37:%.*]] = bitcast i32* [[TMP36]] to <4 x i32>*
-; O2-NEXT:    store <4 x i32> [[TMP35]], <4 x i32>* [[TMP37]], align 4
-; O2-NEXT:    [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 32
-; O2-NEXT:    [[TMP39:%.*]] = bitcast i32* [[TMP38]] to <4 x i32>*
-; O2-NEXT:    [[WIDE_LOAD_8:%.*]] = load <4 x i32>, <4 x i32>* [[TMP39]], align 4
-; O2-NEXT:    [[TMP40:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT]]
-; O2-NEXT:    [[TMP41:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 32
-; O2-NEXT:    [[TMP42:%.*]] = bitcast i32* [[TMP41]] to <4 x i32>*
-; O2-NEXT:    store <4 x i32> [[TMP40]], <4 x i32>* [[TMP42]], align 4
-; O2-NEXT:    [[TMP43:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 36
-; O2-NEXT:    [[TMP44:%.*]] = bitcast i32* [[TMP43]] to <4 x i32>*
-; O2-NEXT:    [[WIDE_LOAD_9:%.*]] = load <4 x i32>, <4 x i32>* [[TMP44]], align 4
-; O2-NEXT:    [[TMP45:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT]]
-; O2-NEXT:    [[TMP46:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 36
-; O2-NEXT:    [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <4 x i32>*
-; O2-NEXT:    store <4 x i32> [[TMP45]], <4 x i32>* [[TMP47]], align 4
-; O2-NEXT:    [[TMP48:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 40
-; O2-NEXT:    [[TMP49:%.*]] = bitcast i32* [[TMP48]] to <4 x i32>*
-; O2-NEXT:    [[WIDE_LOAD_10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP49]], align 4
-; O2-NEXT:    [[TMP50:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT]]
-; O2-NEXT:    [[TMP51:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 40
-; O2-NEXT:    [[TMP52:%.*]] = bitcast i32* [[TMP51]] to <4 x i32>*
-; O2-NEXT:    store <4 x i32> [[TMP50]], <4 x i32>* [[TMP52]], align 4
-; O2-NEXT:    [[TMP53:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 44
-; O2-NEXT:    [[TMP54:%.*]] = bitcast i32* [[TMP53]] to <4 x i32>*
-; O2-NEXT:    [[WIDE_LOAD_11:%.*]] = load <4 x i32>, <4 x i32>* [[TMP54]], align 4
-; O2-NEXT:    [[TMP55:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT]]
-; O2-NEXT:    [[TMP56:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 44
-; O2-NEXT:    [[TMP57:%.*]] = bitcast i32* [[TMP56]] to <4 x i32>*
-; O2-NEXT:    store <4 x i32> [[TMP55]], <4 x i32>* [[TMP57]], align 4
-; O2-NEXT:    [[TMP58:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 48
-; O2-NEXT:    [[TMP59:%.*]] = bitcast i32* [[TMP58]] to <4 x i32>*
-; O2-NEXT:    [[WIDE_LOAD_12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP59]], align 4
-; O2-NEXT:    [[TMP60:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT]]
-; O2-NEXT:    [[TMP61:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 48
-; O2-NEXT:    [[TMP62:%.*]] = bitcast i32* [[TMP61]] to <4 x i32>*
-; O2-NEXT:    store <4 x i32> [[TMP60]], <4 x i32>* [[TMP62]], align 4
-; O2-NEXT:    [[TMP63:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 52
-; O2-NEXT:    [[TMP64:%.*]] = bitcast i32* [[TMP63]] to <4 x i32>*
-; O2-NEXT:    [[WIDE_LOAD_13:%.*]] = load <4 x i32>, <4 x i32>* [[TMP64]], align 4
-; O2-NEXT:    [[TMP65:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT]]
-; O2-NEXT:    [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 52
-; O2-NEXT:    [[TMP67:%.*]] = bitcast i32* [[TMP66]] to <4 x i32>*
-; O2-NEXT:    store <4 x i32> [[TMP65]], <4 x i32>* [[TMP67]], align 4
-; O2-NEXT:    [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 56
-; O2-NEXT:    [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>*
-; O2-NEXT:    [[WIDE_LOAD_14:%.*]] = load <4 x i32>, <4 x i32>* [[TMP69]], align 4
-; O2-NEXT:    [[TMP70:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT]]
-; O2-NEXT:    [[TMP71:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 56
-; O2-NEXT:    [[TMP72:%.*]] = bitcast i32* [[TMP71]] to <4 x i32>*
-; O2-NEXT:    store <4 x i32> [[TMP70]], <4 x i32>* [[TMP72]], align 4
-; O2-NEXT:    [[TMP73:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 60
-; O2-NEXT:    [[TMP74:%.*]] = bitcast i32* [[TMP73]] to <4 x i32>*
-; O2-NEXT:    [[WIDE_LOAD_15:%.*]] = load <4 x i32>, <4 x i32>* [[TMP74]], align 4
-; O2-NEXT:    [[TMP75:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT]]
-; O2-NEXT:    [[TMP76:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 60
-; O2-NEXT:    [[TMP77:%.*]] = bitcast i32* [[TMP76]] to <4 x i32>*
-; O2-NEXT:    store <4 x i32> [[TMP75]], <4 x i32>* [[TMP77]], align 4
-; O2-NEXT:    [[TMP78:%.*]] = load i32, i32* [[A]], align 4
-; O2-NEXT:    ret i32 [[TMP78]]
+; O2-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[B:%.*]], align 4
+; O2-NEXT:    [[TMP0:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
+; O2-NEXT:    store <4 x i32> [[TMP0]], ptr [[A:%.*]], align 4
+; O2-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 4
+; O2-NEXT:    [[WIDE_LOAD_1:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4
+; O2-NEXT:    [[TMP2:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT]]
+; O2-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 4
+; O2-NEXT:    store <4 x i32> [[TMP2]], ptr [[TMP3]], align 4
+; O2-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 8
+; O2-NEXT:    [[WIDE_LOAD_2:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4
+; O2-NEXT:    [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT]]
+; O2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 8
+; O2-NEXT:    store <4 x i32> [[TMP5]], ptr [[TMP6]], align 4
+; O2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 12
+; O2-NEXT:    [[WIDE_LOAD_3:%.*]] = load <4 x i32>, ptr [[TMP7]], align 4
+; O2-NEXT:    [[TMP8:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT]]
+; O2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 12
+; O2-NEXT:    store <4 x i32> [[TMP8]], ptr [[TMP9]], align 4
+; O2-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 16
+; O2-NEXT:    [[WIDE_LOAD_4:%.*]] = load <4 x i32>, ptr [[TMP10]], align 4
+; O2-NEXT:    [[TMP11:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT]]
+; O2-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 16
+; O2-NEXT:    store <4 x i32> [[TMP11]], ptr [[TMP12]], align 4
+; O2-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 20
+; O2-NEXT:    [[WIDE_LOAD_5:%.*]] = load <4 x i32>, ptr [[TMP13]], align 4
+; O2-NEXT:    [[TMP14:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT]]
+; O2-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 20
+; O2-NEXT:    store <4 x i32> [[TMP14]], ptr [[TMP15]], align 4
+; O2-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 24
+; O2-NEXT:    [[WIDE_LOAD_6:%.*]] = load <4 x i32>, ptr [[TMP16]], align 4
+; O2-NEXT:    [[TMP17:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT]]
+; O2-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 24
+; O2-NEXT:    store <4 x i32> [[TMP17]], ptr [[TMP18]], align 4
+; O2-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 28
+; O2-NEXT:    [[WIDE_LOAD_7:%.*]] = load <4 x i32>, ptr [[TMP19]], align 4
+; O2-NEXT:    [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT]]
+; O2-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 28
+; O2-NEXT:    store <4 x i32> [[TMP20]], ptr [[TMP21]], align 4
+; O2-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 32
+; O2-NEXT:    [[WIDE_LOAD_8:%.*]] = load <4 x i32>, ptr [[TMP22]], align 4
+; O2-NEXT:    [[TMP23:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT]]
+; O2-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 32
+; O2-NEXT:    store <4 x i32> [[TMP23]], ptr [[TMP24]], align 4
+; O2-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 36
+; O2-NEXT:    [[WIDE_LOAD_9:%.*]] = load <4 x i32>, ptr [[TMP25]], align 4
+; O2-NEXT:    [[TMP26:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT]]
+; O2-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 36
+; O2-NEXT:    store <4 x i32> [[TMP26]], ptr [[TMP27]], align 4
+; O2-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 40
+; O2-NEXT:    [[WIDE_LOAD_10:%.*]] = load <4 x i32>, ptr [[TMP28]], align 4
+; O2-NEXT:    [[TMP29:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT]]
+; O2-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 40
+; O2-NEXT:    store <4 x i32> [[TMP29]], ptr [[TMP30]], align 4
+; O2-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 44
+; O2-NEXT:    [[WIDE_LOAD_11:%.*]] = load <4 x i32>, ptr [[TMP31]], align 4
+; O2-NEXT:    [[TMP32:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT]]
+; O2-NEXT:    [[TMP33:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 44
+; O2-NEXT:    store <4 x i32> [[TMP32]], ptr [[TMP33]], align 4
+; O2-NEXT:    [[TMP34:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 48
+; O2-NEXT:    [[WIDE_LOAD_12:%.*]] = load <4 x i32>, ptr [[TMP34]], align 4
+; O2-NEXT:    [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT]]
+; O2-NEXT:    [[TMP36:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 48
+; O2-NEXT:    store <4 x i32> [[TMP35]], ptr [[TMP36]], align 4
+; O2-NEXT:    [[TMP37:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 52
+; O2-NEXT:    [[WIDE_LOAD_13:%.*]] = load <4 x i32>, ptr [[TMP37]], align 4
+; O2-NEXT:    [[TMP38:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT]]
+; O2-NEXT:    [[TMP39:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 52
+; O2-NEXT:    store <4 x i32> [[TMP38]], ptr [[TMP39]], align 4
+; O2-NEXT:    [[TMP40:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 56
+; O2-NEXT:    [[WIDE_LOAD_14:%.*]] = load <4 x i32>, ptr [[TMP40]], align 4
+; O2-NEXT:    [[TMP41:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT]]
+; O2-NEXT:    [[TMP42:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 56
+; O2-NEXT:    store <4 x i32> [[TMP41]], ptr [[TMP42]], align 4
+; O2-NEXT:    [[TMP43:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 60
+; O2-NEXT:    [[WIDE_LOAD_15:%.*]] = load <4 x i32>, ptr [[TMP43]], align 4
+; O2-NEXT:    [[TMP44:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT]]
+; O2-NEXT:    [[TMP45:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 60
+; O2-NEXT:    store <4 x i32> [[TMP44]], ptr [[TMP45]], align 4
+; O2-NEXT:    [[TMP46:%.*]] = load i32, ptr [[A]], align 4
+; O2-NEXT:    ret i32 [[TMP46]]
 ;
 ; O3-LABEL: @nopragma(
 ; O3-NEXT:  entry:
 ; O3-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[N:%.*]], i64 0
 ; O3-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; O3-NEXT:    [[TMP0:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>*
-; O3-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
-; O3-NEXT:    [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
-; O3-NEXT:    [[TMP2:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>*
-; O3-NEXT:    store <4 x i32> [[TMP1]], <4 x i32>* [[TMP2]], align 4
-; O3-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 4
-; O3-NEXT:    [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>*
-; O3-NEXT:    [[WIDE_LOAD_1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4
-; O3-NEXT:    [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT]]
-; O3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 4
-; O3-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
-; O3-NEXT:    store <4 x i32> [[TMP5]], <4 x i32>* [[TMP7]], align 4
-; O3-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 8
-; O3-NEXT:    [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>*
-; O3-NEXT:    [[WIDE_LOAD_2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4
-; O3-NEXT:    [[TMP10:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT]]
-; O3-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 8
-; O3-NEXT:    [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>*
-; O3-NEXT:    store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4
-; O3-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 12
-; O3-NEXT:    [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <4 x i32>*
-; O3-NEXT:    [[WIDE_LOAD_3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP14]], align 4
-; O3-NEXT:    [[TMP15:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT]]
-; O3-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 12
-; O3-NEXT:    [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <4 x i32>*
-; O3-NEXT:    store <4 x i32> [[TMP15]], <4 x i32>* [[TMP17]], align 4
-; O3-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 16
-; O3-NEXT:    [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>*
-; O3-NEXT:    [[WIDE_LOAD_4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP19]], align 4
-; O3-NEXT:    [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT]]
-; O3-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 16
-; O3-NEXT:    [[TMP22:%.*]] = bitcast i32* [[TMP21]] to <4 x i32>*
-; O3-NEXT:    store <4 x i32> [[TMP20]], <4 x i32>* [[TMP22]], align 4
-; O3-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 20
-; O3-NEXT:    [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <4 x i32>*
-; O3-NEXT:    [[WIDE_LOAD_5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP24]], align 4
-; O3-NEXT:    [[TMP25:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT]]
-; O3-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 20
-; O3-NEXT:    [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>*
-; O3-NEXT:    store <4 x i32> [[TMP25]], <4 x i32>* [[TMP27]], align 4
-; O3-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 24
-; O3-NEXT:    [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>*
-; O3-NEXT:    [[WIDE_LOAD_6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP29]], align 4
-; O3-NEXT:    [[TMP30:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT]]
-; O3-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 24
-; O3-NEXT:    [[TMP32:%.*]] = bitcast i32* [[TMP31]] to <4 x i32>*
-; O3-NEXT:    store <4 x i32> [[TMP30]], <4 x i32>* [[TMP32]], align 4
-; O3-NEXT:    [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 28
-; O3-NEXT:    [[TMP34:%.*]] = bitcast i32* [[TMP33]] to <4 x i32>*
-; O3-NEXT:    [[WIDE_LOAD_7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP34]], align 4
-; O3-NEXT:    [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT]]
-; O3-NEXT:    [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 28
-; O3-NEXT:    [[TMP37:%.*]] = bitcast i32* [[TMP36]] to <4 x i32>*
-; O3-NEXT:    store <4 x i32> [[TMP35]], <4 x i32>* [[TMP37]], align 4
-; O3-NEXT:    [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 32
-; O3-NEXT:    [[TMP39:%.*]] = bitcast i32* [[TMP38]] to <4 x i32>*
-; O3-NEXT:    [[WIDE_LOAD_8:%.*]] = load <4 x i32>, <4 x i32>* [[TMP39]], align 4
-; O3-NEXT:    [[TMP40:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT]]
-; O3-NEXT:    [[TMP41:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 32
-; O3-NEXT:    [[TMP42:%.*]] = bitcast i32* [[TMP41]] to <4 x i32>*
-; O3-NEXT:    store <4 x i32> [[TMP40]], <4 x i32>* [[TMP42]], align 4
-; O3-NEXT:    [[TMP43:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 36
-; O3-NEXT:    [[TMP44:%.*]] = bitcast i32* [[TMP43]] to <4 x i32>*
-; O3-NEXT:    [[WIDE_LOAD_9:%.*]] = load <4 x i32>, <4 x i32>* [[TMP44]], align 4
-; O3-NEXT:    [[TMP45:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT]]
-; O3-NEXT:    [[TMP46:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 36
-; O3-NEXT:    [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <4 x i32>*
-; O3-NEXT:    store <4 x i32> [[TMP45]], <4 x i32>* [[TMP47]], align 4
-; O3-NEXT:    [[TMP48:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 40
-; O3-NEXT:    [[TMP49:%.*]] = bitcast i32* [[TMP48]] to <4 x i32>*
-; O3-NEXT:    [[WIDE_LOAD_10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP49]], align 4
-; O3-NEXT:    [[TMP50:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT]]
-; O3-NEXT:    [[TMP51:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 40
-; O3-NEXT:    [[TMP52:%.*]] = bitcast i32* [[TMP51]] to <4 x i32>*
-; O3-NEXT:    store <4 x i32> [[TMP50]], <4 x i32>* [[TMP52]], align 4
-; O3-NEXT:    [[TMP53:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 44
-; O3-NEXT:    [[TMP54:%.*]] = bitcast i32* [[TMP53]] to <4 x i32>*
-; O3-NEXT:    [[WIDE_LOAD_11:%.*]] = load <4 x i32>, <4 x i32>* [[TMP54]], align 4
-; O3-NEXT:    [[TMP55:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT]]
-; O3-NEXT:    [[TMP56:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 44
-; O3-NEXT:    [[TMP57:%.*]] = bitcast i32* [[TMP56]] to <4 x i32>*
-; O3-NEXT:    store <4 x i32> [[TMP55]], <4 x i32>* [[TMP57]], align 4
-; O3-NEXT:    [[TMP58:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 48
-; O3-NEXT:    [[TMP59:%.*]] = bitcast i32* [[TMP58]] to <4 x i32>*
-; O3-NEXT:    [[WIDE_LOAD_12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP59]], align 4
-; O3-NEXT:    [[TMP60:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT]]
-; O3-NEXT:    [[TMP61:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 48
-; O3-NEXT:    [[TMP62:%.*]] = bitcast i32* [[TMP61]] to <4 x i32>*
-; O3-NEXT:    store <4 x i32> [[TMP60]], <4 x i32>* [[TMP62]], align 4
-; O3-NEXT:    [[TMP63:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 52
-; O3-NEXT:    [[TMP64:%.*]] = bitcast i32* [[TMP63]] to <4 x i32>*
-; O3-NEXT:    [[WIDE_LOAD_13:%.*]] = load <4 x i32>, <4 x i32>* [[TMP64]], align 4
-; O3-NEXT:    [[TMP65:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT]]
-; O3-NEXT:    [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 52
-; O3-NEXT:    [[TMP67:%.*]] = bitcast i32* [[TMP66]] to <4 x i32>*
-; O3-NEXT:    store <4 x i32> [[TMP65]], <4 x i32>* [[TMP67]], align 4
-; O3-NEXT:    [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 56
-; O3-NEXT:    [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>*
-; O3-NEXT:    [[WIDE_LOAD_14:%.*]] = load <4 x i32>, <4 x i32>* [[TMP69]], align 4
-; O3-NEXT:    [[TMP70:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT]]
-; O3-NEXT:    [[TMP71:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 56
-; O3-NEXT:    [[TMP72:%.*]] = bitcast i32* [[TMP71]] to <4 x i32>*
-; O3-NEXT:    store <4 x i32> [[TMP70]], <4 x i32>* [[TMP72]], align 4
-; O3-NEXT:    [[TMP73:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 60
-; O3-NEXT:    [[TMP74:%.*]] = bitcast i32* [[TMP73]] to <4 x i32>*
-; O3-NEXT:    [[WIDE_LOAD_15:%.*]] = load <4 x i32>, <4 x i32>* [[TMP74]], align 4
-; O3-NEXT:    [[TMP75:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT]]
-; O3-NEXT:    [[TMP76:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 60
-; O3-NEXT:    [[TMP77:%.*]] = bitcast i32* [[TMP76]] to <4 x i32>*
-; O3-NEXT:    store <4 x i32> [[TMP75]], <4 x i32>* [[TMP77]], align 4
-; O3-NEXT:    [[TMP78:%.*]] = load i32, i32* [[A]], align 4
-; O3-NEXT:    ret i32 [[TMP78]]
+; O3-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[B:%.*]], align 4
+; O3-NEXT:    [[TMP0:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
+; O3-NEXT:    store <4 x i32> [[TMP0]], ptr [[A:%.*]], align 4
+; O3-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 4
+; O3-NEXT:    [[WIDE_LOAD_1:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4
+; O3-NEXT:    [[TMP2:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT]]
+; O3-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 4
+; O3-NEXT:    store <4 x i32> [[TMP2]], ptr [[TMP3]], align 4
+; O3-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 8
+; O3-NEXT:    [[WIDE_LOAD_2:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4
+; O3-NEXT:    [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT]]
+; O3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 8
+; O3-NEXT:    store <4 x i32> [[TMP5]], ptr [[TMP6]], align 4
+; O3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 12
+; O3-NEXT:    [[WIDE_LOAD_3:%.*]] = load <4 x i32>, ptr [[TMP7]], align 4
+; O3-NEXT:    [[TMP8:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT]]
+; O3-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 12
+; O3-NEXT:    store <4 x i32> [[TMP8]], ptr [[TMP9]], align 4
+; O3-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 16
+; O3-NEXT:    [[WIDE_LOAD_4:%.*]] = load <4 x i32>, ptr [[TMP10]], align 4
+; O3-NEXT:    [[TMP11:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT]]
+; O3-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 16
+; O3-NEXT:    store <4 x i32> [[TMP11]], ptr [[TMP12]], align 4
+; O3-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 20
+; O3-NEXT:    [[WIDE_LOAD_5:%.*]] = load <4 x i32>, ptr [[TMP13]], align 4
+; O3-NEXT:    [[TMP14:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT]]
+; O3-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 20
+; O3-NEXT:    store <4 x i32> [[TMP14]], ptr [[TMP15]], align 4
+; O3-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 24
+; O3-NEXT:    [[WIDE_LOAD_6:%.*]] = load <4 x i32>, ptr [[TMP16]], align 4
+; O3-NEXT:    [[TMP17:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT]]
+; O3-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 24
+; O3-NEXT:    store <4 x i32> [[TMP17]], ptr [[TMP18]], align 4
+; O3-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 28
+; O3-NEXT:    [[WIDE_LOAD_7:%.*]] = load <4 x i32>, ptr [[TMP19]], align 4
+; O3-NEXT:    [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT]]
+; O3-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 28
+; O3-NEXT:    store <4 x i32> [[TMP20]], ptr [[TMP21]], align 4
+; O3-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 32
+; O3-NEXT:    [[WIDE_LOAD_8:%.*]] = load <4 x i32>, ptr [[TMP22]], align 4
+; O3-NEXT:    [[TMP23:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT]]
+; O3-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 32
+; O3-NEXT:    store <4 x i32> [[TMP23]], ptr [[TMP24]], align 4
+; O3-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 36
+; O3-NEXT:    [[WIDE_LOAD_9:%.*]] = load <4 x i32>, ptr [[TMP25]], align 4
+; O3-NEXT:    [[TMP26:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT]]
+; O3-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 36
+; O3-NEXT:    store <4 x i32> [[TMP26]], ptr [[TMP27]], align 4
+; O3-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 40
+; O3-NEXT:    [[WIDE_LOAD_10:%.*]] = load <4 x i32>, ptr [[TMP28]], align 4
+; O3-NEXT:    [[TMP29:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT]]
+; O3-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 40
+; O3-NEXT:    store <4 x i32> [[TMP29]], ptr [[TMP30]], align 4
+; O3-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 44
+; O3-NEXT:    [[WIDE_LOAD_11:%.*]] = load <4 x i32>, ptr [[TMP31]], align 4
+; O3-NEXT:    [[TMP32:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT]]
+; O3-NEXT:    [[TMP33:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 44
+; O3-NEXT:    store <4 x i32> [[TMP32]], ptr [[TMP33]], align 4
+; O3-NEXT:    [[TMP34:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 48
+; O3-NEXT:    [[WIDE_LOAD_12:%.*]] = load <4 x i32>, ptr [[TMP34]], align 4
+; O3-NEXT:    [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT]]
+; O3-NEXT:    [[TMP36:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 48
+; O3-NEXT:    store <4 x i32> [[TMP35]], ptr [[TMP36]], align 4
+; O3-NEXT:    [[TMP37:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 52
+; O3-NEXT:    [[WIDE_LOAD_13:%.*]] = load <4 x i32>, ptr [[TMP37]], align 4
+; O3-NEXT:    [[TMP38:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT]]
+; O3-NEXT:    [[TMP39:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 52
+; O3-NEXT:    store <4 x i32> [[TMP38]], ptr [[TMP39]], align 4
+; O3-NEXT:    [[TMP40:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 56
+; O3-NEXT:    [[WIDE_LOAD_14:%.*]] = load <4 x i32>, ptr [[TMP40]], align 4
+; O3-NEXT:    [[TMP41:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT]]
+; O3-NEXT:    [[TMP42:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 56
+; O3-NEXT:    store <4 x i32> [[TMP41]], ptr [[TMP42]], align 4
+; O3-NEXT:    [[TMP43:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 60
+; O3-NEXT:    [[WIDE_LOAD_15:%.*]] = load <4 x i32>, ptr [[TMP43]], align 4
+; O3-NEXT:    [[TMP44:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT]]
+; O3-NEXT:    [[TMP45:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 60
+; O3-NEXT:    store <4 x i32> [[TMP44]], ptr [[TMP45]], align 4
+; O3-NEXT:    [[TMP46:%.*]] = load i32, ptr [[A]], align 4
+; O3-NEXT:    ret i32 [[TMP46]]
 ;
 ; O3DEFAULT-LABEL: @nopragma(
 ; O3DEFAULT-NEXT:  entry:
 ; O3DEFAULT-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[N:%.*]], i64 0
 ; O3DEFAULT-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; O3DEFAULT-NEXT:    [[TMP0:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>*
-; O3DEFAULT-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
-; O3DEFAULT-NEXT:    [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
-; O3DEFAULT-NEXT:    [[TMP2:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>*
-; O3DEFAULT-NEXT:    store <4 x i32> [[TMP1]], <4 x i32>* [[TMP2]], align 4
-; O3DEFAULT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 4
-; O3DEFAULT-NEXT:    [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>*
-; O3DEFAULT-NEXT:    [[WIDE_LOAD_1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4
-; O3DEFAULT-NEXT:    [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT]]
-; O3DEFAULT-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 4
-; O3DEFAULT-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
-; O3DEFAULT-NEXT:    store <4 x i32> [[TMP5]], <4 x i32>* [[TMP7]], align 4
-; O3DEFAULT-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 8
-; O3DEFAULT-NEXT:    [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>*
-; O3DEFAULT-NEXT:    [[WIDE_LOAD_2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4
-; O3DEFAULT-NEXT:    [[TMP10:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT]]
-; O3DEFAULT-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 8
-; O3DEFAULT-NEXT:    [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>*
-; O3DEFAULT-NEXT:    store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4
-; O3DEFAULT-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 12
-; O3DEFAULT-NEXT:    [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <4 x i32>*
-; O3DEFAULT-NEXT:    [[WIDE_LOAD_3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP14]], align 4
-; O3DEFAULT-NEXT:    [[TMP15:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT]]
-; O3DEFAULT-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 12
-; O3DEFAULT-NEXT:    [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <4 x i32>*
-; O3DEFAULT-NEXT:    store <4 x i32> [[TMP15]], <4 x i32>* [[TMP17]], align 4
-; O3DEFAULT-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 16
-; O3DEFAULT-NEXT:    [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>*
-; O3DEFAULT-NEXT:    [[WIDE_LOAD_4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP19]], align 4
-; O3DEFAULT-NEXT:    [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT]]
-; O3DEFAULT-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 16
-; O3DEFAULT-NEXT:    [[TMP22:%.*]] = bitcast i32* [[TMP21]] to <4 x i32>*
-; O3DEFAULT-NEXT:    store <4 x i32> [[TMP20]], <4 x i32>* [[TMP22]], align 4
-; O3DEFAULT-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 20
-; O3DEFAULT-NEXT:    [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <4 x i32>*
-; O3DEFAULT-NEXT:    [[WIDE_LOAD_5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP24]], align 4
-; O3DEFAULT-NEXT:    [[TMP25:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT]]
-; O3DEFAULT-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 20
-; O3DEFAULT-NEXT:    [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>*
-; O3DEFAULT-NEXT:    store <4 x i32> [[TMP25]], <4 x i32>* [[TMP27]], align 4
-; O3DEFAULT-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 24
-; O3DEFAULT-NEXT:    [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>*
-; O3DEFAULT-NEXT:    [[WIDE_LOAD_6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP29]], align 4
-; O3DEFAULT-NEXT:    [[TMP30:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT]]
-; O3DEFAULT-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 24
-; O3DEFAULT-NEXT:    [[TMP32:%.*]] = bitcast i32* [[TMP31]] to <4 x i32>*
-; O3DEFAULT-NEXT:    store <4 x i32> [[TMP30]], <4 x i32>* [[TMP32]], align 4
-; O3DEFAULT-NEXT:    [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 28
-; O3DEFAULT-NEXT:    [[TMP34:%.*]] = bitcast i32* [[TMP33]] to <4 x i32>*
-; O3DEFAULT-NEXT:    [[WIDE_LOAD_7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP34]], align 4
-; O3DEFAULT-NEXT:    [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT]]
-; O3DEFAULT-NEXT:    [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 28
-; O3DEFAULT-NEXT:    [[TMP37:%.*]] = bitcast i32* [[TMP36]] to <4 x i32>*
-; O3DEFAULT-NEXT:    store <4 x i32> [[TMP35]], <4 x i32>* [[TMP37]], align 4
-; O3DEFAULT-NEXT:    [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 32
-; O3DEFAULT-NEXT:    [[TMP39:%.*]] = bitcast i32* [[TMP38]] to <4 x i32>*
-; O3DEFAULT-NEXT:    [[WIDE_LOAD_8:%.*]] = load <4 x i32>, <4 x i32>* [[TMP39]], align 4
-; O3DEFAULT-NEXT:    [[TMP40:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT]]
-; O3DEFAULT-NEXT:    [[TMP41:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 32
-; O3DEFAULT-NEXT:    [[TMP42:%.*]] = bitcast i32* [[TMP41]] to <4 x i32>*
-; O3DEFAULT-NEXT:    store <4 x i32> [[TMP40]], <4 x i32>* [[TMP42]], align 4
-; O3DEFAULT-NEXT:    [[TMP43:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 36
-; O3DEFAULT-NEXT:    [[TMP44:%.*]] = bitcast i32* [[TMP43]] to <4 x i32>*
-; O3DEFAULT-NEXT:    [[WIDE_LOAD_9:%.*]] = load <4 x i32>, <4 x i32>* [[TMP44]], align 4
-; O3DEFAULT-NEXT:    [[TMP45:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT]]
-; O3DEFAULT-NEXT:    [[TMP46:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 36
-; O3DEFAULT-NEXT:    [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <4 x i32>*
-; O3DEFAULT-NEXT:    store <4 x i32> [[TMP45]], <4 x i32>* [[TMP47]], align 4
-; O3DEFAULT-NEXT:    [[TMP48:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 40
-; O3DEFAULT-NEXT:    [[TMP49:%.*]] = bitcast i32* [[TMP48]] to <4 x i32>*
-; O3DEFAULT-NEXT:    [[WIDE_LOAD_10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP49]], align 4
-; O3DEFAULT-NEXT:    [[TMP50:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT]]
-; O3DEFAULT-NEXT:    [[TMP51:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 40
-; O3DEFAULT-NEXT:    [[TMP52:%.*]] = bitcast i32* [[TMP51]] to <4 x i32>*
-; O3DEFAULT-NEXT:    store <4 x i32> [[TMP50]], <4 x i32>* [[TMP52]], align 4
-; O3DEFAULT-NEXT:    [[TMP53:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 44
-; O3DEFAULT-NEXT:    [[TMP54:%.*]] = bitcast i32* [[TMP53]] to <4 x i32>*
-; O3DEFAULT-NEXT:    [[WIDE_LOAD_11:%.*]] = load <4 x i32>, <4 x i32>* [[TMP54]], align 4
-; O3DEFAULT-NEXT:    [[TMP55:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT]]
-; O3DEFAULT-NEXT:    [[TMP56:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 44
-; O3DEFAULT-NEXT:    [[TMP57:%.*]] = bitcast i32* [[TMP56]] to <4 x i32>*
-; O3DEFAULT-NEXT:    store <4 x i32> [[TMP55]], <4 x i32>* [[TMP57]], align 4
-; O3DEFAULT-NEXT:    [[TMP58:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 48
-; O3DEFAULT-NEXT:    [[TMP59:%.*]] = bitcast i32* [[TMP58]] to <4 x i32>*
-; O3DEFAULT-NEXT:    [[WIDE_LOAD_12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP59]], align 4
-; O3DEFAULT-NEXT:    [[TMP60:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT]]
-; O3DEFAULT-NEXT:    [[TMP61:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 48
-; O3DEFAULT-NEXT:    [[TMP62:%.*]] = bitcast i32* [[TMP61]] to <4 x i32>*
-; O3DEFAULT-NEXT:    store <4 x i32> [[TMP60]], <4 x i32>* [[TMP62]], align 4
-; O3DEFAULT-NEXT:    [[TMP63:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 52
-; O3DEFAULT-NEXT:    [[TMP64:%.*]] = bitcast i32* [[TMP63]] to <4 x i32>*
-; O3DEFAULT-NEXT:    [[WIDE_LOAD_13:%.*]] = load <4 x i32>, <4 x i32>* [[TMP64]], align 4
-; O3DEFAULT-NEXT:    [[TMP65:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT]]
-; O3DEFAULT-NEXT:    [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 52
-; O3DEFAULT-NEXT:    [[TMP67:%.*]] = bitcast i32* [[TMP66]] to <4 x i32>*
-; O3DEFAULT-NEXT:    store <4 x i32> [[TMP65]], <4 x i32>* [[TMP67]], align 4
-; O3DEFAULT-NEXT:    [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 56
-; O3DEFAULT-NEXT:    [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>*
-; O3DEFAULT-NEXT:    [[WIDE_LOAD_14:%.*]] = load <4 x i32>, <4 x i32>* [[TMP69]], align 4
-; O3DEFAULT-NEXT:    [[TMP70:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT]]
-; O3DEFAULT-NEXT:    [[TMP71:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 56
-; O3DEFAULT-NEXT:    [[TMP72:%.*]] = bitcast i32* [[TMP71]] to <4 x i32>*
-; O3DEFAULT-NEXT:    store <4 x i32> [[TMP70]], <4 x i32>* [[TMP72]], align 4
-; O3DEFAULT-NEXT:    [[TMP73:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 60
-; O3DEFAULT-NEXT:    [[TMP74:%.*]] = bitcast i32* [[TMP73]] to <4 x i32>*
-; O3DEFAULT-NEXT:    [[WIDE_LOAD_15:%.*]] = load <4 x i32>, <4 x i32>* [[TMP74]], align 4
-; O3DEFAULT-NEXT:    [[TMP75:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT]]
-; O3DEFAULT-NEXT:    [[TMP76:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 60
-; O3DEFAULT-NEXT:    [[TMP77:%.*]] = bitcast i32* [[TMP76]] to <4 x i32>*
-; O3DEFAULT-NEXT:    store <4 x i32> [[TMP75]], <4 x i32>* [[TMP77]], align 4
-; O3DEFAULT-NEXT:    [[TMP78:%.*]] = load i32, i32* [[A]], align 4
-; O3DEFAULT-NEXT:    ret i32 [[TMP78]]
+; O3DEFAULT-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[B:%.*]], align 4
+; O3DEFAULT-NEXT:    [[TMP0:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
+; O3DEFAULT-NEXT:    store <4 x i32> [[TMP0]], ptr [[A:%.*]], align 4
+; O3DEFAULT-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 4
+; O3DEFAULT-NEXT:    [[WIDE_LOAD_1:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4
+; O3DEFAULT-NEXT:    [[TMP2:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT]]
+; O3DEFAULT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 4
+; O3DEFAULT-NEXT:    store <4 x i32> [[TMP2]], ptr [[TMP3]], align 4
+; O3DEFAULT-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 8
+; O3DEFAULT-NEXT:    [[WIDE_LOAD_2:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4
+; O3DEFAULT-NEXT:    [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT]]
+; O3DEFAULT-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 8
+; O3DEFAULT-NEXT:    store <4 x i32> [[TMP5]], ptr [[TMP6]], align 4
+; O3DEFAULT-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 12
+; O3DEFAULT-NEXT:    [[WIDE_LOAD_3:%.*]] = load <4 x i32>, ptr [[TMP7]], align 4
+; O3DEFAULT-NEXT:    [[TMP8:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT]]
+; O3DEFAULT-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 12
+; O3DEFAULT-NEXT:    store <4 x i32> [[TMP8]], ptr [[TMP9]], align 4
+; O3DEFAULT-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 16
+; O3DEFAULT-NEXT:    [[WIDE_LOAD_4:%.*]] = load <4 x i32>, ptr [[TMP10]], align 4
+; O3DEFAULT-NEXT:    [[TMP11:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT]]
+; O3DEFAULT-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 16
+; O3DEFAULT-NEXT:    store <4 x i32> [[TMP11]], ptr [[TMP12]], align 4
+; O3DEFAULT-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 20
+; O3DEFAULT-NEXT:    [[WIDE_LOAD_5:%.*]] = load <4 x i32>, ptr [[TMP13]], align 4
+; O3DEFAULT-NEXT:    [[TMP14:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT]]
+; O3DEFAULT-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 20
+; O3DEFAULT-NEXT:    store <4 x i32> [[TMP14]], ptr [[TMP15]], align 4
+; O3DEFAULT-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 24
+; O3DEFAULT-NEXT:    [[WIDE_LOAD_6:%.*]] = load <4 x i32>, ptr [[TMP16]], align 4
+; O3DEFAULT-NEXT:    [[TMP17:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT]]
+; O3DEFAULT-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 24
+; O3DEFAULT-NEXT:    store <4 x i32> [[TMP17]], ptr [[TMP18]], align 4
+; O3DEFAULT-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 28
+; O3DEFAULT-NEXT:    [[WIDE_LOAD_7:%.*]] = load <4 x i32>, ptr [[TMP19]], align 4
+; O3DEFAULT-NEXT:    [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT]]
+; O3DEFAULT-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 28
+; O3DEFAULT-NEXT:    store <4 x i32> [[TMP20]], ptr [[TMP21]], align 4
+; O3DEFAULT-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 32
+; O3DEFAULT-NEXT:    [[WIDE_LOAD_8:%.*]] = load <4 x i32>, ptr [[TMP22]], align 4
+; O3DEFAULT-NEXT:    [[TMP23:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT]]
+; O3DEFAULT-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 32
+; O3DEFAULT-NEXT:    store <4 x i32> [[TMP23]], ptr [[TMP24]], align 4
+; O3DEFAULT-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 36
+; O3DEFAULT-NEXT:    [[WIDE_LOAD_9:%.*]] = load <4 x i32>, ptr [[TMP25]], align 4
+; O3DEFAULT-NEXT:    [[TMP26:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT]]
+; O3DEFAULT-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 36
+; O3DEFAULT-NEXT:    store <4 x i32> [[TMP26]], ptr [[TMP27]], align 4
+; O3DEFAULT-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 40
+; O3DEFAULT-NEXT:    [[WIDE_LOAD_10:%.*]] = load <4 x i32>, ptr [[TMP28]], align 4
+; O3DEFAULT-NEXT:    [[TMP29:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT]]
+; O3DEFAULT-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 40
+; O3DEFAULT-NEXT:    store <4 x i32> [[TMP29]], ptr [[TMP30]], align 4
+; O3DEFAULT-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 44
+; O3DEFAULT-NEXT:    [[WIDE_LOAD_11:%.*]] = load <4 x i32>, ptr [[TMP31]], align 4
+; O3DEFAULT-NEXT:    [[TMP32:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT]]
+; O3DEFAULT-NEXT:    [[TMP33:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 44
+; O3DEFAULT-NEXT:    store <4 x i32> [[TMP32]], ptr [[TMP33]], align 4
+; O3DEFAULT-NEXT:    [[TMP34:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 48
+; O3DEFAULT-NEXT:    [[WIDE_LOAD_12:%.*]] = load <4 x i32>, ptr [[TMP34]], align 4
+; O3DEFAULT-NEXT:    [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT]]
+; O3DEFAULT-NEXT:    [[TMP36:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 48
+; O3DEFAULT-NEXT:    store <4 x i32> [[TMP35]], ptr [[TMP36]], align 4
+; O3DEFAULT-NEXT:    [[TMP37:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 52
+; O3DEFAULT-NEXT:    [[WIDE_LOAD_13:%.*]] = load <4 x i32>, ptr [[TMP37]], align 4
+; O3DEFAULT-NEXT:    [[TMP38:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT]]
+; O3DEFAULT-NEXT:    [[TMP39:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 52
+; O3DEFAULT-NEXT:    store <4 x i32> [[TMP38]], ptr [[TMP39]], align 4
+; O3DEFAULT-NEXT:    [[TMP40:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 56
+; O3DEFAULT-NEXT:    [[WIDE_LOAD_14:%.*]] = load <4 x i32>, ptr [[TMP40]], align 4
+; O3DEFAULT-NEXT:    [[TMP41:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT]]
+; O3DEFAULT-NEXT:    [[TMP42:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 56
+; O3DEFAULT-NEXT:    store <4 x i32> [[TMP41]], ptr [[TMP42]], align 4
+; O3DEFAULT-NEXT:    [[TMP43:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 60
+; O3DEFAULT-NEXT:    [[WIDE_LOAD_15:%.*]] = load <4 x i32>, ptr [[TMP43]], align 4
+; O3DEFAULT-NEXT:    [[TMP44:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT]]
+; O3DEFAULT-NEXT:    [[TMP45:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 60
+; O3DEFAULT-NEXT:    store <4 x i32> [[TMP44]], ptr [[TMP45]], align 4
+; O3DEFAULT-NEXT:    [[TMP46:%.*]] = load i32, ptr [[A]], align 4
+; O3DEFAULT-NEXT:    ret i32 [[TMP46]]
 ;
 ; Os-LABEL: @nopragma(
 ; Os-NEXT:  entry:
 ; Os-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[N:%.*]], i64 0
 ; Os-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; Os-NEXT:    [[TMP0:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>*
-; Os-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
-; Os-NEXT:    [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
-; Os-NEXT:    [[TMP2:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>*
-; Os-NEXT:    store <4 x i32> [[TMP1]], <4 x i32>* [[TMP2]], align 4
-; Os-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 4
-; Os-NEXT:    [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>*
-; Os-NEXT:    [[WIDE_LOAD_1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4
-; Os-NEXT:    [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT]]
-; Os-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 4
-; Os-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
-; Os-NEXT:    store <4 x i32> [[TMP5]], <4 x i32>* [[TMP7]], align 4
-; Os-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 8
-; Os-NEXT:    [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>*
-; Os-NEXT:    [[WIDE_LOAD_2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4
-; Os-NEXT:    [[TMP10:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT]]
-; Os-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 8
-; Os-NEXT:    [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>*
-; Os-NEXT:    store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4
-; Os-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 12
-; Os-NEXT:    [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <4 x i32>*
-; Os-NEXT:    [[WIDE_LOAD_3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP14]], align 4
-; Os-NEXT:    [[TMP15:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT]]
-; Os-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 12
-; Os-NEXT:    [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <4 x i32>*
-; Os-NEXT:    store <4 x i32> [[TMP15]], <4 x i32>* [[TMP17]], align 4
-; Os-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 16
-; Os-NEXT:    [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>*
-; Os-NEXT:    [[WIDE_LOAD_4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP19]], align 4
-; Os-NEXT:    [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT]]
-; Os-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 16
-; Os-NEXT:    [[TMP22:%.*]] = bitcast i32* [[TMP21]] to <4 x i32>*
-; Os-NEXT:    store <4 x i32> [[TMP20]], <4 x i32>* [[TMP22]], align 4
-; Os-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 20
-; Os-NEXT:    [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <4 x i32>*
-; Os-NEXT:    [[WIDE_LOAD_5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP24]], align 4
-; Os-NEXT:    [[TMP25:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT]]
-; Os-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 20
-; Os-NEXT:    [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>*
-; Os-NEXT:    store <4 x i32> [[TMP25]], <4 x i32>* [[TMP27]], align 4
-; Os-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 24
-; Os-NEXT:    [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>*
-; Os-NEXT:    [[WIDE_LOAD_6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP29]], align 4
-; Os-NEXT:    [[TMP30:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT]]
-; Os-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 24
-; Os-NEXT:    [[TMP32:%.*]] = bitcast i32* [[TMP31]] to <4 x i32>*
-; Os-NEXT:    store <4 x i32> [[TMP30]], <4 x i32>* [[TMP32]], align 4
-; Os-NEXT:    [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 28
-; Os-NEXT:    [[TMP34:%.*]] = bitcast i32* [[TMP33]] to <4 x i32>*
-; Os-NEXT:    [[WIDE_LOAD_7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP34]], align 4
-; Os-NEXT:    [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT]]
-; Os-NEXT:    [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 28
-; Os-NEXT:    [[TMP37:%.*]] = bitcast i32* [[TMP36]] to <4 x i32>*
-; Os-NEXT:    store <4 x i32> [[TMP35]], <4 x i32>* [[TMP37]], align 4
-; Os-NEXT:    [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 32
-; Os-NEXT:    [[TMP39:%.*]] = bitcast i32* [[TMP38]] to <4 x i32>*
-; Os-NEXT:    [[WIDE_LOAD_8:%.*]] = load <4 x i32>, <4 x i32>* [[TMP39]], align 4
-; Os-NEXT:    [[TMP40:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT]]
-; Os-NEXT:    [[TMP41:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 32
-; Os-NEXT:    [[TMP42:%.*]] = bitcast i32* [[TMP41]] to <4 x i32>*
-; Os-NEXT:    store <4 x i32> [[TMP40]], <4 x i32>* [[TMP42]], align 4
-; Os-NEXT:    [[TMP43:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 36
-; Os-NEXT:    [[TMP44:%.*]] = bitcast i32* [[TMP43]] to <4 x i32>*
-; Os-NEXT:    [[WIDE_LOAD_9:%.*]] = load <4 x i32>, <4 x i32>* [[TMP44]], align 4
-; Os-NEXT:    [[TMP45:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT]]
-; Os-NEXT:    [[TMP46:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 36
-; Os-NEXT:    [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <4 x i32>*
-; Os-NEXT:    store <4 x i32> [[TMP45]], <4 x i32>* [[TMP47]], align 4
-; Os-NEXT:    [[TMP48:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 40
-; Os-NEXT:    [[TMP49:%.*]] = bitcast i32* [[TMP48]] to <4 x i32>*
-; Os-NEXT:    [[WIDE_LOAD_10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP49]], align 4
-; Os-NEXT:    [[TMP50:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT]]
-; Os-NEXT:    [[TMP51:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 40
-; Os-NEXT:    [[TMP52:%.*]] = bitcast i32* [[TMP51]] to <4 x i32>*
-; Os-NEXT:    store <4 x i32> [[TMP50]], <4 x i32>* [[TMP52]], align 4
-; Os-NEXT:    [[TMP53:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 44
-; Os-NEXT:    [[TMP54:%.*]] = bitcast i32* [[TMP53]] to <4 x i32>*
-; Os-NEXT:    [[WIDE_LOAD_11:%.*]] = load <4 x i32>, <4 x i32>* [[TMP54]], align 4
-; Os-NEXT:    [[TMP55:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT]]
-; Os-NEXT:    [[TMP56:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 44
-; Os-NEXT:    [[TMP57:%.*]] = bitcast i32* [[TMP56]] to <4 x i32>*
-; Os-NEXT:    store <4 x i32> [[TMP55]], <4 x i32>* [[TMP57]], align 4
-; Os-NEXT:    [[TMP58:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 48
-; Os-NEXT:    [[TMP59:%.*]] = bitcast i32* [[TMP58]] to <4 x i32>*
-; Os-NEXT:    [[WIDE_LOAD_12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP59]], align 4
-; Os-NEXT:    [[TMP60:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT]]
-; Os-NEXT:    [[TMP61:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 48
-; Os-NEXT:    [[TMP62:%.*]] = bitcast i32* [[TMP61]] to <4 x i32>*
-; Os-NEXT:    store <4 x i32> [[TMP60]], <4 x i32>* [[TMP62]], align 4
-; Os-NEXT:    [[TMP63:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 52
-; Os-NEXT:    [[TMP64:%.*]] = bitcast i32* [[TMP63]] to <4 x i32>*
-; Os-NEXT:    [[WIDE_LOAD_13:%.*]] = load <4 x i32>, <4 x i32>* [[TMP64]], align 4
-; Os-NEXT:    [[TMP65:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT]]
-; Os-NEXT:    [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 52
-; Os-NEXT:    [[TMP67:%.*]] = bitcast i32* [[TMP66]] to <4 x i32>*
-; Os-NEXT:    store <4 x i32> [[TMP65]], <4 x i32>* [[TMP67]], align 4
-; Os-NEXT:    [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 56
-; Os-NEXT:    [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>*
-; Os-NEXT:    [[WIDE_LOAD_14:%.*]] = load <4 x i32>, <4 x i32>* [[TMP69]], align 4
-; Os-NEXT:    [[TMP70:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT]]
-; Os-NEXT:    [[TMP71:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 56
-; Os-NEXT:    [[TMP72:%.*]] = bitcast i32* [[TMP71]] to <4 x i32>*
-; Os-NEXT:    store <4 x i32> [[TMP70]], <4 x i32>* [[TMP72]], align 4
-; Os-NEXT:    [[TMP73:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 60
-; Os-NEXT:    [[TMP74:%.*]] = bitcast i32* [[TMP73]] to <4 x i32>*
-; Os-NEXT:    [[WIDE_LOAD_15:%.*]] = load <4 x i32>, <4 x i32>* [[TMP74]], align 4
-; Os-NEXT:    [[TMP75:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT]]
-; Os-NEXT:    [[TMP76:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 60
-; Os-NEXT:    [[TMP77:%.*]] = bitcast i32* [[TMP76]] to <4 x i32>*
-; Os-NEXT:    store <4 x i32> [[TMP75]], <4 x i32>* [[TMP77]], align 4
-; Os-NEXT:    [[TMP78:%.*]] = load i32, i32* [[A]], align 4
-; Os-NEXT:    ret i32 [[TMP78]]
+; Os-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[B:%.*]], align 4
+; Os-NEXT:    [[TMP0:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
+; Os-NEXT:    store <4 x i32> [[TMP0]], ptr [[A:%.*]], align 4
+; Os-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 4
+; Os-NEXT:    [[WIDE_LOAD_1:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4
+; Os-NEXT:    [[TMP2:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT]]
+; Os-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 4
+; Os-NEXT:    store <4 x i32> [[TMP2]], ptr [[TMP3]], align 4
+; Os-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 8
+; Os-NEXT:    [[WIDE_LOAD_2:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4
+; Os-NEXT:    [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT]]
+; Os-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 8
+; Os-NEXT:    store <4 x i32> [[TMP5]], ptr [[TMP6]], align 4
+; Os-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 12
+; Os-NEXT:    [[WIDE_LOAD_3:%.*]] = load <4 x i32>, ptr [[TMP7]], align 4
+; Os-NEXT:    [[TMP8:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT]]
+; Os-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 12
+; Os-NEXT:    store <4 x i32> [[TMP8]], ptr [[TMP9]], align 4
+; Os-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 16
+; Os-NEXT:    [[WIDE_LOAD_4:%.*]] = load <4 x i32>, ptr [[TMP10]], align 4
+; Os-NEXT:    [[TMP11:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT]]
+; Os-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 16
+; Os-NEXT:    store <4 x i32> [[TMP11]], ptr [[TMP12]], align 4
+; Os-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 20
+; Os-NEXT:    [[WIDE_LOAD_5:%.*]] = load <4 x i32>, ptr [[TMP13]], align 4
+; Os-NEXT:    [[TMP14:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT]]
+; Os-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 20
+; Os-NEXT:    store <4 x i32> [[TMP14]], ptr [[TMP15]], align 4
+; Os-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 24
+; Os-NEXT:    [[WIDE_LOAD_6:%.*]] = load <4 x i32>, ptr [[TMP16]], align 4
+; Os-NEXT:    [[TMP17:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT]]
+; Os-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 24
+; Os-NEXT:    store <4 x i32> [[TMP17]], ptr [[TMP18]], align 4
+; Os-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 28
+; Os-NEXT:    [[WIDE_LOAD_7:%.*]] = load <4 x i32>, ptr [[TMP19]], align 4
+; Os-NEXT:    [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT]]
+; Os-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 28
+; Os-NEXT:    store <4 x i32> [[TMP20]], ptr [[TMP21]], align 4
+; Os-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 32
+; Os-NEXT:    [[WIDE_LOAD_8:%.*]] = load <4 x i32>, ptr [[TMP22]], align 4
+; Os-NEXT:    [[TMP23:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT]]
+; Os-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 32
+; Os-NEXT:    store <4 x i32> [[TMP23]], ptr [[TMP24]], align 4
+; Os-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 36
+; Os-NEXT:    [[WIDE_LOAD_9:%.*]] = load <4 x i32>, ptr [[TMP25]], align 4
+; Os-NEXT:    [[TMP26:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT]]
+; Os-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 36
+; Os-NEXT:    store <4 x i32> [[TMP26]], ptr [[TMP27]], align 4
+; Os-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 40
+; Os-NEXT:    [[WIDE_LOAD_10:%.*]] = load <4 x i32>, ptr [[TMP28]], align 4
+; Os-NEXT:    [[TMP29:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT]]
+; Os-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 40
+; Os-NEXT:    store <4 x i32> [[TMP29]], ptr [[TMP30]], align 4
+; Os-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 44
+; Os-NEXT:    [[WIDE_LOAD_11:%.*]] = load <4 x i32>, ptr [[TMP31]], align 4
+; Os-NEXT:    [[TMP32:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT]]
+; Os-NEXT:    [[TMP33:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 44
+; Os-NEXT:    store <4 x i32> [[TMP32]], ptr [[TMP33]], align 4
+; Os-NEXT:    [[TMP34:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 48
+; Os-NEXT:    [[WIDE_LOAD_12:%.*]] = load <4 x i32>, ptr [[TMP34]], align 4
+; Os-NEXT:    [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT]]
+; Os-NEXT:    [[TMP36:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 48
+; Os-NEXT:    store <4 x i32> [[TMP35]], ptr [[TMP36]], align 4
+; Os-NEXT:    [[TMP37:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 52
+; Os-NEXT:    [[WIDE_LOAD_13:%.*]] = load <4 x i32>, ptr [[TMP37]], align 4
+; Os-NEXT:    [[TMP38:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT]]
+; Os-NEXT:    [[TMP39:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 52
+; Os-NEXT:    store <4 x i32> [[TMP38]], ptr [[TMP39]], align 4
+; Os-NEXT:    [[TMP40:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 56
+; Os-NEXT:    [[WIDE_LOAD_14:%.*]] = load <4 x i32>, ptr [[TMP40]], align 4
+; Os-NEXT:    [[TMP41:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT]]
+; Os-NEXT:    [[TMP42:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 56
+; Os-NEXT:    store <4 x i32> [[TMP41]], ptr [[TMP42]], align 4
+; Os-NEXT:    [[TMP43:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 60
+; Os-NEXT:    [[WIDE_LOAD_15:%.*]] = load <4 x i32>, ptr [[TMP43]], align 4
+; Os-NEXT:    [[TMP44:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT]]
+; Os-NEXT:    [[TMP45:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 60
+; Os-NEXT:    store <4 x i32> [[TMP44]], ptr [[TMP45]], align 4
+; Os-NEXT:    [[TMP46:%.*]] = load i32, ptr [[A]], align 4
+; Os-NEXT:    ret i32 [[TMP46]]
 ;
 ; Oz-LABEL: @nopragma(
 ; Oz-NEXT:  entry:
 ; Oz-NEXT:    br label [[FOR_BODY:%.*]]
 ; Oz:       for.body:
 ; Oz-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; Oz-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDVARS_IV]]
-; Oz-NEXT:    [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; Oz-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[INDVARS_IV]]
+; Oz-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
 ; Oz-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP0]], [[N:%.*]]
-; Oz-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDVARS_IV]]
-; Oz-NEXT:    store i32 [[ADD]], i32* [[ARRAYIDX2]], align 4
+; Oz-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDVARS_IV]]
+; Oz-NEXT:    store i32 [[ADD]], ptr [[ARRAYIDX2]], align 4
 ; Oz-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; Oz-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 64
 ; Oz-NEXT:    br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]]
 ; Oz:       for.end:
-; Oz-NEXT:    [[TMP1:%.*]] = load i32, i32* [[A]], align 4
+; Oz-NEXT:    [[TMP1:%.*]] = load i32, ptr [[A]], align 4
 ; Oz-NEXT:    ret i32 [[TMP1]]
 ;
 ; O1VEC2-LABEL: @nopragma(
@@ -1602,18 +1186,16 @@ define i32 @nopragma(i32* noalias nocapture %a, i32* noalias nocapture readonly
 ; O1VEC2:       vector.body:
 ; O1VEC2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; O1VEC2-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; O1VEC2-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[TMP0]]
-; O1VEC2-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0
-; O1VEC2-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
-; O1VEC2-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
-; O1VEC2-NEXT:    [[TMP4:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
-; O1VEC2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP0]]
-; O1VEC2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i32 0
-; O1VEC2-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
-; O1VEC2-NEXT:    store <4 x i32> [[TMP4]], <4 x i32>* [[TMP7]], align 4
+; O1VEC2-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP0]]
+; O1VEC2-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
+; O1VEC2-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4
+; O1VEC2-NEXT:    [[TMP3:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
+; O1VEC2-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP0]]
+; O1VEC2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0
+; O1VEC2-NEXT:    store <4 x i32> [[TMP3]], ptr [[TMP5]], align 4
 ; O1VEC2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; O1VEC2-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64
-; O1VEC2-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; O1VEC2-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64
+; O1VEC2-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; O1VEC2:       middle.block:
 ; O1VEC2-NEXT:    [[CMP_N:%.*]] = icmp eq i64 64, 64
 ; O1VEC2-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -1622,17 +1204,17 @@ define i32 @nopragma(i32* noalias nocapture %a, i32* noalias nocapture readonly
 ; O1VEC2-NEXT:    br label [[FOR_BODY:%.*]]
 ; O1VEC2:       for.body:
 ; O1VEC2-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; O1VEC2-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]]
-; O1VEC2-NEXT:    [[TMP9:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; O1VEC2-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP9]], [[N]]
-; O1VEC2-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
-; O1VEC2-NEXT:    store i32 [[ADD]], i32* [[ARRAYIDX2]], align 4
+; O1VEC2-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]]
+; O1VEC2-NEXT:    [[TMP7:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; O1VEC2-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP7]], [[N]]
+; O1VEC2-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]]
+; O1VEC2-NEXT:    store i32 [[ADD]], ptr [[ARRAYIDX2]], align 4
 ; O1VEC2-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; O1VEC2-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 64
 ; O1VEC2-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
 ; O1VEC2:       for.end:
-; O1VEC2-NEXT:    [[TMP10:%.*]] = load i32, i32* [[A]], align 4
-; O1VEC2-NEXT:    ret i32 [[TMP10]]
+; O1VEC2-NEXT:    [[TMP8:%.*]] = load i32, ptr [[A]], align 4
+; O1VEC2-NEXT:    ret i32 [[TMP8]]
 ;
 ; OzVEC2-LABEL: @nopragma(
 ; OzVEC2-NEXT:  entry:
@@ -1644,18 +1226,16 @@ define i32 @nopragma(i32* noalias nocapture %a, i32* noalias nocapture readonly
 ; OzVEC2:       vector.body:
 ; OzVEC2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; OzVEC2-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; OzVEC2-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[TMP0]]
-; OzVEC2-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0
-; OzVEC2-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
-; OzVEC2-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
-; OzVEC2-NEXT:    [[TMP4:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
-; OzVEC2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP0]]
-; OzVEC2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i32 0
-; OzVEC2-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
-; OzVEC2-NEXT:    store <4 x i32> [[TMP4]], <4 x i32>* [[TMP7]], align 4
+; OzVEC2-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP0]]
+; OzVEC2-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
+; OzVEC2-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4
+; OzVEC2-NEXT:    [[TMP3:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
+; OzVEC2-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP0]]
+; OzVEC2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0
+; OzVEC2-NEXT:    store <4 x i32> [[TMP3]], ptr [[TMP5]], align 4
 ; OzVEC2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; OzVEC2-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64
-; OzVEC2-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; OzVEC2-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64
+; OzVEC2-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; OzVEC2:       middle.block:
 ; OzVEC2-NEXT:    [[CMP_N:%.*]] = icmp eq i64 64, 64
 ; OzVEC2-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -1664,33 +1244,33 @@ define i32 @nopragma(i32* noalias nocapture %a, i32* noalias nocapture readonly
 ; OzVEC2-NEXT:    br label [[FOR_BODY:%.*]]
 ; OzVEC2:       for.body:
 ; OzVEC2-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; OzVEC2-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]]
-; OzVEC2-NEXT:    [[TMP9:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; OzVEC2-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP9]], [[N]]
-; OzVEC2-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
-; OzVEC2-NEXT:    store i32 [[ADD]], i32* [[ARRAYIDX2]], align 4
+; OzVEC2-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]]
+; OzVEC2-NEXT:    [[TMP7:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; OzVEC2-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP7]], [[N]]
+; OzVEC2-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]]
+; OzVEC2-NEXT:    store i32 [[ADD]], ptr [[ARRAYIDX2]], align 4
 ; OzVEC2-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; OzVEC2-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 64
 ; OzVEC2-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
 ; OzVEC2:       for.end:
-; OzVEC2-NEXT:    [[TMP10:%.*]] = load i32, i32* [[A]], align 4
-; OzVEC2-NEXT:    ret i32 [[TMP10]]
+; OzVEC2-NEXT:    [[TMP8:%.*]] = load i32, ptr [[A]], align 4
+; OzVEC2-NEXT:    ret i32 [[TMP8]]
 ;
 ; O3DIS-LABEL: @nopragma(
 ; O3DIS-NEXT:  entry:
 ; O3DIS-NEXT:    br label [[FOR_BODY:%.*]]
 ; O3DIS:       for.body:
 ; O3DIS-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; O3DIS-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDVARS_IV]]
-; O3DIS-NEXT:    [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; O3DIS-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[INDVARS_IV]]
+; O3DIS-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
 ; O3DIS-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP0]], [[N:%.*]]
-; O3DIS-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDVARS_IV]]
-; O3DIS-NEXT:    store i32 [[ADD]], i32* [[ARRAYIDX2]], align 4
+; O3DIS-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDVARS_IV]]
+; O3DIS-NEXT:    store i32 [[ADD]], ptr [[ARRAYIDX2]], align 4
 ; O3DIS-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; O3DIS-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 64
 ; O3DIS-NEXT:    br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]]
 ; O3DIS:       for.end:
-; O3DIS-NEXT:    [[TMP1:%.*]] = load i32, i32* [[A]], align 4
+; O3DIS-NEXT:    [[TMP1:%.*]] = load i32, ptr [[A]], align 4
 ; O3DIS-NEXT:    ret i32 [[TMP1]]
 ;
 entry:
@@ -1698,36 +1278,36 @@ entry:
 
 for.body:                                         ; preds = %for.body, %entry
   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
-  %arrayidx = getelementptr inbounds i32, i32* %b, i64 %indvars.iv
-  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx = getelementptr inbounds i32, ptr %b, i64 %indvars.iv
+  %0 = load i32, ptr %arrayidx, align 4
   %add = add nsw i32 %0, %N
-  %arrayidx2 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
-  store i32 %add, i32* %arrayidx2, align 4
+  %arrayidx2 = getelementptr inbounds i32, ptr %a, i64 %indvars.iv
+  store i32 %add, ptr %arrayidx2, align 4
   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
   %exitcond = icmp eq i64 %indvars.iv.next, 64
   br i1 %exitcond, label %for.end, label %for.body
 
 for.end:                                          ; preds = %for.body
-  %1 = load i32, i32* %a, align 4
+  %1 = load i32, ptr %a, align 4
   ret i32 %1
 }
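; Illustrative aside (not part of the generated checks): the mechanical shape of
; this conversion is that every typed-pointer vector access of the form
;   %cast = bitcast i32* %p to <4 x i32>*
;   %v = load <4 x i32>, <4 x i32>* %cast, align 4
; collapses under opaque pointers to the single instruction
;   %v = load <4 x i32>, ptr %p, align 4
; so rerunning update_test_checks.py drops the bitcast lines and renumbers the
; remaining [[TMP*]] values, which accounts for the bulk of this diff.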
 
-define i32 @disabled(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32 %N) {
+define i32 @disabled(ptr noalias nocapture %a, ptr noalias nocapture readonly %b, i32 %N) {
 ; O1-LABEL: @disabled(
 ; O1-NEXT:  entry:
 ; O1-NEXT:    br label [[FOR_BODY:%.*]]
 ; O1:       for.body:
 ; O1-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; O1-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDVARS_IV]]
-; O1-NEXT:    [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; O1-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[INDVARS_IV]]
+; O1-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
 ; O1-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP0]], [[N:%.*]]
-; O1-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDVARS_IV]]
-; O1-NEXT:    store i32 [[ADD]], i32* [[ARRAYIDX2]], align 4
+; O1-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDVARS_IV]]
+; O1-NEXT:    store i32 [[ADD]], ptr [[ARRAYIDX2]], align 4
 ; O1-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; O1-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 48
 ; O1-NEXT:    br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; O1:       for.end:
-; O1-NEXT:    [[TMP1:%.*]] = load i32, i32* [[A]], align 4
+; O1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[A]], align 4
 ; O1-NEXT:    ret i32 [[TMP1]]
 ;
 ; O2-LABEL: @disabled(
@@ -1735,16 +1315,16 @@ define i32 @disabled(i32* noalias nocapture %a, i32* noalias nocapture readonly
 ; O2-NEXT:    br label [[FOR_BODY:%.*]]
 ; O2:       for.body:
 ; O2-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; O2-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDVARS_IV]]
-; O2-NEXT:    [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; O2-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[INDVARS_IV]]
+; O2-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
 ; O2-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP0]], [[N:%.*]]
-; O2-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDVARS_IV]]
-; O2-NEXT:    store i32 [[ADD]], i32* [[ARRAYIDX2]], align 4
+; O2-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDVARS_IV]]
+; O2-NEXT:    store i32 [[ADD]], ptr [[ARRAYIDX2]], align 4
 ; O2-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; O2-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 48
 ; O2-NEXT:    br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; O2:       for.end:
-; O2-NEXT:    [[TMP1:%.*]] = load i32, i32* [[A]], align 4
+; O2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[A]], align 4
 ; O2-NEXT:    ret i32 [[TMP1]]
 ;
 ; O3-LABEL: @disabled(
@@ -1752,122 +1332,98 @@ define i32 @disabled(i32* noalias nocapture %a, i32* noalias nocapture readonly
 ; O3-NEXT:    br label [[FOR_BODY:%.*]]
 ; O3:       for.body:
 ; O3-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; O3-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDVARS_IV]]
-; O3-NEXT:    [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; O3-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[INDVARS_IV]]
+; O3-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
 ; O3-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP0]], [[N:%.*]]
-; O3-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDVARS_IV]]
-; O3-NEXT:    store i32 [[ADD]], i32* [[ARRAYIDX2]], align 4
+; O3-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDVARS_IV]]
+; O3-NEXT:    store i32 [[ADD]], ptr [[ARRAYIDX2]], align 4
 ; O3-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; O3-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 48
 ; O3-NEXT:    br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; O3:       for.end:
-; O3-NEXT:    [[TMP1:%.*]] = load i32, i32* [[A]], align 4
+; O3-NEXT:    [[TMP1:%.*]] = load i32, ptr [[A]], align 4
 ; O3-NEXT:    ret i32 [[TMP1]]
 ;
 ; O3DEFAULT-LABEL: @disabled(
 ; O3DEFAULT-NEXT:  entry:
-; O3DEFAULT-NEXT:    [[TMP0:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>*
-; O3DEFAULT-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
-; O3DEFAULT-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[N:%.*]], i64 0
-; O3DEFAULT-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> zeroinitializer
-; O3DEFAULT-NEXT:    [[TMP3:%.*]] = add nsw <4 x i32> [[TMP1]], [[SHUFFLE]]
-; O3DEFAULT-NEXT:    [[TMP4:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>*
-; O3DEFAULT-NEXT:    store <4 x i32> [[TMP3]], <4 x i32>* [[TMP4]], align 4
-; O3DEFAULT-NEXT:    [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 4
-; O3DEFAULT-NEXT:    [[ARRAYIDX2_4:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 4
-; O3DEFAULT-NEXT:    [[TMP5:%.*]] = bitcast i32* [[ARRAYIDX_4]] to <4 x i32>*
-; O3DEFAULT-NEXT:    [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP5]], align 4
-; O3DEFAULT-NEXT:    [[TMP7:%.*]] = add nsw <4 x i32> [[TMP6]], [[SHUFFLE]]
-; O3DEFAULT-NEXT:    [[TMP8:%.*]] = bitcast i32* [[ARRAYIDX2_4]] to <4 x i32>*
-; O3DEFAULT-NEXT:    store <4 x i32> [[TMP7]], <4 x i32>* [[TMP8]], align 4
-; O3DEFAULT-NEXT:    [[ARRAYIDX_8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 8
-; O3DEFAULT-NEXT:    [[ARRAYIDX2_8:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 8
-; O3DEFAULT-NEXT:    [[TMP9:%.*]] = bitcast i32* [[ARRAYIDX_8]] to <4 x i32>*
-; O3DEFAULT-NEXT:    [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4
-; O3DEFAULT-NEXT:    [[TMP11:%.*]] = add nsw <4 x i32> [[TMP10]], [[SHUFFLE]]
-; O3DEFAULT-NEXT:    [[TMP12:%.*]] = bitcast i32* [[ARRAYIDX2_8]] to <4 x i32>*
-; O3DEFAULT-NEXT:    store <4 x i32> [[TMP11]], <4 x i32>* [[TMP12]], align 4
-; O3DEFAULT-NEXT:    [[ARRAYIDX_12:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 12
-; O3DEFAULT-NEXT:    [[ARRAYIDX2_12:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 12
-; O3DEFAULT-NEXT:    [[TMP13:%.*]] = bitcast i32* [[ARRAYIDX_12]] to <4 x i32>*
-; O3DEFAULT-NEXT:    [[TMP14:%.*]] = load <4 x i32>, <4 x i32>* [[TMP13]], align 4
-; O3DEFAULT-NEXT:    [[TMP15:%.*]] = add nsw <4 x i32> [[TMP14]], [[SHUFFLE]]
-; O3DEFAULT-NEXT:    [[TMP16:%.*]] = bitcast i32* [[ARRAYIDX2_12]] to <4 x i32>*
-; O3DEFAULT-NEXT:    store <4 x i32> [[TMP15]], <4 x i32>* [[TMP16]], align 4
-; O3DEFAULT-NEXT:    [[ARRAYIDX_16:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 16
-; O3DEFAULT-NEXT:    [[ARRAYIDX2_16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 16
-; O3DEFAULT-NEXT:    [[TMP17:%.*]] = bitcast i32* [[ARRAYIDX_16]] to <4 x i32>*
-; O3DEFAULT-NEXT:    [[TMP18:%.*]] = load <4 x i32>, <4 x i32>* [[TMP17]], align 4
-; O3DEFAULT-NEXT:    [[TMP19:%.*]] = add nsw <4 x i32> [[TMP18]], [[SHUFFLE]]
-; O3DEFAULT-NEXT:    [[TMP20:%.*]] = bitcast i32* [[ARRAYIDX2_16]] to <4 x i32>*
-; O3DEFAULT-NEXT:    store <4 x i32> [[TMP19]], <4 x i32>* [[TMP20]], align 4
-; O3DEFAULT-NEXT:    [[ARRAYIDX_20:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 20
-; O3DEFAULT-NEXT:    [[ARRAYIDX2_20:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 20
-; O3DEFAULT-NEXT:    [[TMP21:%.*]] = bitcast i32* [[ARRAYIDX_20]] to <4 x i32>*
-; O3DEFAULT-NEXT:    [[TMP22:%.*]] = load <4 x i32>, <4 x i32>* [[TMP21]], align 4
-; O3DEFAULT-NEXT:    [[TMP23:%.*]] = add nsw <4 x i32> [[TMP22]], [[SHUFFLE]]
-; O3DEFAULT-NEXT:    [[TMP24:%.*]] = bitcast i32* [[ARRAYIDX2_20]] to <4 x i32>*
-; O3DEFAULT-NEXT:    store <4 x i32> [[TMP23]], <4 x i32>* [[TMP24]], align 4
-; O3DEFAULT-NEXT:    [[ARRAYIDX_24:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 24
-; O3DEFAULT-NEXT:    [[ARRAYIDX2_24:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 24
-; O3DEFAULT-NEXT:    [[TMP25:%.*]] = bitcast i32* [[ARRAYIDX_24]] to <4 x i32>*
-; O3DEFAULT-NEXT:    [[TMP26:%.*]] = load <4 x i32>, <4 x i32>* [[TMP25]], align 4
-; O3DEFAULT-NEXT:    [[TMP27:%.*]] = add nsw <4 x i32> [[TMP26]], [[SHUFFLE]]
-; O3DEFAULT-NEXT:    [[TMP28:%.*]] = bitcast i32* [[ARRAYIDX2_24]] to <4 x i32>*
-; O3DEFAULT-NEXT:    store <4 x i32> [[TMP27]], <4 x i32>* [[TMP28]], align 4
-; O3DEFAULT-NEXT:    [[ARRAYIDX_28:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 28
-; O3DEFAULT-NEXT:    [[ARRAYIDX2_28:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 28
-; O3DEFAULT-NEXT:    [[TMP29:%.*]] = bitcast i32* [[ARRAYIDX_28]] to <4 x i32>*
-; O3DEFAULT-NEXT:    [[TMP30:%.*]] = load <4 x i32>, <4 x i32>* [[TMP29]], align 4
-; O3DEFAULT-NEXT:    [[TMP31:%.*]] = add nsw <4 x i32> [[TMP30]], [[SHUFFLE]]
-; O3DEFAULT-NEXT:    [[TMP32:%.*]] = bitcast i32* [[ARRAYIDX2_28]] to <4 x i32>*
-; O3DEFAULT-NEXT:    store <4 x i32> [[TMP31]], <4 x i32>* [[TMP32]], align 4
-; O3DEFAULT-NEXT:    [[ARRAYIDX_32:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 32
-; O3DEFAULT-NEXT:    [[ARRAYIDX2_32:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 32
-; O3DEFAULT-NEXT:    [[TMP33:%.*]] = bitcast i32* [[ARRAYIDX_32]] to <4 x i32>*
-; O3DEFAULT-NEXT:    [[TMP34:%.*]] = load <4 x i32>, <4 x i32>* [[TMP33]], align 4
-; O3DEFAULT-NEXT:    [[TMP35:%.*]] = add nsw <4 x i32> [[TMP34]], [[SHUFFLE]]
-; O3DEFAULT-NEXT:    [[TMP36:%.*]] = bitcast i32* [[ARRAYIDX2_32]] to <4 x i32>*
-; O3DEFAULT-NEXT:    store <4 x i32> [[TMP35]], <4 x i32>* [[TMP36]], align 4
-; O3DEFAULT-NEXT:    [[ARRAYIDX_36:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 36
-; O3DEFAULT-NEXT:    [[ARRAYIDX2_36:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 36
-; O3DEFAULT-NEXT:    [[TMP37:%.*]] = bitcast i32* [[ARRAYIDX_36]] to <4 x i32>*
-; O3DEFAULT-NEXT:    [[TMP38:%.*]] = load <4 x i32>, <4 x i32>* [[TMP37]], align 4
-; O3DEFAULT-NEXT:    [[TMP39:%.*]] = add nsw <4 x i32> [[TMP38]], [[SHUFFLE]]
-; O3DEFAULT-NEXT:    [[TMP40:%.*]] = bitcast i32* [[ARRAYIDX2_36]] to <4 x i32>*
-; O3DEFAULT-NEXT:    store <4 x i32> [[TMP39]], <4 x i32>* [[TMP40]], align 4
-; O3DEFAULT-NEXT:    [[ARRAYIDX_40:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 40
-; O3DEFAULT-NEXT:    [[ARRAYIDX2_40:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 40
-; O3DEFAULT-NEXT:    [[TMP41:%.*]] = bitcast i32* [[ARRAYIDX_40]] to <4 x i32>*
-; O3DEFAULT-NEXT:    [[TMP42:%.*]] = load <4 x i32>, <4 x i32>* [[TMP41]], align 4
-; O3DEFAULT-NEXT:    [[TMP43:%.*]] = add nsw <4 x i32> [[TMP42]], [[SHUFFLE]]
-; O3DEFAULT-NEXT:    [[TMP44:%.*]] = bitcast i32* [[ARRAYIDX2_40]] to <4 x i32>*
-; O3DEFAULT-NEXT:    store <4 x i32> [[TMP43]], <4 x i32>* [[TMP44]], align 4
-; O3DEFAULT-NEXT:    [[ARRAYIDX_44:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 44
-; O3DEFAULT-NEXT:    [[ARRAYIDX2_44:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 44
-; O3DEFAULT-NEXT:    [[TMP45:%.*]] = bitcast i32* [[ARRAYIDX_44]] to <4 x i32>*
-; O3DEFAULT-NEXT:    [[TMP46:%.*]] = load <4 x i32>, <4 x i32>* [[TMP45]], align 4
-; O3DEFAULT-NEXT:    [[TMP47:%.*]] = add nsw <4 x i32> [[TMP46]], [[SHUFFLE]]
-; O3DEFAULT-NEXT:    [[TMP48:%.*]] = bitcast i32* [[ARRAYIDX2_44]] to <4 x i32>*
-; O3DEFAULT-NEXT:    store <4 x i32> [[TMP47]], <4 x i32>* [[TMP48]], align 4
-; O3DEFAULT-NEXT:    [[TMP49:%.*]] = load i32, i32* [[A]], align 4
-; O3DEFAULT-NEXT:    ret i32 [[TMP49]]
+; O3DEFAULT-NEXT:    [[TMP0:%.*]] = load <4 x i32>, ptr [[B:%.*]], align 4
+; O3DEFAULT-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[N:%.*]], i64 0
+; O3DEFAULT-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> zeroinitializer
+; O3DEFAULT-NEXT:    [[TMP3:%.*]] = add nsw <4 x i32> [[TMP0]], [[TMP2]]
+; O3DEFAULT-NEXT:    store <4 x i32> [[TMP3]], ptr [[A:%.*]], align 4
+; O3DEFAULT-NEXT:    [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 4
+; O3DEFAULT-NEXT:    [[ARRAYIDX2_4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 4
+; O3DEFAULT-NEXT:    [[TMP4:%.*]] = load <4 x i32>, ptr [[ARRAYIDX_4]], align 4
+; O3DEFAULT-NEXT:    [[TMP5:%.*]] = add nsw <4 x i32> [[TMP4]], [[TMP2]]
+; O3DEFAULT-NEXT:    store <4 x i32> [[TMP5]], ptr [[ARRAYIDX2_4]], align 4
+; O3DEFAULT-NEXT:    [[ARRAYIDX_8:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 8
+; O3DEFAULT-NEXT:    [[ARRAYIDX2_8:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 8
+; O3DEFAULT-NEXT:    [[TMP6:%.*]] = load <4 x i32>, ptr [[ARRAYIDX_8]], align 4
+; O3DEFAULT-NEXT:    [[TMP7:%.*]] = add nsw <4 x i32> [[TMP6]], [[TMP2]]
+; O3DEFAULT-NEXT:    store <4 x i32> [[TMP7]], ptr [[ARRAYIDX2_8]], align 4
+; O3DEFAULT-NEXT:    [[ARRAYIDX_12:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 12
+; O3DEFAULT-NEXT:    [[ARRAYIDX2_12:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 12
+; O3DEFAULT-NEXT:    [[TMP8:%.*]] = load <4 x i32>, ptr [[ARRAYIDX_12]], align 4
+; O3DEFAULT-NEXT:    [[TMP9:%.*]] = add nsw <4 x i32> [[TMP8]], [[TMP2]]
+; O3DEFAULT-NEXT:    store <4 x i32> [[TMP9]], ptr [[ARRAYIDX2_12]], align 4
+; O3DEFAULT-NEXT:    [[ARRAYIDX_16:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 16
+; O3DEFAULT-NEXT:    [[ARRAYIDX2_16:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 16
+; O3DEFAULT-NEXT:    [[TMP10:%.*]] = load <4 x i32>, ptr [[ARRAYIDX_16]], align 4
+; O3DEFAULT-NEXT:    [[TMP11:%.*]] = add nsw <4 x i32> [[TMP10]], [[TMP2]]
+; O3DEFAULT-NEXT:    store <4 x i32> [[TMP11]], ptr [[ARRAYIDX2_16]], align 4
+; O3DEFAULT-NEXT:    [[ARRAYIDX_20:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 20
+; O3DEFAULT-NEXT:    [[ARRAYIDX2_20:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 20
+; O3DEFAULT-NEXT:    [[TMP12:%.*]] = load <4 x i32>, ptr [[ARRAYIDX_20]], align 4
+; O3DEFAULT-NEXT:    [[TMP13:%.*]] = add nsw <4 x i32> [[TMP12]], [[TMP2]]
+; O3DEFAULT-NEXT:    store <4 x i32> [[TMP13]], ptr [[ARRAYIDX2_20]], align 4
+; O3DEFAULT-NEXT:    [[ARRAYIDX_24:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 24
+; O3DEFAULT-NEXT:    [[ARRAYIDX2_24:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 24
+; O3DEFAULT-NEXT:    [[TMP14:%.*]] = load <4 x i32>, ptr [[ARRAYIDX_24]], align 4
+; O3DEFAULT-NEXT:    [[TMP15:%.*]] = add nsw <4 x i32> [[TMP14]], [[TMP2]]
+; O3DEFAULT-NEXT:    store <4 x i32> [[TMP15]], ptr [[ARRAYIDX2_24]], align 4
+; O3DEFAULT-NEXT:    [[ARRAYIDX_28:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 28
+; O3DEFAULT-NEXT:    [[ARRAYIDX2_28:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 28
+; O3DEFAULT-NEXT:    [[TMP16:%.*]] = load <4 x i32>, ptr [[ARRAYIDX_28]], align 4
+; O3DEFAULT-NEXT:    [[TMP17:%.*]] = add nsw <4 x i32> [[TMP16]], [[TMP2]]
+; O3DEFAULT-NEXT:    store <4 x i32> [[TMP17]], ptr [[ARRAYIDX2_28]], align 4
+; O3DEFAULT-NEXT:    [[ARRAYIDX_32:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 32
+; O3DEFAULT-NEXT:    [[ARRAYIDX2_32:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 32
+; O3DEFAULT-NEXT:    [[TMP18:%.*]] = load <4 x i32>, ptr [[ARRAYIDX_32]], align 4
+; O3DEFAULT-NEXT:    [[TMP19:%.*]] = add nsw <4 x i32> [[TMP18]], [[TMP2]]
+; O3DEFAULT-NEXT:    store <4 x i32> [[TMP19]], ptr [[ARRAYIDX2_32]], align 4
+; O3DEFAULT-NEXT:    [[ARRAYIDX_36:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 36
+; O3DEFAULT-NEXT:    [[ARRAYIDX2_36:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 36
+; O3DEFAULT-NEXT:    [[TMP20:%.*]] = load <4 x i32>, ptr [[ARRAYIDX_36]], align 4
+; O3DEFAULT-NEXT:    [[TMP21:%.*]] = add nsw <4 x i32> [[TMP20]], [[TMP2]]
+; O3DEFAULT-NEXT:    store <4 x i32> [[TMP21]], ptr [[ARRAYIDX2_36]], align 4
+; O3DEFAULT-NEXT:    [[ARRAYIDX_40:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 40
+; O3DEFAULT-NEXT:    [[ARRAYIDX2_40:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 40
+; O3DEFAULT-NEXT:    [[TMP22:%.*]] = load <4 x i32>, ptr [[ARRAYIDX_40]], align 4
+; O3DEFAULT-NEXT:    [[TMP23:%.*]] = add nsw <4 x i32> [[TMP22]], [[TMP2]]
+; O3DEFAULT-NEXT:    store <4 x i32> [[TMP23]], ptr [[ARRAYIDX2_40]], align 4
+; O3DEFAULT-NEXT:    [[ARRAYIDX_44:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 44
+; O3DEFAULT-NEXT:    [[ARRAYIDX2_44:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 44
+; O3DEFAULT-NEXT:    [[TMP24:%.*]] = load <4 x i32>, ptr [[ARRAYIDX_44]], align 4
+; O3DEFAULT-NEXT:    [[TMP25:%.*]] = add nsw <4 x i32> [[TMP24]], [[TMP2]]
+; O3DEFAULT-NEXT:    store <4 x i32> [[TMP25]], ptr [[ARRAYIDX2_44]], align 4
+; O3DEFAULT-NEXT:    [[TMP26:%.*]] = load i32, ptr [[A]], align 4
+; O3DEFAULT-NEXT:    ret i32 [[TMP26]]
 ;
 ; Os-LABEL: @disabled(
 ; Os-NEXT:  entry:
 ; Os-NEXT:    br label [[FOR_BODY:%.*]]
 ; Os:       for.body:
 ; Os-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; Os-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDVARS_IV]]
-; Os-NEXT:    [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; Os-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[INDVARS_IV]]
+; Os-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
 ; Os-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP0]], [[N:%.*]]
-; Os-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDVARS_IV]]
-; Os-NEXT:    store i32 [[ADD]], i32* [[ARRAYIDX2]], align 4
+; Os-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDVARS_IV]]
+; Os-NEXT:    store i32 [[ADD]], ptr [[ARRAYIDX2]], align 4
 ; Os-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; Os-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 48
 ; Os-NEXT:    br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; Os:       for.end:
-; Os-NEXT:    [[TMP1:%.*]] = load i32, i32* [[A]], align 4
+; Os-NEXT:    [[TMP1:%.*]] = load i32, ptr [[A]], align 4
 ; Os-NEXT:    ret i32 [[TMP1]]
 ;
 ; Oz-LABEL: @disabled(
@@ -1875,16 +1431,16 @@ define i32 @disabled(i32* noalias nocapture %a, i32* noalias nocapture readonly
 ; Oz-NEXT:    br label [[FOR_BODY:%.*]]
 ; Oz:       for.body:
 ; Oz-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; Oz-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDVARS_IV]]
-; Oz-NEXT:    [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; Oz-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[INDVARS_IV]]
+; Oz-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
 ; Oz-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP0]], [[N:%.*]]
-; Oz-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDVARS_IV]]
-; Oz-NEXT:    store i32 [[ADD]], i32* [[ARRAYIDX2]], align 4
+; Oz-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDVARS_IV]]
+; Oz-NEXT:    store i32 [[ADD]], ptr [[ARRAYIDX2]], align 4
 ; Oz-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; Oz-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 48
 ; Oz-NEXT:    br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; Oz:       for.end:
-; Oz-NEXT:    [[TMP1:%.*]] = load i32, i32* [[A]], align 4
+; Oz-NEXT:    [[TMP1:%.*]] = load i32, ptr [[A]], align 4
 ; Oz-NEXT:    ret i32 [[TMP1]]
 ;
 ; O1VEC2-LABEL: @disabled(
@@ -1892,16 +1448,16 @@ define i32 @disabled(i32* noalias nocapture %a, i32* noalias nocapture readonly
 ; O1VEC2-NEXT:    br label [[FOR_BODY:%.*]]
 ; O1VEC2:       for.body:
 ; O1VEC2-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; O1VEC2-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDVARS_IV]]
-; O1VEC2-NEXT:    [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; O1VEC2-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[INDVARS_IV]]
+; O1VEC2-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
 ; O1VEC2-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP0]], [[N:%.*]]
-; O1VEC2-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDVARS_IV]]
-; O1VEC2-NEXT:    store i32 [[ADD]], i32* [[ARRAYIDX2]], align 4
+; O1VEC2-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDVARS_IV]]
+; O1VEC2-NEXT:    store i32 [[ADD]], ptr [[ARRAYIDX2]], align 4
 ; O1VEC2-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; O1VEC2-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 48
 ; O1VEC2-NEXT:    br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; O1VEC2:       for.end:
-; O1VEC2-NEXT:    [[TMP1:%.*]] = load i32, i32* [[A]], align 4
+; O1VEC2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[A]], align 4
 ; O1VEC2-NEXT:    ret i32 [[TMP1]]
 ;
 ; OzVEC2-LABEL: @disabled(
@@ -1909,16 +1465,16 @@ define i32 @disabled(i32* noalias nocapture %a, i32* noalias nocapture readonly
 ; OzVEC2-NEXT:    br label [[FOR_BODY:%.*]]
 ; OzVEC2:       for.body:
 ; OzVEC2-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; OzVEC2-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDVARS_IV]]
-; OzVEC2-NEXT:    [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; OzVEC2-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[INDVARS_IV]]
+; OzVEC2-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
 ; OzVEC2-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP0]], [[N:%.*]]
-; OzVEC2-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDVARS_IV]]
-; OzVEC2-NEXT:    store i32 [[ADD]], i32* [[ARRAYIDX2]], align 4
+; OzVEC2-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDVARS_IV]]
+; OzVEC2-NEXT:    store i32 [[ADD]], ptr [[ARRAYIDX2]], align 4
 ; OzVEC2-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; OzVEC2-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 48
 ; OzVEC2-NEXT:    br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; OzVEC2:       for.end:
-; OzVEC2-NEXT:    [[TMP1:%.*]] = load i32, i32* [[A]], align 4
+; OzVEC2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[A]], align 4
 ; OzVEC2-NEXT:    ret i32 [[TMP1]]
 ;
 ; O3DIS-LABEL: @disabled(
@@ -1926,16 +1482,16 @@ define i32 @disabled(i32* noalias nocapture %a, i32* noalias nocapture readonly
 ; O3DIS-NEXT:    br label [[FOR_BODY:%.*]]
 ; O3DIS:       for.body:
 ; O3DIS-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; O3DIS-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDVARS_IV]]
-; O3DIS-NEXT:    [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; O3DIS-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[INDVARS_IV]]
+; O3DIS-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
 ; O3DIS-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP0]], [[N:%.*]]
-; O3DIS-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDVARS_IV]]
-; O3DIS-NEXT:    store i32 [[ADD]], i32* [[ARRAYIDX2]], align 4
+; O3DIS-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDVARS_IV]]
+; O3DIS-NEXT:    store i32 [[ADD]], ptr [[ARRAYIDX2]], align 4
 ; O3DIS-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; O3DIS-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 48
 ; O3DIS-NEXT:    br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; O3DIS:       for.end:
-; O3DIS-NEXT:    [[TMP1:%.*]] = load i32, i32* [[A]], align 4
+; O3DIS-NEXT:    [[TMP1:%.*]] = load i32, ptr [[A]], align 4
 ; O3DIS-NEXT:    ret i32 [[TMP1]]
 ;
 entry:
@@ -1943,17 +1499,17 @@ entry:
 
 for.body:                                         ; preds = %for.body, %entry
   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
-  %arrayidx = getelementptr inbounds i32, i32* %b, i64 %indvars.iv
-  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx = getelementptr inbounds i32, ptr %b, i64 %indvars.iv
+  %0 = load i32, ptr %arrayidx, align 4
   %add = add nsw i32 %0, %N
-  %arrayidx2 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
-  store i32 %add, i32* %arrayidx2, align 4
+  %arrayidx2 = getelementptr inbounds i32, ptr %a, i64 %indvars.iv
+  store i32 %add, ptr %arrayidx2, align 4
   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
   %exitcond = icmp eq i64 %indvars.iv.next, 48
   br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !2
 
 for.end:                                          ; preds = %for.body
-  %1 = load i32, i32* %a, align 4
+  %1 = load i32, ptr %a, align 4
   ret i32 %1
 }
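
The hunks above all follow the same mechanical pattern: with typed pointers, every wide load or store needed an explicit bitcast to a vector pointer type, while with opaque pointers the access type is carried by the instruction itself, so the bitcasts disappear and the unnamed %N values (and hence the TMP FileCheck variables) are renumbered. A minimal before/after sketch of that shape, using hypothetical values %a and %v:

  ; Typed pointers: a bitcast to <4 x i32>* precedes the wide store.
  %cast = bitcast i32* %a to <4 x i32>*
  store <4 x i32> %v, <4 x i32>* %cast, align 4

  ; Opaque pointers: the stored type lives on the instruction, no cast needed.
  store <4 x i32> %v, ptr %a, align 4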
 

diff --git a/llvm/test/Transforms/LoopVectorize/X86/optsize.ll b/llvm/test/Transforms/LoopVectorize/X86/optsize.ll
index 70e928778c73..f8b95f3dd6fb 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/optsize.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/optsize.ll
@@ -22,17 +22,15 @@ define i32 @foo_optsize() #0 {
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <64 x i32> [[BROADCAST_SPLATINSERT]], <64 x i32> poison, <64 x i32> zeroinitializer
 ; CHECK-NEXT:    [[VEC_IV:%.*]] = add <64 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
 ; CHECK-NEXT:    [[TMP1:%.*]] = icmp ule <64 x i32> [[VEC_IV]], <i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202>
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[TMP0]]
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to <64 x i8>*
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <64 x i8> @llvm.masked.load.v64i8.p0v64i8(<64 x i8>* [[TMP4]], i32 1, <64 x i1> [[TMP1]], <64 x i8> poison)
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq <64 x i8> [[WIDE_MASKED_LOAD]], zeroinitializer
-; CHECK-NEXT:    [[TMP6:%.*]] = select <64 x i1> [[TMP5]], <64 x i8> <i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2>, <64 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i8* [[TMP3]] to <64 x i8>*
-; CHECK-NEXT:    call void @llvm.masked.store.v64i8.p0v64i8(<64 x i8> [[TMP6]], <64 x i8>* [[TMP7]], i32 1, <64 x i1> [[TMP1]])
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [32 x i8], ptr @tab, i32 0, i32 [[TMP0]]
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr [[TMP3]], i32 1, <64 x i1> [[TMP1]], <64 x i8> poison)
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq <64 x i8> [[WIDE_MASKED_LOAD]], zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = select <64 x i1> [[TMP4]], <64 x i8> <i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2>, <64 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+; CHECK-NEXT:    call void @llvm.masked.store.v64i8.p0(<64 x i8> [[TMP5]], ptr [[TMP3]], i32 1, <64 x i1> [[TMP1]])
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 64
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256
-; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256
+; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -40,11 +38,11 @@ define i32 @foo_optsize() #0 {
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[I_08:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[I_08]]
-; CHECK-NEXT:    [[TMP9:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
-; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq i8 [[TMP9]], 0
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [32 x i8], ptr @tab, i32 0, i32 [[I_08]]
+; CHECK-NEXT:    [[TMP7:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq i8 [[TMP7]], 0
 ; CHECK-NEXT:    [[DOT:%.*]] = select i1 [[CMP1]], i8 2, i8 1
-; CHECK-NEXT:    store i8 [[DOT]], i8* [[ARRAYIDX]], align 1
+; CHECK-NEXT:    store i8 [[DOT]], ptr [[ARRAYIDX]], align 1
 ; CHECK-NEXT:    [[INC]] = add nsw i32 [[I_08]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[I_08]], 202
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
@@ -63,17 +61,15 @@ define i32 @foo_optsize() #0 {
 ; AUTOVF-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <32 x i32> [[BROADCAST_SPLATINSERT]], <32 x i32> poison, <32 x i32> zeroinitializer
 ; AUTOVF-NEXT:    [[VEC_IV:%.*]] = add <32 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 ; AUTOVF-NEXT:    [[TMP1:%.*]] = icmp ule <32 x i32> [[VEC_IV]], <i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202>
-; AUTOVF-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[TMP0]]
-; AUTOVF-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[TMP2]], i32 0
-; AUTOVF-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to <32 x i8>*
-; AUTOVF-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <32 x i8> @llvm.masked.load.v32i8.p0v32i8(<32 x i8>* [[TMP4]], i32 1, <32 x i1> [[TMP1]], <32 x i8> poison)
-; AUTOVF-NEXT:    [[TMP5:%.*]] = icmp eq <32 x i8> [[WIDE_MASKED_LOAD]], zeroinitializer
-; AUTOVF-NEXT:    [[TMP6:%.*]] = select <32 x i1> [[TMP5]], <32 x i8> <i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2>, <32 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
-; AUTOVF-NEXT:    [[TMP7:%.*]] = bitcast i8* [[TMP3]] to <32 x i8>*
-; AUTOVF-NEXT:    call void @llvm.masked.store.v32i8.p0v32i8(<32 x i8> [[TMP6]], <32 x i8>* [[TMP7]], i32 1, <32 x i1> [[TMP1]])
+; AUTOVF-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [32 x i8], ptr @tab, i32 0, i32 [[TMP0]]
+; AUTOVF-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0
+; AUTOVF-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr [[TMP3]], i32 1, <32 x i1> [[TMP1]], <32 x i8> poison)
+; AUTOVF-NEXT:    [[TMP4:%.*]] = icmp eq <32 x i8> [[WIDE_MASKED_LOAD]], zeroinitializer
+; AUTOVF-NEXT:    [[TMP5:%.*]] = select <32 x i1> [[TMP4]], <32 x i8> <i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2>, <32 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+; AUTOVF-NEXT:    call void @llvm.masked.store.v32i8.p0(<32 x i8> [[TMP5]], ptr [[TMP3]], i32 1, <32 x i1> [[TMP1]])
 ; AUTOVF-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 32
-; AUTOVF-NEXT:    [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], 224
-; AUTOVF-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; AUTOVF-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 224
+; AUTOVF-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; AUTOVF:       middle.block:
 ; AUTOVF-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; AUTOVF:       scalar.ph:
@@ -81,11 +77,11 @@ define i32 @foo_optsize() #0 {
 ; AUTOVF-NEXT:    br label [[FOR_BODY:%.*]]
 ; AUTOVF:       for.body:
 ; AUTOVF-NEXT:    [[I_08:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
-; AUTOVF-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[I_08]]
-; AUTOVF-NEXT:    [[TMP9:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
-; AUTOVF-NEXT:    [[CMP1:%.*]] = icmp eq i8 [[TMP9]], 0
+; AUTOVF-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [32 x i8], ptr @tab, i32 0, i32 [[I_08]]
+; AUTOVF-NEXT:    [[TMP7:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; AUTOVF-NEXT:    [[CMP1:%.*]] = icmp eq i8 [[TMP7]], 0
 ; AUTOVF-NEXT:    [[DOT:%.*]] = select i1 [[CMP1]], i8 2, i8 1
-; AUTOVF-NEXT:    store i8 [[DOT]], i8* [[ARRAYIDX]], align 1
+; AUTOVF-NEXT:    store i8 [[DOT]], ptr [[ARRAYIDX]], align 1
 ; AUTOVF-NEXT:    [[INC]] = add nsw i32 [[I_08]], 1
 ; AUTOVF-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[I_08]], 202
 ; AUTOVF-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
@@ -98,11 +94,11 @@ entry:
 
 for.body:                                         ; preds = %for.body, %entry
   %i.08 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
-  %arrayidx = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 %i.08
-  %0 = load i8, i8* %arrayidx, align 1
+  %arrayidx = getelementptr inbounds [32 x i8], ptr @tab, i32 0, i32 %i.08
+  %0 = load i8, ptr %arrayidx, align 1
   %cmp1 = icmp eq i8 %0, 0
   %. = select i1 %cmp1, i8 2, i8 1
-  store i8 %., i8* %arrayidx, align 1
+  store i8 %., ptr %arrayidx, align 1
   %inc = add nsw i32 %i.08, 1
   %exitcond = icmp eq i32 %i.08, 202
   br i1 %exitcond, label %for.end, label %for.body
@@ -126,17 +122,15 @@ define i32 @foo_minsize() #1 {
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <64 x i32> [[BROADCAST_SPLATINSERT]], <64 x i32> poison, <64 x i32> zeroinitializer
 ; CHECK-NEXT:    [[VEC_IV:%.*]] = add <64 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
 ; CHECK-NEXT:    [[TMP1:%.*]] = icmp ule <64 x i32> [[VEC_IV]], <i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202>
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[TMP0]]
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to <64 x i8>*
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <64 x i8> @llvm.masked.load.v64i8.p0v64i8(<64 x i8>* [[TMP4]], i32 1, <64 x i1> [[TMP1]], <64 x i8> poison)
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq <64 x i8> [[WIDE_MASKED_LOAD]], zeroinitializer
-; CHECK-NEXT:    [[TMP6:%.*]] = select <64 x i1> [[TMP5]], <64 x i8> <i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2>, <64 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i8* [[TMP3]] to <64 x i8>*
-; CHECK-NEXT:    call void @llvm.masked.store.v64i8.p0v64i8(<64 x i8> [[TMP6]], <64 x i8>* [[TMP7]], i32 1, <64 x i1> [[TMP1]])
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [32 x i8], ptr @tab, i32 0, i32 [[TMP0]]
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr [[TMP3]], i32 1, <64 x i1> [[TMP1]], <64 x i8> poison)
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq <64 x i8> [[WIDE_MASKED_LOAD]], zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = select <64 x i1> [[TMP4]], <64 x i8> <i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2>, <64 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+; CHECK-NEXT:    call void @llvm.masked.store.v64i8.p0(<64 x i8> [[TMP5]], ptr [[TMP3]], i32 1, <64 x i1> [[TMP1]])
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 64
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256
-; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256
+; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -144,11 +138,11 @@ define i32 @foo_minsize() #1 {
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[I_08:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[I_08]]
-; CHECK-NEXT:    [[TMP9:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
-; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq i8 [[TMP9]], 0
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [32 x i8], ptr @tab, i32 0, i32 [[I_08]]
+; CHECK-NEXT:    [[TMP7:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq i8 [[TMP7]], 0
 ; CHECK-NEXT:    [[DOT:%.*]] = select i1 [[CMP1]], i8 2, i8 1
-; CHECK-NEXT:    store i8 [[DOT]], i8* [[ARRAYIDX]], align 1
+; CHECK-NEXT:    store i8 [[DOT]], ptr [[ARRAYIDX]], align 1
 ; CHECK-NEXT:    [[INC]] = add nsw i32 [[I_08]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[I_08]], 202
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
@@ -167,17 +161,15 @@ define i32 @foo_minsize() #1 {
 ; AUTOVF-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <32 x i32> [[BROADCAST_SPLATINSERT]], <32 x i32> poison, <32 x i32> zeroinitializer
 ; AUTOVF-NEXT:    [[VEC_IV:%.*]] = add <32 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 ; AUTOVF-NEXT:    [[TMP1:%.*]] = icmp ule <32 x i32> [[VEC_IV]], <i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202>
-; AUTOVF-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[TMP0]]
-; AUTOVF-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[TMP2]], i32 0
-; AUTOVF-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to <32 x i8>*
-; AUTOVF-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <32 x i8> @llvm.masked.load.v32i8.p0v32i8(<32 x i8>* [[TMP4]], i32 1, <32 x i1> [[TMP1]], <32 x i8> poison)
-; AUTOVF-NEXT:    [[TMP5:%.*]] = icmp eq <32 x i8> [[WIDE_MASKED_LOAD]], zeroinitializer
-; AUTOVF-NEXT:    [[TMP6:%.*]] = select <32 x i1> [[TMP5]], <32 x i8> <i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2>, <32 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
-; AUTOVF-NEXT:    [[TMP7:%.*]] = bitcast i8* [[TMP3]] to <32 x i8>*
-; AUTOVF-NEXT:    call void @llvm.masked.store.v32i8.p0v32i8(<32 x i8> [[TMP6]], <32 x i8>* [[TMP7]], i32 1, <32 x i1> [[TMP1]])
+; AUTOVF-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [32 x i8], ptr @tab, i32 0, i32 [[TMP0]]
+; AUTOVF-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0
+; AUTOVF-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr [[TMP3]], i32 1, <32 x i1> [[TMP1]], <32 x i8> poison)
+; AUTOVF-NEXT:    [[TMP4:%.*]] = icmp eq <32 x i8> [[WIDE_MASKED_LOAD]], zeroinitializer
+; AUTOVF-NEXT:    [[TMP5:%.*]] = select <32 x i1> [[TMP4]], <32 x i8> <i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2>, <32 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+; AUTOVF-NEXT:    call void @llvm.masked.store.v32i8.p0(<32 x i8> [[TMP5]], ptr [[TMP3]], i32 1, <32 x i1> [[TMP1]])
 ; AUTOVF-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 32
-; AUTOVF-NEXT:    [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], 224
-; AUTOVF-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; AUTOVF-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 224
+; AUTOVF-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; AUTOVF:       middle.block:
 ; AUTOVF-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; AUTOVF:       scalar.ph:
@@ -185,11 +177,11 @@ define i32 @foo_minsize() #1 {
 ; AUTOVF-NEXT:    br label [[FOR_BODY:%.*]]
 ; AUTOVF:       for.body:
 ; AUTOVF-NEXT:    [[I_08:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
-; AUTOVF-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[I_08]]
-; AUTOVF-NEXT:    [[TMP9:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
-; AUTOVF-NEXT:    [[CMP1:%.*]] = icmp eq i8 [[TMP9]], 0
+; AUTOVF-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [32 x i8], ptr @tab, i32 0, i32 [[I_08]]
+; AUTOVF-NEXT:    [[TMP7:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; AUTOVF-NEXT:    [[CMP1:%.*]] = icmp eq i8 [[TMP7]], 0
 ; AUTOVF-NEXT:    [[DOT:%.*]] = select i1 [[CMP1]], i8 2, i8 1
-; AUTOVF-NEXT:    store i8 [[DOT]], i8* [[ARRAYIDX]], align 1
+; AUTOVF-NEXT:    store i8 [[DOT]], ptr [[ARRAYIDX]], align 1
 ; AUTOVF-NEXT:    [[INC]] = add nsw i32 [[I_08]], 1
 ; AUTOVF-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[I_08]], 202
 ; AUTOVF-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
@@ -202,11 +194,11 @@ entry:
 
 for.body:                                         ; preds = %for.body, %entry
   %i.08 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
-  %arrayidx = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 %i.08
-  %0 = load i8, i8* %arrayidx, align 1
+  %arrayidx = getelementptr inbounds [32 x i8], ptr @tab, i32 0, i32 %i.08
+  %0 = load i8, ptr %arrayidx, align 1
   %cmp1 = icmp eq i8 %0, 0
   %. = select i1 %cmp1, i8 2, i8 1
-  store i8 %., i8* %arrayidx, align 1
+  store i8 %., ptr %arrayidx, align 1
   %inc = add nsw i32 %i.08, 1
   %exitcond = icmp eq i32 %i.08, 202
   br i1 %exitcond, label %for.end, label %for.body
@@ -219,7 +211,7 @@ attributes #1 = { minsize }
 
 
 ; We can vectorize this one by refraining from versioning for stride==1.
-define void @scev4stride1(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32 %k) #2 {
+define void @scev4stride1(ptr noalias nocapture %a, ptr noalias nocapture readonly %b, i32 %k) #2 {
 ; CHECK-LABEL: @scev4stride1(
 ; CHECK-NEXT:  for.body.preheader:
 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
@@ -232,16 +224,15 @@ define void @scev4stride1(i32* noalias nocapture %a, i32* noalias nocapture read
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <64 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = mul nsw <64 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], <64 x i32> [[TMP1]]
-; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <64 x i32> @llvm.masked.gather.v64i32.v64p0i32(<64 x i32*> [[TMP2]], i32 4, <64 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <64 x i32> poison)
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[TMP0]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP3]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <64 x i32>*
-; CHECK-NEXT:    store <64 x i32> [[WIDE_MASKED_GATHER]], <64 x i32>* [[TMP5]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], <64 x i32> [[TMP1]]
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <64 x i32> @llvm.masked.gather.v64i32.v64p0(<64 x ptr> [[TMP2]], i32 4, <64 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <64 x i32> poison)
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[TMP0]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 0
+; CHECK-NEXT:    store <64 x i32> [[WIDE_MASKED_GATHER]], ptr [[TMP4]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 64
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <64 x i32> [[VEC_IND]], <i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64>
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256
-; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256
+; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 256, 256
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
@@ -251,10 +242,10 @@ define void @scev4stride1(i32* noalias nocapture %a, i32* noalias nocapture read
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[I_07:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i32 [[I_07]], [[K]]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 [[MUL]]
-; CHECK-NEXT:    [[TMP7:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[I_07]]
-; CHECK-NEXT:    store i32 [[TMP7]], i32* [[ARRAYIDX1]], align 4
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[MUL]]
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[I_07]]
+; CHECK-NEXT:    store i32 [[TMP6]], ptr [[ARRAYIDX1]], align 4
 ; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[I_07]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 256
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
@@ -273,16 +264,15 @@ define void @scev4stride1(i32* noalias nocapture %a, i32* noalias nocapture read
 ; AUTOVF-NEXT:    [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; AUTOVF-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
 ; AUTOVF-NEXT:    [[TMP1:%.*]] = mul nsw <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; AUTOVF-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], <8 x i32> [[TMP1]]
-; AUTOVF-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> [[TMP2]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> poison)
-; AUTOVF-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[TMP0]]
-; AUTOVF-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP3]], i32 0
-; AUTOVF-NEXT:    [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <8 x i32>*
-; AUTOVF-NEXT:    store <8 x i32> [[WIDE_MASKED_GATHER]], <8 x i32>* [[TMP5]], align 4
+; AUTOVF-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], <8 x i32> [[TMP1]]
+; AUTOVF-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[TMP2]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> poison)
+; AUTOVF-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[TMP0]]
+; AUTOVF-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 0
+; AUTOVF-NEXT:    store <8 x i32> [[WIDE_MASKED_GATHER]], ptr [[TMP4]], align 4
 ; AUTOVF-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
 ; AUTOVF-NEXT:    [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
-; AUTOVF-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256
-; AUTOVF-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; AUTOVF-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256
+; AUTOVF-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; AUTOVF:       middle.block:
 ; AUTOVF-NEXT:    [[CMP_N:%.*]] = icmp eq i32 256, 256
 ; AUTOVF-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
@@ -292,10 +282,10 @@ define void @scev4stride1(i32* noalias nocapture %a, i32* noalias nocapture read
 ; AUTOVF:       for.body:
 ; AUTOVF-NEXT:    [[I_07:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; AUTOVF-NEXT:    [[MUL:%.*]] = mul nsw i32 [[I_07]], [[K]]
-; AUTOVF-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 [[MUL]]
-; AUTOVF-NEXT:    [[TMP7:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; AUTOVF-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[I_07]]
-; AUTOVF-NEXT:    store i32 [[TMP7]], i32* [[ARRAYIDX1]], align 4
+; AUTOVF-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[MUL]]
+; AUTOVF-NEXT:    [[TMP6:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; AUTOVF-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[I_07]]
+; AUTOVF-NEXT:    store i32 [[TMP6]], ptr [[ARRAYIDX1]], align 4
 ; AUTOVF-NEXT:    [[INC]] = add nuw nsw i32 [[I_07]], 1
 ; AUTOVF-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 256
 ; AUTOVF-NEXT:    br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
@@ -308,10 +298,10 @@ for.body.preheader:
 for.body:                                         ; preds = %for.body.preheader, %for.body
   %i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
   %mul = mul nsw i32 %i.07, %k
-  %arrayidx = getelementptr inbounds i32, i32* %b, i32 %mul
-  %0 = load i32, i32* %arrayidx, align 4
-  %arrayidx1 = getelementptr inbounds i32, i32* %a, i32 %i.07
-  store i32 %0, i32* %arrayidx1, align 4
+  %arrayidx = getelementptr inbounds i32, ptr %b, i32 %mul
+  %0 = load i32, ptr %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds i32, ptr %a, i32 %i.07
+  store i32 %0, ptr %arrayidx1, align 4
   %inc = add nuw nsw i32 %i.07, 1
   %exitcond = icmp eq i32 %inc, 256
   br i1 %exitcond, label %for.end.loopexit, label %for.body
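
Overloaded memory intrinsics are mangled by their pointer operand types, so the conversion also renames them: the pointee component drops out of the suffix, e.g. @llvm.masked.load.v64i8.p0v64i8 becomes @llvm.masked.load.v64i8.p0, and @llvm.masked.gather.v64i32.v64p0i32 becomes @llvm.masked.gather.v64i32.v64p0, as in the hunks above. A sketch of the corresponding opaque-pointer declarations:

  declare <64 x i8> @llvm.masked.load.v64i8.p0(ptr, i32 immarg, <64 x i1>, <64 x i8>)
  declare <64 x i32> @llvm.masked.gather.v64i32.v64p0(<64 x ptr>, i32 immarg, <64 x i1>, <64 x i32>)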

diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr34438.ll b/llvm/test/Transforms/LoopVectorize/X86/pr34438.ll
index 5789d86c9a19..96046ddfe9a0 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/pr34438.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/pr34438.ll
@@ -9,7 +9,7 @@
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.8.0"
 
-define void @small_tc(float* noalias nocapture %A, float* noalias nocapture readonly %B) {
+define void @small_tc(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) {
 ; CHECK-LABEL: @small_tc(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
@@ -18,19 +18,16 @@ define void @small_tc(float* noalias nocapture %A, float* noalias nocapture read
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, float* [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float* [[TMP2]] to <8 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x float>, <8 x float>* [[TMP3]], align 4, !llvm.access.group !0
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast float* [[TMP5]] to <8 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <8 x float>, <8 x float>* [[TMP6]], align 4, !llvm.access.group !0
-; CHECK-NEXT:    [[TMP7:%.*]] = fadd fast <8 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]]
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast float* [[TMP5]] to <8 x float>*
-; CHECK-NEXT:    store <8 x float> [[TMP7]], <8 x float>* [[TMP8]], align 4, !llvm.access.group !0
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x float>, ptr [[TMP2]], align 4, !llvm.access.group [[ACC_GRP0:![0-9]+]]
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP3]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <8 x float>, ptr [[TMP4]], align 4, !llvm.access.group [[ACC_GRP0]]
+; CHECK-NEXT:    [[TMP5:%.*]] = fadd fast <8 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]]
+; CHECK-NEXT:    store <8 x float> [[TMP5]], ptr [[TMP4]], align 4, !llvm.access.group [[ACC_GRP0]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT:    br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP1:!llvm.loop !.*]]
+; CHECK-NEXT:    br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP1:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 8, 8
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -39,15 +36,15 @@ define void @small_tc(float* noalias nocapture %A, float* noalias nocapture read
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP10:%.*]] = load float, float* [[ARRAYIDX]], align 4, !llvm.access.group !0
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP11:%.*]] = load float, float* [[ARRAYIDX2]], align 4, !llvm.access.group !0
-; CHECK-NEXT:    [[ADD:%.*]] = fadd fast float [[TMP10]], [[TMP11]]
-; CHECK-NEXT:    store float [[ADD]], float* [[ARRAYIDX2]], align 4, !llvm.access.group !0
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP6:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP0]]
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP7:%.*]] = load float, ptr [[ARRAYIDX2]], align 4, !llvm.access.group [[ACC_GRP0]]
+; CHECK-NEXT:    [[ADD:%.*]] = fadd fast float [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    store float [[ADD]], ptr [[ARRAYIDX2]], align 4, !llvm.access.group [[ACC_GRP0]]
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 8
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], [[LOOP3:!llvm.loop !.*]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK:       for.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -56,12 +53,12 @@ entry:
 
 for.body:
   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
-  %arrayidx = getelementptr inbounds float, float* %B, i64 %indvars.iv
-  %0 = load float, float* %arrayidx, align 4, !llvm.access.group !5
-  %arrayidx2 = getelementptr inbounds float, float* %A, i64 %indvars.iv
-  %1 = load float, float* %arrayidx2, align 4, !llvm.access.group !5
+  %arrayidx = getelementptr inbounds float, ptr %B, i64 %indvars.iv
+  %0 = load float, ptr %arrayidx, align 4, !llvm.access.group !5
+  %arrayidx2 = getelementptr inbounds float, ptr %A, i64 %indvars.iv
+  %1 = load float, ptr %arrayidx2, align 4, !llvm.access.group !5
   %add = fadd fast float %0, %1
-  store float %add, float* %arrayidx2, align 4, !llvm.access.group !5
+  store float %add, ptr %arrayidx2, align 4, !llvm.access.group !5
   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
   %exitcond = icmp eq i64 %indvars.iv.next, 8
   br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !4

diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr35432.ll b/llvm/test/Transforms/LoopVectorize/X86/pr35432.ll
index e2b4e4f3e151..36dee12e4d7d 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/pr35432.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/pr35432.ll
@@ -8,104 +8,100 @@ target triple = "x86_64-unknown-linux-gnu"
 
 @a = common local_unnamed_addr global [192 x [192 x i32]] zeroinitializer, align 16
 
-define i32 @main(i32* %ptr) {
+define i32 @main(ptr %ptr) {
 ; CHECK-LABEL: @main(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[I:%.*]] = alloca i32, align 4
 ; CHECK-NEXT:    [[S:%.*]] = alloca i16, align 2
-; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[I]] to i8*
-; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull [[TMP0]])
-; CHECK-NEXT:    store i32 0, i32* [[I]], align 4
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16* [[S]] to i8*
-; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 2, i8* nonnull [[TMP1]])
-; CHECK-NEXT:    [[CALL:%.*]] = call i32 (i32*, ...) bitcast (i32 (...)* @goo to i32 (i32*, ...)*)(i32* nonnull [[I]])
-; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[I]], align 4
-; CHECK-NEXT:    [[STOREMERGE6:%.*]] = trunc i32 [[TMP2]] to i16
-; CHECK-NEXT:    store i16 [[STOREMERGE6]], i16* [[S]], align 2
-; CHECK-NEXT:    [[CONV17:%.*]] = and i32 [[TMP2]], 65472
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr nonnull [[I]])
+; CHECK-NEXT:    store i32 0, ptr [[I]], align 4
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 2, ptr nonnull [[S]])
+; CHECK-NEXT:    [[CALL:%.*]] = call i32 (ptr, ...) @goo(ptr nonnull [[I]])
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[I]], align 4
+; CHECK-NEXT:    [[STOREMERGE6:%.*]] = trunc i32 [[TMP0]] to i16
+; CHECK-NEXT:    store i16 [[STOREMERGE6]], ptr [[S]], align 2
+; CHECK-NEXT:    [[CONV17:%.*]] = and i32 [[TMP0]], 65472
 ; CHECK-NEXT:    [[CMP8:%.*]] = icmp eq i32 [[CONV17]], 0
 ; CHECK-NEXT:    br i1 [[CMP8]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END12:%.*]]
 ; CHECK:       for.body.lr.ph:
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
-; CHECK-NEXT:    [[STOREMERGE_IN9:%.*]] = phi i32 [ [[TMP2]], [[FOR_BODY_LR_PH]] ], [ [[ADD:%.*]], [[FOR_INC9:%.*]] ]
+; CHECK-NEXT:    [[STOREMERGE_IN9:%.*]] = phi i32 [ [[TMP0]], [[FOR_BODY_LR_PH]] ], [ [[ADD:%.*]], [[FOR_INC9:%.*]] ]
 ; CHECK-NEXT:    [[CONV52:%.*]] = and i32 [[STOREMERGE_IN9]], 255
-; CHECK-NEXT:    [[CMP63:%.*]] = icmp ult i32 [[TMP2]], [[CONV52]]
+; CHECK-NEXT:    [[CMP63:%.*]] = icmp ult i32 [[TMP0]], [[CONV52]]
 ; CHECK-NEXT:    br i1 [[CMP63]], label [[FOR_BODY8_LR_PH:%.*]], label [[FOR_INC9]]
 ; CHECK:       for.body8.lr.ph:
 ; CHECK-NEXT:    [[CONV3:%.*]] = trunc i32 [[STOREMERGE_IN9]] to i8
-; CHECK-NEXT:    [[DOTPROMOTED:%.*]] = load i32, i32* getelementptr inbounds ([192 x [192 x i32]], [192 x [192 x i32]]* @a, i64 0, i64 0, i64 0), align 16
-; CHECK-NEXT:    [[TMP3:%.*]] = add i8 [[CONV3]], -1
-; CHECK-NEXT:    [[TMP4:%.*]] = zext i8 [[TMP3]] to i32
-; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[TMP4]], 1
-; CHECK-NEXT:    [[UMIN1:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP2]], i32 [[TMP4]])
-; CHECK-NEXT:    [[TMP6:%.*]] = sub i32 [[TMP5]], [[UMIN1]]
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP6]], 32
+; CHECK-NEXT:    [[DOTPROMOTED:%.*]] = load i32, ptr @a, align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = add i8 [[CONV3]], -1
+; CHECK-NEXT:    [[TMP2:%.*]] = zext i8 [[TMP1]] to i32
+; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[TMP2]], 1
+; CHECK-NEXT:    [[UMIN1:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP0]], i32 [[TMP2]])
+; CHECK-NEXT:    [[TMP4:%.*]] = sub i32 [[TMP3]], [[UMIN1]]
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP4]], 32
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
 ; CHECK:       vector.scevcheck:
-; CHECK-NEXT:    [[TMP7:%.*]] = add i8 [[CONV3]], -1
-; CHECK-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP7]] to i32
-; CHECK-NEXT:    [[UMIN:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP2]], i32 [[TMP8]])
-; CHECK-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP8]], [[UMIN]]
-; CHECK-NEXT:    [[TMP10:%.*]] = trunc i32 [[TMP9]] to i8
-; CHECK-NEXT:    [[MUL:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 1, i8 [[TMP10]])
+; CHECK-NEXT:    [[TMP5:%.*]] = add i8 [[CONV3]], -1
+; CHECK-NEXT:    [[TMP6:%.*]] = zext i8 [[TMP5]] to i32
+; CHECK-NEXT:    [[UMIN:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP0]], i32 [[TMP6]])
+; CHECK-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP6]], [[UMIN]]
+; CHECK-NEXT:    [[TMP8:%.*]] = trunc i32 [[TMP7]] to i8
+; CHECK-NEXT:    [[MUL:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 1, i8 [[TMP8]])
 ; CHECK-NEXT:    [[MUL_RESULT:%.*]] = extractvalue { i8, i1 } [[MUL]], 0
 ; CHECK-NEXT:    [[MUL_OVERFLOW:%.*]] = extractvalue { i8, i1 } [[MUL]], 1
-; CHECK-NEXT:    [[TMP11:%.*]] = sub i8 [[TMP7]], [[MUL_RESULT]]
-; CHECK-NEXT:    [[TMP12:%.*]] = icmp ugt i8 [[TMP11]], [[TMP7]]
-; CHECK-NEXT:    [[TMP13:%.*]] = or i1 [[TMP12]], [[MUL_OVERFLOW]]
-; CHECK-NEXT:    [[TMP14:%.*]] = icmp ugt i32 [[TMP9]], 255
-; CHECK-NEXT:    [[TMP15:%.*]] = or i1 [[TMP13]], [[TMP14]]
-; CHECK-NEXT:    [[TMP16:%.*]] = add i32 [[DOTPROMOTED]], 1
-; CHECK-NEXT:    [[TMP17:%.*]] = add i32 [[TMP16]], [[TMP9]]
-; CHECK-NEXT:    [[TMP18:%.*]] = icmp slt i32 [[TMP17]], [[TMP16]]
-; CHECK-NEXT:    [[TMP19:%.*]] = or i1 [[TMP15]], [[TMP18]]
-; CHECK-NEXT:    br i1 [[TMP19]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT:    [[TMP9:%.*]] = sub i8 [[TMP5]], [[MUL_RESULT]]
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp ugt i8 [[TMP9]], [[TMP5]]
+; CHECK-NEXT:    [[TMP11:%.*]] = or i1 [[TMP10]], [[MUL_OVERFLOW]]
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp ugt i32 [[TMP7]], 255
+; CHECK-NEXT:    [[TMP13:%.*]] = or i1 [[TMP11]], [[TMP12]]
+; CHECK-NEXT:    [[TMP14:%.*]] = add i32 [[DOTPROMOTED]], 1
+; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP14]], [[TMP7]]
+; CHECK-NEXT:    [[TMP16:%.*]] = icmp slt i32 [[TMP15]], [[TMP14]]
+; CHECK-NEXT:    [[TMP17:%.*]] = or i1 [[TMP13]], [[TMP16]]
+; CHECK-NEXT:    br i1 [[TMP17]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[TMP6]], 8
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i32 [[TMP6]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[TMP4]], 8
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i32 [[TMP4]], [[N_MOD_VF]]
 ; CHECK-NEXT:    [[IND_END:%.*]] = add i32 [[DOTPROMOTED]], [[N_VEC]]
-; CHECK-NEXT:    [[CAST_CRD:%.*]] = trunc i32 [[N_VEC]] to i8
-; CHECK-NEXT:    [[IND_END3:%.*]] = sub i8 [[CONV3]], [[CAST_CRD]]
+; CHECK-NEXT:    [[DOTCAST:%.*]] = trunc i32 [[N_VEC]] to i8
+; CHECK-NEXT:    [[IND_END2:%.*]] = sub i8 [[CONV3]], [[DOTCAST]]
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = add i32 [[DOTPROMOTED]], [[INDEX]]
-; CHECK-NEXT:    [[TMP20:%.*]] = add i32 [[OFFSET_IDX]], 0
-; CHECK-NEXT:    [[TMP21:%.*]] = add i32 [[OFFSET_IDX]], 4
-; CHECK-NEXT:    [[TMP25:%.*]] = add i32 [[TMP20]], 1
-; CHECK-NEXT:    [[TMP26:%.*]] = add i32 [[TMP21]], 1
-; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i32, i32* [[PTR:%.*]], i32 [[TMP25]]
-; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[PTR]], i32 [[TMP26]]
-; CHECK-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i32, i32* [[TMP29]], i32 0
-; CHECK-NEXT:    [[TMP32:%.*]] = bitcast i32* [[TMP31]] to <4 x i32>*
-; CHECK-NEXT:    store <4 x i32> zeroinitializer, <4 x i32>* [[TMP32]], align 4
-; CHECK-NEXT:    [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[TMP29]], i32 4
-; CHECK-NEXT:    [[TMP34:%.*]] = bitcast i32* [[TMP33]] to <4 x i32>*
-; CHECK-NEXT:    store <4 x i32> zeroinitializer, <4 x i32>* [[TMP34]], align 4
+; CHECK-NEXT:    [[TMP18:%.*]] = add i32 [[OFFSET_IDX]], 0
+; CHECK-NEXT:    [[TMP19:%.*]] = add i32 [[OFFSET_IDX]], 4
+; CHECK-NEXT:    [[TMP20:%.*]] = add i32 [[TMP18]], 1
+; CHECK-NEXT:    [[TMP21:%.*]] = add i32 [[TMP19]], 1
+; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i32 [[TMP20]]
+; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[PTR]], i32 [[TMP21]]
+; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[TMP22]], i32 0
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr [[TMP24]], align 4
+; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[TMP22]], i32 4
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr [[TMP25]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
-; CHECK-NEXT:    [[TMP37:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP37]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    [[TMP26:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[TMP6]], [[N_VEC]]
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[TMP4]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND4_FOR_INC9_CRIT_EDGE:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[DOTPROMOTED]], [[FOR_BODY8_LR_PH]] ], [ [[DOTPROMOTED]], [[VECTOR_SCEVCHECK]] ]
-; CHECK-NEXT:    [[BC_RESUME_VAL2:%.*]] = phi i8 [ [[IND_END3]], [[MIDDLE_BLOCK]] ], [ [[CONV3]], [[FOR_BODY8_LR_PH]] ], [ [[CONV3]], [[VECTOR_SCEVCHECK]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL3:%.*]] = phi i8 [ [[IND_END2]], [[MIDDLE_BLOCK]] ], [ [[CONV3]], [[FOR_BODY8_LR_PH]] ], [ [[CONV3]], [[VECTOR_SCEVCHECK]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY8:%.*]]
 ; CHECK:       for.body8:
 ; CHECK-NEXT:    [[INC5:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY8]] ]
-; CHECK-NEXT:    [[C_04:%.*]] = phi i8 [ [[BC_RESUME_VAL2]], [[SCALAR_PH]] ], [ [[DEC:%.*]], [[FOR_BODY8]] ]
+; CHECK-NEXT:    [[C_04:%.*]] = phi i8 [ [[BC_RESUME_VAL3]], [[SCALAR_PH]] ], [ [[DEC:%.*]], [[FOR_BODY8]] ]
 ; CHECK-NEXT:    [[INC]] = add i32 [[INC5]], 1
 ; CHECK-NEXT:    [[DEC]] = add i8 [[C_04]], -1
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i32, i32* [[PTR]], i32 [[INC]]
-; CHECK-NEXT:    store i32 0, i32* [[GEP]], align 4
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i32, ptr [[PTR]], i32 [[INC]]
+; CHECK-NEXT:    store i32 0, ptr [[GEP]], align 4
 ; CHECK-NEXT:    [[CONV5:%.*]] = zext i8 [[DEC]] to i32
-; CHECK-NEXT:    [[CMP6:%.*]] = icmp ult i32 [[TMP2]], [[CONV5]]
+; CHECK-NEXT:    [[CMP6:%.*]] = icmp ult i32 [[TMP0]], [[CONV5]]
 ; CHECK-NEXT:    br i1 [[CMP6]], label [[FOR_BODY8]], label [[FOR_COND4_FOR_INC9_CRIT_EDGE]], !llvm.loop [[LOOP2:![0-9]+]]
 ; CHECK:       for.cond4.for.inc9_crit_edge:
 ; CHECK-NEXT:    [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], [[FOR_BODY8]] ], [ [[IND_END]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    store i32 [[INC_LCSSA]], i32* getelementptr inbounds ([192 x [192 x i32]], [192 x [192 x i32]]* @a, i64 0, i64 0, i64 0), align 16
+; CHECK-NEXT:    store i32 [[INC_LCSSA]], ptr @a, align 16
 ; CHECK-NEXT:    br label [[FOR_INC9]]
 ; CHECK:       for.inc9:
 ; CHECK-NEXT:    [[CONV10:%.*]] = and i32 [[STOREMERGE_IN9]], 65535
@@ -116,27 +112,25 @@ define i32 @main(i32* %ptr) {
 ; CHECK:       for.cond.for.end12_crit_edge:
 ; CHECK-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_INC9]] ]
 ; CHECK-NEXT:    [[STOREMERGE:%.*]] = trunc i32 [[ADD_LCSSA]] to i16
-; CHECK-NEXT:    store i16 [[STOREMERGE]], i16* [[S]], align 2
+; CHECK-NEXT:    store i16 [[STOREMERGE]], ptr [[S]], align 2
 ; CHECK-NEXT:    br label [[FOR_END12]]
 ; CHECK:       for.end12:
-; CHECK-NEXT:    [[CALL13:%.*]] = call i32 (i16*, ...) bitcast (i32 (...)* @foo to i32 (i16*, ...)*)(i16* nonnull [[S]])
-; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 2, i8* nonnull [[TMP1]])
-; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull [[TMP0]])
+; CHECK-NEXT:    [[CALL13:%.*]] = call i32 (ptr, ...) @foo(ptr nonnull [[S]])
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 2, ptr nonnull [[S]])
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 4, ptr nonnull [[I]])
 ; CHECK-NEXT:    ret i32 0
 ;
 entry:
   %i = alloca i32, align 4
   %s = alloca i16, align 2
-  %0 = bitcast i32* %i to i8*
-  call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %0) #3
-  store i32 0, i32* %i, align 4
-  %1 = bitcast i16* %s to i8*
-  call void @llvm.lifetime.start.p0i8(i64 2, i8* nonnull %1) #3
-  %call = call i32 (i32*, ...) bitcast (i32 (...)* @goo to i32 (i32*, ...)*)(i32* nonnull %i) #3
-  %2 = load i32, i32* %i, align 4
-  %storemerge6 = trunc i32 %2 to i16
-  store i16 %storemerge6, i16* %s, align 2
-  %conv17 = and i32 %2, 65472
+  call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %i) #3
+  store i32 0, ptr %i, align 4
+  call void @llvm.lifetime.start.p0(i64 2, ptr nonnull %s) #3
+  %call = call i32 (ptr, ...) @goo(ptr nonnull %i) #3
+  %0 = load i32, ptr %i, align 4
+  %storemerge6 = trunc i32 %0 to i16
+  store i16 %storemerge6, ptr %s, align 2
+  %conv17 = and i32 %0, 65472
   %cmp8 = icmp eq i32 %conv17, 0
   br i1 %cmp8, label %for.body.lr.ph, label %for.end12
 
@@ -144,14 +138,14 @@ for.body.lr.ph:                                   ; preds = %entry
   br label %for.body
 
 for.body:                                         ; preds = %for.body.lr.ph, %for.inc9
-  %storemerge.in9 = phi i32 [ %2, %for.body.lr.ph ], [ %add, %for.inc9 ]
+  %storemerge.in9 = phi i32 [ %0, %for.body.lr.ph ], [ %add, %for.inc9 ]
   %conv52 = and i32 %storemerge.in9, 255
-  %cmp63 = icmp ult i32 %2, %conv52
+  %cmp63 = icmp ult i32 %0, %conv52
   br i1 %cmp63, label %for.body8.lr.ph, label %for.inc9
 
 for.body8.lr.ph:                                  ; preds = %for.body
   %conv3 = trunc i32 %storemerge.in9 to i8
-  %.promoted = load i32, i32* getelementptr inbounds ([192 x [192 x i32]], [192 x [192 x i32]]* @a, i64 0, i64 0, i64 0), align 16
+  %.promoted = load i32, ptr @a, align 16
   br label %for.body8
 
 for.body8:                                        ; preds = %for.body8.lr.ph, %for.body8
@@ -159,15 +153,15 @@ for.body8:                                        ; preds = %for.body8.lr.ph, %f
   %c.04 = phi i8 [ %conv3, %for.body8.lr.ph ], [ %dec, %for.body8 ]
   %inc = add i32 %inc5, 1
   %dec = add i8 %c.04, -1
-  %gep = getelementptr inbounds i32, i32* %ptr, i32 %inc
-  store i32 0, i32* %gep
+  %gep = getelementptr inbounds i32, ptr %ptr, i32 %inc
+  store i32 0, ptr %gep
   %conv5 = zext i8 %dec to i32
-  %cmp6 = icmp ult i32 %2, %conv5
+  %cmp6 = icmp ult i32 %0, %conv5
   br i1 %cmp6, label %for.body8, label %for.cond4.for.inc9_crit_edge
 
 for.cond4.for.inc9_crit_edge:                     ; preds = %for.body8
   %inc.lcssa = phi i32 [ %inc, %for.body8 ]
-  store i32 %inc.lcssa, i32* getelementptr inbounds ([192 x [192 x i32]], [192 x [192 x i32]]* @a, i64 0, i64 0, i64 0), align 16
+  store i32 %inc.lcssa, ptr @a, align 16
   br label %for.inc9
 
 for.inc9:                                         ; preds = %for.cond4.for.inc9_crit_edge, %for.body
@@ -180,22 +174,22 @@ for.inc9:                                         ; preds = %for.cond4.for.inc9_
 for.cond.for.end12_crit_edge:                     ; preds = %for.inc9
   %add.lcssa = phi i32 [ %add, %for.inc9 ]
   %storemerge = trunc i32 %add.lcssa to i16
-  store i16 %storemerge, i16* %s, align 2
+  store i16 %storemerge, ptr %s, align 2
   br label %for.end12
 
 for.end12:                                        ; preds = %for.cond.for.end12_crit_edge, %entry
-  %call13 = call i32 (i16*, ...) bitcast (i32 (...)* @foo to i32 (i16*, ...)*)(i16* nonnull %s) #3
-  call void @llvm.lifetime.end.p0i8(i64 2, i8* nonnull %1) #3
-  call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %0) #3
+  %call13 = call i32 (ptr, ...) @foo(ptr nonnull %s) #3
+  call void @llvm.lifetime.end.p0(i64 2, ptr nonnull %s) #3
+  call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %i) #3
   ret i32 0
 }
 
 ; Function Attrs: argmemonly nounwind
-declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #1
+declare void @llvm.lifetime.start.p0(i64, ptr nocapture) #1
 
 declare i32 @goo(...) local_unnamed_addr #2
 
 declare i32 @foo(...) local_unnamed_addr #2
 
 ; Function Attrs: argmemonly nounwind
-declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #1
+declare void @llvm.lifetime.end.p0(i64, ptr nocapture) #1

diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr47437.ll b/llvm/test/Transforms/LoopVectorize/X86/pr47437.ll
index f133265cce49..7ace7fd2c099 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/pr47437.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/pr47437.ll
@@ -5,7 +5,7 @@
 ; RUN: opt < %s -S -passes=loop-vectorize -mtriple=x86_64-- -mattr=+avx2   | FileCheck %s --check-prefix=AVX2
 ; RUN: opt < %s -S -passes=loop-vectorize -mtriple=x86_64-- -mcpu=slm      | FileCheck %s --check-prefix=SSE2
 
-define void @test_muladd(i32* noalias nocapture %d1, i16* noalias nocapture readonly %s1, i16* noalias nocapture readonly %s2, i32 %n) {
+define void @test_muladd(ptr noalias nocapture %d1, ptr noalias nocapture readonly %s1, ptr noalias nocapture readonly %s2, i32 %n) {
 ; SSE2-LABEL: @test_muladd(
 ; SSE2-NEXT:  entry:
 ; SSE2-NEXT:    [[CMP30:%.*]] = icmp sgt i32 [[N:%.*]], 0
@@ -22,32 +22,29 @@ define void @test_muladd(i32* noalias nocapture %d1, i16* noalias nocapture read
 ; SSE2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; SSE2-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
 ; SSE2-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 1
-; SSE2-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i16, i16* [[S1:%.*]], i64 [[TMP1]]
-; SSE2-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i16, i16* [[TMP2]], i32 0
-; SSE2-NEXT:    [[TMP4:%.*]] = bitcast i16* [[TMP3]] to <8 x i16>*
-; SSE2-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i16>, <8 x i16>* [[TMP4]], align 2
+; SSE2-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i16, ptr [[S1:%.*]], i64 [[TMP1]]
+; SSE2-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i16, ptr [[TMP2]], i32 0
+; SSE2-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i16>, ptr [[TMP3]], align 2
 ; SSE2-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <8 x i16> [[WIDE_VEC]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
 ; SSE2-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <8 x i16> [[WIDE_VEC]], <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; SSE2-NEXT:    [[TMP5:%.*]] = sext <4 x i16> [[STRIDED_VEC]] to <4 x i32>
-; SSE2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i16, i16* [[S2:%.*]], i64 [[TMP1]]
-; SSE2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i16, i16* [[TMP6]], i32 0
-; SSE2-NEXT:    [[TMP8:%.*]] = bitcast i16* [[TMP7]] to <8 x i16>*
-; SSE2-NEXT:    [[WIDE_VEC2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP8]], align 2
+; SSE2-NEXT:    [[TMP4:%.*]] = sext <4 x i16> [[STRIDED_VEC]] to <4 x i32>
+; SSE2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i16, ptr [[S2:%.*]], i64 [[TMP1]]
+; SSE2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[TMP5]], i32 0
+; SSE2-NEXT:    [[WIDE_VEC2:%.*]] = load <8 x i16>, ptr [[TMP6]], align 2
 ; SSE2-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <8 x i16> [[WIDE_VEC2]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
 ; SSE2-NEXT:    [[STRIDED_VEC4:%.*]] = shufflevector <8 x i16> [[WIDE_VEC2]], <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; SSE2-NEXT:    [[TMP9:%.*]] = sext <4 x i16> [[STRIDED_VEC3]] to <4 x i32>
-; SSE2-NEXT:    [[TMP10:%.*]] = mul nsw <4 x i32> [[TMP9]], [[TMP5]]
-; SSE2-NEXT:    [[TMP13:%.*]] = sext <4 x i16> [[STRIDED_VEC1]] to <4 x i32>
-; SSE2-NEXT:    [[TMP15:%.*]] = sext <4 x i16> [[STRIDED_VEC4]] to <4 x i32>
-; SSE2-NEXT:    [[TMP16:%.*]] = mul nsw <4 x i32> [[TMP15]], [[TMP13]]
-; SSE2-NEXT:    [[TMP17:%.*]] = add nsw <4 x i32> [[TMP16]], [[TMP10]]
-; SSE2-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[D1:%.*]], i64 [[TMP0]]
-; SSE2-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i32, i32* [[TMP18]], i32 0
-; SSE2-NEXT:    [[TMP20:%.*]] = bitcast i32* [[TMP19]] to <4 x i32>*
-; SSE2-NEXT:    store <4 x i32> [[TMP17]], <4 x i32>* [[TMP20]], align 4
+; SSE2-NEXT:    [[TMP7:%.*]] = sext <4 x i16> [[STRIDED_VEC3]] to <4 x i32>
+; SSE2-NEXT:    [[TMP8:%.*]] = mul nsw <4 x i32> [[TMP7]], [[TMP4]]
+; SSE2-NEXT:    [[TMP9:%.*]] = sext <4 x i16> [[STRIDED_VEC1]] to <4 x i32>
+; SSE2-NEXT:    [[TMP10:%.*]] = sext <4 x i16> [[STRIDED_VEC4]] to <4 x i32>
+; SSE2-NEXT:    [[TMP11:%.*]] = mul nsw <4 x i32> [[TMP10]], [[TMP9]]
+; SSE2-NEXT:    [[TMP12:%.*]] = add nsw <4 x i32> [[TMP11]], [[TMP8]]
+; SSE2-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[D1:%.*]], i64 [[TMP0]]
+; SSE2-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0
+; SSE2-NEXT:    store <4 x i32> [[TMP12]], ptr [[TMP14]], align 4
 ; SSE2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; SSE2-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; SSE2-NEXT:    br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; SSE2-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; SSE2-NEXT:    br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; SSE2:       middle.block:
 ; SSE2-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
 ; SSE2-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
@@ -56,25 +53,25 @@ define void @test_muladd(i32* noalias nocapture %d1, i16* noalias nocapture read
 ; SSE2-NEXT:    br label [[FOR_BODY:%.*]]
 ; SSE2:       for.body:
 ; SSE2-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; SSE2-NEXT:    [[TMP22:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 1
-; SSE2-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[S1]], i64 [[TMP22]]
-; SSE2-NEXT:    [[TMP23:%.*]] = load i16, i16* [[ARRAYIDX]], align 2
-; SSE2-NEXT:    [[CONV:%.*]] = sext i16 [[TMP23]] to i32
-; SSE2-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i16, i16* [[S2]], i64 [[TMP22]]
-; SSE2-NEXT:    [[TMP24:%.*]] = load i16, i16* [[ARRAYIDX4]], align 2
-; SSE2-NEXT:    [[CONV5:%.*]] = sext i16 [[TMP24]] to i32
+; SSE2-NEXT:    [[TMP16:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 1
+; SSE2-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[S1]], i64 [[TMP16]]
+; SSE2-NEXT:    [[TMP17:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
+; SSE2-NEXT:    [[CONV:%.*]] = sext i16 [[TMP17]] to i32
+; SSE2-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i16, ptr [[S2]], i64 [[TMP16]]
+; SSE2-NEXT:    [[TMP18:%.*]] = load i16, ptr [[ARRAYIDX4]], align 2
+; SSE2-NEXT:    [[CONV5:%.*]] = sext i16 [[TMP18]] to i32
 ; SSE2-NEXT:    [[MUL6:%.*]] = mul nsw i32 [[CONV5]], [[CONV]]
-; SSE2-NEXT:    [[TMP25:%.*]] = or i64 [[TMP22]], 1
-; SSE2-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, i16* [[S1]], i64 [[TMP25]]
-; SSE2-NEXT:    [[TMP26:%.*]] = load i16, i16* [[ARRAYIDX10]], align 2
-; SSE2-NEXT:    [[CONV11:%.*]] = sext i16 [[TMP26]] to i32
-; SSE2-NEXT:    [[ARRAYIDX15:%.*]] = getelementptr inbounds i16, i16* [[S2]], i64 [[TMP25]]
-; SSE2-NEXT:    [[TMP27:%.*]] = load i16, i16* [[ARRAYIDX15]], align 2
-; SSE2-NEXT:    [[CONV16:%.*]] = sext i16 [[TMP27]] to i32
+; SSE2-NEXT:    [[TMP19:%.*]] = or i64 [[TMP16]], 1
+; SSE2-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, ptr [[S1]], i64 [[TMP19]]
+; SSE2-NEXT:    [[TMP20:%.*]] = load i16, ptr [[ARRAYIDX10]], align 2
+; SSE2-NEXT:    [[CONV11:%.*]] = sext i16 [[TMP20]] to i32
+; SSE2-NEXT:    [[ARRAYIDX15:%.*]] = getelementptr inbounds i16, ptr [[S2]], i64 [[TMP19]]
+; SSE2-NEXT:    [[TMP21:%.*]] = load i16, ptr [[ARRAYIDX15]], align 2
+; SSE2-NEXT:    [[CONV16:%.*]] = sext i16 [[TMP21]] to i32
 ; SSE2-NEXT:    [[MUL17:%.*]] = mul nsw i32 [[CONV16]], [[CONV11]]
 ; SSE2-NEXT:    [[ADD18:%.*]] = add nsw i32 [[MUL17]], [[MUL6]]
-; SSE2-NEXT:    [[ARRAYIDX20:%.*]] = getelementptr inbounds i32, i32* [[D1]], i64 [[INDVARS_IV]]
-; SSE2-NEXT:    store i32 [[ADD18]], i32* [[ARRAYIDX20]], align 4
+; SSE2-NEXT:    [[ARRAYIDX20:%.*]] = getelementptr inbounds i32, ptr [[D1]], i64 [[INDVARS_IV]]
+; SSE2-NEXT:    store i32 [[ADD18]], ptr [[ARRAYIDX20]], align 4
 ; SSE2-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; SSE2-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
 ; SSE2-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
@@ -101,55 +98,49 @@ define void @test_muladd(i32* noalias nocapture %d1, i16* noalias nocapture read
 ; SSE41-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 4
 ; SSE41-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP0]], 1
 ; SSE41-NEXT:    [[TMP3:%.*]] = shl nuw nsw i64 [[TMP1]], 1
-; SSE41-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i16, i16* [[S1:%.*]], i64 [[TMP2]]
-; SSE41-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i16, i16* [[S1]], i64 [[TMP3]]
-; SSE41-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i16, i16* [[TMP4]], i32 0
-; SSE41-NEXT:    [[TMP7:%.*]] = bitcast i16* [[TMP6]] to <8 x i16>*
-; SSE41-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i16, i16* [[TMP5]], i32 0
-; SSE41-NEXT:    [[TMP9:%.*]] = bitcast i16* [[TMP8]] to <8 x i16>*
-; SSE41-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i16>, <8 x i16>* [[TMP7]], align 2
-; SSE41-NEXT:    [[WIDE_VEC1:%.*]] = load <8 x i16>, <8 x i16>* [[TMP9]], align 2
+; SSE41-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i16, ptr [[S1:%.*]], i64 [[TMP2]]
+; SSE41-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i16, ptr [[S1]], i64 [[TMP3]]
+; SSE41-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[TMP4]], i32 0
+; SSE41-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i16, ptr [[TMP5]], i32 0
+; SSE41-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i16>, ptr [[TMP6]], align 2
+; SSE41-NEXT:    [[WIDE_VEC1:%.*]] = load <8 x i16>, ptr [[TMP7]], align 2
 ; SSE41-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <8 x i16> [[WIDE_VEC]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
 ; SSE41-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <8 x i16> [[WIDE_VEC1]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
 ; SSE41-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <8 x i16> [[WIDE_VEC]], <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
 ; SSE41-NEXT:    [[STRIDED_VEC4:%.*]] = shufflevector <8 x i16> [[WIDE_VEC1]], <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; SSE41-NEXT:    [[TMP10:%.*]] = sext <4 x i16> [[STRIDED_VEC]] to <4 x i32>
-; SSE41-NEXT:    [[TMP11:%.*]] = sext <4 x i16> [[STRIDED_VEC2]] to <4 x i32>
-; SSE41-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i16, i16* [[S2:%.*]], i64 [[TMP2]]
-; SSE41-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i16, i16* [[S2]], i64 [[TMP3]]
-; SSE41-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i16, i16* [[TMP12]], i32 0
-; SSE41-NEXT:    [[TMP15:%.*]] = bitcast i16* [[TMP14]] to <8 x i16>*
-; SSE41-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i16, i16* [[TMP13]], i32 0
-; SSE41-NEXT:    [[TMP17:%.*]] = bitcast i16* [[TMP16]] to <8 x i16>*
-; SSE41-NEXT:    [[WIDE_VEC5:%.*]] = load <8 x i16>, <8 x i16>* [[TMP15]], align 2
-; SSE41-NEXT:    [[WIDE_VEC6:%.*]] = load <8 x i16>, <8 x i16>* [[TMP17]], align 2
+; SSE41-NEXT:    [[TMP8:%.*]] = sext <4 x i16> [[STRIDED_VEC]] to <4 x i32>
+; SSE41-NEXT:    [[TMP9:%.*]] = sext <4 x i16> [[STRIDED_VEC2]] to <4 x i32>
+; SSE41-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i16, ptr [[S2:%.*]], i64 [[TMP2]]
+; SSE41-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i16, ptr [[S2]], i64 [[TMP3]]
+; SSE41-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i16, ptr [[TMP10]], i32 0
+; SSE41-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i16, ptr [[TMP11]], i32 0
+; SSE41-NEXT:    [[WIDE_VEC5:%.*]] = load <8 x i16>, ptr [[TMP12]], align 2
+; SSE41-NEXT:    [[WIDE_VEC6:%.*]] = load <8 x i16>, ptr [[TMP13]], align 2
 ; SSE41-NEXT:    [[STRIDED_VEC7:%.*]] = shufflevector <8 x i16> [[WIDE_VEC5]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
 ; SSE41-NEXT:    [[STRIDED_VEC8:%.*]] = shufflevector <8 x i16> [[WIDE_VEC6]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
 ; SSE41-NEXT:    [[STRIDED_VEC9:%.*]] = shufflevector <8 x i16> [[WIDE_VEC5]], <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
 ; SSE41-NEXT:    [[STRIDED_VEC10:%.*]] = shufflevector <8 x i16> [[WIDE_VEC6]], <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; SSE41-NEXT:    [[TMP18:%.*]] = sext <4 x i16> [[STRIDED_VEC7]] to <4 x i32>
-; SSE41-NEXT:    [[TMP19:%.*]] = sext <4 x i16> [[STRIDED_VEC8]] to <4 x i32>
-; SSE41-NEXT:    [[TMP20:%.*]] = mul nsw <4 x i32> [[TMP18]], [[TMP10]]
-; SSE41-NEXT:    [[TMP21:%.*]] = mul nsw <4 x i32> [[TMP19]], [[TMP11]]
-; SSE41-NEXT:    [[TMP26:%.*]] = sext <4 x i16> [[STRIDED_VEC3]] to <4 x i32>
-; SSE41-NEXT:    [[TMP27:%.*]] = sext <4 x i16> [[STRIDED_VEC4]] to <4 x i32>
-; SSE41-NEXT:    [[TMP30:%.*]] = sext <4 x i16> [[STRIDED_VEC9]] to <4 x i32>
-; SSE41-NEXT:    [[TMP31:%.*]] = sext <4 x i16> [[STRIDED_VEC10]] to <4 x i32>
-; SSE41-NEXT:    [[TMP32:%.*]] = mul nsw <4 x i32> [[TMP30]], [[TMP26]]
-; SSE41-NEXT:    [[TMP33:%.*]] = mul nsw <4 x i32> [[TMP31]], [[TMP27]]
-; SSE41-NEXT:    [[TMP34:%.*]] = add nsw <4 x i32> [[TMP32]], [[TMP20]]
-; SSE41-NEXT:    [[TMP35:%.*]] = add nsw <4 x i32> [[TMP33]], [[TMP21]]
-; SSE41-NEXT:    [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[D1:%.*]], i64 [[TMP0]]
-; SSE41-NEXT:    [[TMP37:%.*]] = getelementptr inbounds i32, i32* [[D1]], i64 [[TMP1]]
-; SSE41-NEXT:    [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[TMP36]], i32 0
-; SSE41-NEXT:    [[TMP39:%.*]] = bitcast i32* [[TMP38]] to <4 x i32>*
-; SSE41-NEXT:    store <4 x i32> [[TMP34]], <4 x i32>* [[TMP39]], align 4
-; SSE41-NEXT:    [[TMP40:%.*]] = getelementptr inbounds i32, i32* [[TMP36]], i32 4
-; SSE41-NEXT:    [[TMP41:%.*]] = bitcast i32* [[TMP40]] to <4 x i32>*
-; SSE41-NEXT:    store <4 x i32> [[TMP35]], <4 x i32>* [[TMP41]], align 4
+; SSE41-NEXT:    [[TMP14:%.*]] = sext <4 x i16> [[STRIDED_VEC7]] to <4 x i32>
+; SSE41-NEXT:    [[TMP15:%.*]] = sext <4 x i16> [[STRIDED_VEC8]] to <4 x i32>
+; SSE41-NEXT:    [[TMP16:%.*]] = mul nsw <4 x i32> [[TMP14]], [[TMP8]]
+; SSE41-NEXT:    [[TMP17:%.*]] = mul nsw <4 x i32> [[TMP15]], [[TMP9]]
+; SSE41-NEXT:    [[TMP18:%.*]] = sext <4 x i16> [[STRIDED_VEC3]] to <4 x i32>
+; SSE41-NEXT:    [[TMP19:%.*]] = sext <4 x i16> [[STRIDED_VEC4]] to <4 x i32>
+; SSE41-NEXT:    [[TMP20:%.*]] = sext <4 x i16> [[STRIDED_VEC9]] to <4 x i32>
+; SSE41-NEXT:    [[TMP21:%.*]] = sext <4 x i16> [[STRIDED_VEC10]] to <4 x i32>
+; SSE41-NEXT:    [[TMP22:%.*]] = mul nsw <4 x i32> [[TMP20]], [[TMP18]]
+; SSE41-NEXT:    [[TMP23:%.*]] = mul nsw <4 x i32> [[TMP21]], [[TMP19]]
+; SSE41-NEXT:    [[TMP24:%.*]] = add nsw <4 x i32> [[TMP22]], [[TMP16]]
+; SSE41-NEXT:    [[TMP25:%.*]] = add nsw <4 x i32> [[TMP23]], [[TMP17]]
+; SSE41-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[D1:%.*]], i64 [[TMP0]]
+; SSE41-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[D1]], i64 [[TMP1]]
+; SSE41-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[TMP26]], i32 0
+; SSE41-NEXT:    store <4 x i32> [[TMP24]], ptr [[TMP28]], align 4
+; SSE41-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i32, ptr [[TMP26]], i32 4
+; SSE41-NEXT:    store <4 x i32> [[TMP25]], ptr [[TMP29]], align 4
 ; SSE41-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; SSE41-NEXT:    [[TMP42:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; SSE41-NEXT:    br i1 [[TMP42]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; SSE41-NEXT:    [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; SSE41-NEXT:    br i1 [[TMP30]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; SSE41:       middle.block:
 ; SSE41-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
 ; SSE41-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
@@ -158,25 +149,25 @@ define void @test_muladd(i32* noalias nocapture %d1, i16* noalias nocapture read
 ; SSE41-NEXT:    br label [[FOR_BODY:%.*]]
 ; SSE41:       for.body:
 ; SSE41-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; SSE41-NEXT:    [[TMP43:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 1
-; SSE41-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[S1]], i64 [[TMP43]]
-; SSE41-NEXT:    [[TMP44:%.*]] = load i16, i16* [[ARRAYIDX]], align 2
-; SSE41-NEXT:    [[CONV:%.*]] = sext i16 [[TMP44]] to i32
-; SSE41-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i16, i16* [[S2]], i64 [[TMP43]]
-; SSE41-NEXT:    [[TMP45:%.*]] = load i16, i16* [[ARRAYIDX4]], align 2
-; SSE41-NEXT:    [[CONV5:%.*]] = sext i16 [[TMP45]] to i32
+; SSE41-NEXT:    [[TMP31:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 1
+; SSE41-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[S1]], i64 [[TMP31]]
+; SSE41-NEXT:    [[TMP32:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
+; SSE41-NEXT:    [[CONV:%.*]] = sext i16 [[TMP32]] to i32
+; SSE41-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i16, ptr [[S2]], i64 [[TMP31]]
+; SSE41-NEXT:    [[TMP33:%.*]] = load i16, ptr [[ARRAYIDX4]], align 2
+; SSE41-NEXT:    [[CONV5:%.*]] = sext i16 [[TMP33]] to i32
 ; SSE41-NEXT:    [[MUL6:%.*]] = mul nsw i32 [[CONV5]], [[CONV]]
-; SSE41-NEXT:    [[TMP46:%.*]] = or i64 [[TMP43]], 1
-; SSE41-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, i16* [[S1]], i64 [[TMP46]]
-; SSE41-NEXT:    [[TMP47:%.*]] = load i16, i16* [[ARRAYIDX10]], align 2
-; SSE41-NEXT:    [[CONV11:%.*]] = sext i16 [[TMP47]] to i32
-; SSE41-NEXT:    [[ARRAYIDX15:%.*]] = getelementptr inbounds i16, i16* [[S2]], i64 [[TMP46]]
-; SSE41-NEXT:    [[TMP48:%.*]] = load i16, i16* [[ARRAYIDX15]], align 2
-; SSE41-NEXT:    [[CONV16:%.*]] = sext i16 [[TMP48]] to i32
+; SSE41-NEXT:    [[TMP34:%.*]] = or i64 [[TMP31]], 1
+; SSE41-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, ptr [[S1]], i64 [[TMP34]]
+; SSE41-NEXT:    [[TMP35:%.*]] = load i16, ptr [[ARRAYIDX10]], align 2
+; SSE41-NEXT:    [[CONV11:%.*]] = sext i16 [[TMP35]] to i32
+; SSE41-NEXT:    [[ARRAYIDX15:%.*]] = getelementptr inbounds i16, ptr [[S2]], i64 [[TMP34]]
+; SSE41-NEXT:    [[TMP36:%.*]] = load i16, ptr [[ARRAYIDX15]], align 2
+; SSE41-NEXT:    [[CONV16:%.*]] = sext i16 [[TMP36]] to i32
 ; SSE41-NEXT:    [[MUL17:%.*]] = mul nsw i32 [[CONV16]], [[CONV11]]
 ; SSE41-NEXT:    [[ADD18:%.*]] = add nsw i32 [[MUL17]], [[MUL6]]
-; SSE41-NEXT:    [[ARRAYIDX20:%.*]] = getelementptr inbounds i32, i32* [[D1]], i64 [[INDVARS_IV]]
-; SSE41-NEXT:    store i32 [[ADD18]], i32* [[ARRAYIDX20]], align 4
+; SSE41-NEXT:    [[ARRAYIDX20:%.*]] = getelementptr inbounds i32, ptr [[D1]], i64 [[INDVARS_IV]]
+; SSE41-NEXT:    store i32 [[ADD18]], ptr [[ARRAYIDX20]], align 4
 ; SSE41-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; SSE41-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
 ; SSE41-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
@@ -207,22 +198,18 @@ define void @test_muladd(i32* noalias nocapture %d1, i16* noalias nocapture read
 ; AVX1-NEXT:    [[TMP5:%.*]] = shl nuw nsw i64 [[TMP1]], 1
 ; AVX1-NEXT:    [[TMP6:%.*]] = shl nuw nsw i64 [[TMP2]], 1
 ; AVX1-NEXT:    [[TMP7:%.*]] = shl nuw nsw i64 [[TMP3]], 1
-; AVX1-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i16, i16* [[S1:%.*]], i64 [[TMP4]]
-; AVX1-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i16, i16* [[S1]], i64 [[TMP5]]
-; AVX1-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i16, i16* [[S1]], i64 [[TMP6]]
-; AVX1-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i16, i16* [[S1]], i64 [[TMP7]]
-; AVX1-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i16, i16* [[TMP8]], i32 0
-; AVX1-NEXT:    [[TMP13:%.*]] = bitcast i16* [[TMP12]] to <8 x i16>*
-; AVX1-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i16, i16* [[TMP9]], i32 0
-; AVX1-NEXT:    [[TMP15:%.*]] = bitcast i16* [[TMP14]] to <8 x i16>*
-; AVX1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i16, i16* [[TMP10]], i32 0
-; AVX1-NEXT:    [[TMP17:%.*]] = bitcast i16* [[TMP16]] to <8 x i16>*
-; AVX1-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i16, i16* [[TMP11]], i32 0
-; AVX1-NEXT:    [[TMP19:%.*]] = bitcast i16* [[TMP18]] to <8 x i16>*
-; AVX1-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i16>, <8 x i16>* [[TMP13]], align 2
-; AVX1-NEXT:    [[WIDE_VEC1:%.*]] = load <8 x i16>, <8 x i16>* [[TMP15]], align 2
-; AVX1-NEXT:    [[WIDE_VEC2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP17]], align 2
-; AVX1-NEXT:    [[WIDE_VEC3:%.*]] = load <8 x i16>, <8 x i16>* [[TMP19]], align 2
+; AVX1-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i16, ptr [[S1:%.*]], i64 [[TMP4]]
+; AVX1-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i16, ptr [[S1]], i64 [[TMP5]]
+; AVX1-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i16, ptr [[S1]], i64 [[TMP6]]
+; AVX1-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i16, ptr [[S1]], i64 [[TMP7]]
+; AVX1-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i16, ptr [[TMP8]], i32 0
+; AVX1-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i16, ptr [[TMP9]], i32 0
+; AVX1-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i16, ptr [[TMP10]], i32 0
+; AVX1-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i16, ptr [[TMP11]], i32 0
+; AVX1-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i16>, ptr [[TMP12]], align 2
+; AVX1-NEXT:    [[WIDE_VEC1:%.*]] = load <8 x i16>, ptr [[TMP13]], align 2
+; AVX1-NEXT:    [[WIDE_VEC2:%.*]] = load <8 x i16>, ptr [[TMP14]], align 2
+; AVX1-NEXT:    [[WIDE_VEC3:%.*]] = load <8 x i16>, ptr [[TMP15]], align 2
 ; AVX1-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <8 x i16> [[WIDE_VEC]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
 ; AVX1-NEXT:    [[STRIDED_VEC4:%.*]] = shufflevector <8 x i16> [[WIDE_VEC1]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
 ; AVX1-NEXT:    [[STRIDED_VEC5:%.*]] = shufflevector <8 x i16> [[WIDE_VEC2]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
@@ -231,26 +218,22 @@ define void @test_muladd(i32* noalias nocapture %d1, i16* noalias nocapture read
 ; AVX1-NEXT:    [[STRIDED_VEC8:%.*]] = shufflevector <8 x i16> [[WIDE_VEC1]], <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
 ; AVX1-NEXT:    [[STRIDED_VEC9:%.*]] = shufflevector <8 x i16> [[WIDE_VEC2]], <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
 ; AVX1-NEXT:    [[STRIDED_VEC10:%.*]] = shufflevector <8 x i16> [[WIDE_VEC3]], <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; AVX1-NEXT:    [[TMP20:%.*]] = sext <4 x i16> [[STRIDED_VEC]] to <4 x i32>
-; AVX1-NEXT:    [[TMP21:%.*]] = sext <4 x i16> [[STRIDED_VEC4]] to <4 x i32>
-; AVX1-NEXT:    [[TMP22:%.*]] = sext <4 x i16> [[STRIDED_VEC5]] to <4 x i32>
-; AVX1-NEXT:    [[TMP23:%.*]] = sext <4 x i16> [[STRIDED_VEC6]] to <4 x i32>
-; AVX1-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i16, i16* [[S2:%.*]], i64 [[TMP4]]
-; AVX1-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i16, i16* [[S2]], i64 [[TMP5]]
-; AVX1-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i16, i16* [[S2]], i64 [[TMP6]]
-; AVX1-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i16, i16* [[S2]], i64 [[TMP7]]
-; AVX1-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i16, i16* [[TMP24]], i32 0
-; AVX1-NEXT:    [[TMP29:%.*]] = bitcast i16* [[TMP28]] to <8 x i16>*
-; AVX1-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i16, i16* [[TMP25]], i32 0
-; AVX1-NEXT:    [[TMP31:%.*]] = bitcast i16* [[TMP30]] to <8 x i16>*
-; AVX1-NEXT:    [[TMP32:%.*]] = getelementptr inbounds i16, i16* [[TMP26]], i32 0
-; AVX1-NEXT:    [[TMP33:%.*]] = bitcast i16* [[TMP32]] to <8 x i16>*
-; AVX1-NEXT:    [[TMP34:%.*]] = getelementptr inbounds i16, i16* [[TMP27]], i32 0
-; AVX1-NEXT:    [[TMP35:%.*]] = bitcast i16* [[TMP34]] to <8 x i16>*
-; AVX1-NEXT:    [[WIDE_VEC11:%.*]] = load <8 x i16>, <8 x i16>* [[TMP29]], align 2
-; AVX1-NEXT:    [[WIDE_VEC12:%.*]] = load <8 x i16>, <8 x i16>* [[TMP31]], align 2
-; AVX1-NEXT:    [[WIDE_VEC13:%.*]] = load <8 x i16>, <8 x i16>* [[TMP33]], align 2
-; AVX1-NEXT:    [[WIDE_VEC14:%.*]] = load <8 x i16>, <8 x i16>* [[TMP35]], align 2
+; AVX1-NEXT:    [[TMP16:%.*]] = sext <4 x i16> [[STRIDED_VEC]] to <4 x i32>
+; AVX1-NEXT:    [[TMP17:%.*]] = sext <4 x i16> [[STRIDED_VEC4]] to <4 x i32>
+; AVX1-NEXT:    [[TMP18:%.*]] = sext <4 x i16> [[STRIDED_VEC5]] to <4 x i32>
+; AVX1-NEXT:    [[TMP19:%.*]] = sext <4 x i16> [[STRIDED_VEC6]] to <4 x i32>
+; AVX1-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i16, ptr [[S2:%.*]], i64 [[TMP4]]
+; AVX1-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i16, ptr [[S2]], i64 [[TMP5]]
+; AVX1-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i16, ptr [[S2]], i64 [[TMP6]]
+; AVX1-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i16, ptr [[S2]], i64 [[TMP7]]
+; AVX1-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i16, ptr [[TMP20]], i32 0
+; AVX1-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i16, ptr [[TMP21]], i32 0
+; AVX1-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i16, ptr [[TMP22]], i32 0
+; AVX1-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i16, ptr [[TMP23]], i32 0
+; AVX1-NEXT:    [[WIDE_VEC11:%.*]] = load <8 x i16>, ptr [[TMP24]], align 2
+; AVX1-NEXT:    [[WIDE_VEC12:%.*]] = load <8 x i16>, ptr [[TMP25]], align 2
+; AVX1-NEXT:    [[WIDE_VEC13:%.*]] = load <8 x i16>, ptr [[TMP26]], align 2
+; AVX1-NEXT:    [[WIDE_VEC14:%.*]] = load <8 x i16>, ptr [[TMP27]], align 2
 ; AVX1-NEXT:    [[STRIDED_VEC15:%.*]] = shufflevector <8 x i16> [[WIDE_VEC11]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
 ; AVX1-NEXT:    [[STRIDED_VEC16:%.*]] = shufflevector <8 x i16> [[WIDE_VEC12]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
 ; AVX1-NEXT:    [[STRIDED_VEC17:%.*]] = shufflevector <8 x i16> [[WIDE_VEC13]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
@@ -259,49 +242,45 @@ define void @test_muladd(i32* noalias nocapture %d1, i16* noalias nocapture read
 ; AVX1-NEXT:    [[STRIDED_VEC20:%.*]] = shufflevector <8 x i16> [[WIDE_VEC12]], <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
 ; AVX1-NEXT:    [[STRIDED_VEC21:%.*]] = shufflevector <8 x i16> [[WIDE_VEC13]], <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
 ; AVX1-NEXT:    [[STRIDED_VEC22:%.*]] = shufflevector <8 x i16> [[WIDE_VEC14]], <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; AVX1-NEXT:    [[TMP36:%.*]] = sext <4 x i16> [[STRIDED_VEC15]] to <4 x i32>
-; AVX1-NEXT:    [[TMP37:%.*]] = sext <4 x i16> [[STRIDED_VEC16]] to <4 x i32>
-; AVX1-NEXT:    [[TMP38:%.*]] = sext <4 x i16> [[STRIDED_VEC17]] to <4 x i32>
-; AVX1-NEXT:    [[TMP39:%.*]] = sext <4 x i16> [[STRIDED_VEC18]] to <4 x i32>
-; AVX1-NEXT:    [[TMP40:%.*]] = mul nsw <4 x i32> [[TMP36]], [[TMP20]]
-; AVX1-NEXT:    [[TMP41:%.*]] = mul nsw <4 x i32> [[TMP37]], [[TMP21]]
-; AVX1-NEXT:    [[TMP42:%.*]] = mul nsw <4 x i32> [[TMP38]], [[TMP22]]
-; AVX1-NEXT:    [[TMP43:%.*]] = mul nsw <4 x i32> [[TMP39]], [[TMP23]]
-; AVX1-NEXT:    [[TMP52:%.*]] = sext <4 x i16> [[STRIDED_VEC7]] to <4 x i32>
-; AVX1-NEXT:    [[TMP53:%.*]] = sext <4 x i16> [[STRIDED_VEC8]] to <4 x i32>
-; AVX1-NEXT:    [[TMP54:%.*]] = sext <4 x i16> [[STRIDED_VEC9]] to <4 x i32>
-; AVX1-NEXT:    [[TMP55:%.*]] = sext <4 x i16> [[STRIDED_VEC10]] to <4 x i32>
-; AVX1-NEXT:    [[TMP60:%.*]] = sext <4 x i16> [[STRIDED_VEC19]] to <4 x i32>
-; AVX1-NEXT:    [[TMP61:%.*]] = sext <4 x i16> [[STRIDED_VEC20]] to <4 x i32>
-; AVX1-NEXT:    [[TMP62:%.*]] = sext <4 x i16> [[STRIDED_VEC21]] to <4 x i32>
-; AVX1-NEXT:    [[TMP63:%.*]] = sext <4 x i16> [[STRIDED_VEC22]] to <4 x i32>
-; AVX1-NEXT:    [[TMP64:%.*]] = mul nsw <4 x i32> [[TMP60]], [[TMP52]]
-; AVX1-NEXT:    [[TMP65:%.*]] = mul nsw <4 x i32> [[TMP61]], [[TMP53]]
-; AVX1-NEXT:    [[TMP66:%.*]] = mul nsw <4 x i32> [[TMP62]], [[TMP54]]
-; AVX1-NEXT:    [[TMP67:%.*]] = mul nsw <4 x i32> [[TMP63]], [[TMP55]]
-; AVX1-NEXT:    [[TMP68:%.*]] = add nsw <4 x i32> [[TMP64]], [[TMP40]]
-; AVX1-NEXT:    [[TMP69:%.*]] = add nsw <4 x i32> [[TMP65]], [[TMP41]]
-; AVX1-NEXT:    [[TMP70:%.*]] = add nsw <4 x i32> [[TMP66]], [[TMP42]]
-; AVX1-NEXT:    [[TMP71:%.*]] = add nsw <4 x i32> [[TMP67]], [[TMP43]]
-; AVX1-NEXT:    [[TMP72:%.*]] = getelementptr inbounds i32, i32* [[D1:%.*]], i64 [[TMP0]]
-; AVX1-NEXT:    [[TMP73:%.*]] = getelementptr inbounds i32, i32* [[D1]], i64 [[TMP1]]
-; AVX1-NEXT:    [[TMP74:%.*]] = getelementptr inbounds i32, i32* [[D1]], i64 [[TMP2]]
-; AVX1-NEXT:    [[TMP75:%.*]] = getelementptr inbounds i32, i32* [[D1]], i64 [[TMP3]]
-; AVX1-NEXT:    [[TMP76:%.*]] = getelementptr inbounds i32, i32* [[TMP72]], i32 0
-; AVX1-NEXT:    [[TMP77:%.*]] = bitcast i32* [[TMP76]] to <4 x i32>*
-; AVX1-NEXT:    store <4 x i32> [[TMP68]], <4 x i32>* [[TMP77]], align 4
-; AVX1-NEXT:    [[TMP78:%.*]] = getelementptr inbounds i32, i32* [[TMP72]], i32 4
-; AVX1-NEXT:    [[TMP79:%.*]] = bitcast i32* [[TMP78]] to <4 x i32>*
-; AVX1-NEXT:    store <4 x i32> [[TMP69]], <4 x i32>* [[TMP79]], align 4
-; AVX1-NEXT:    [[TMP80:%.*]] = getelementptr inbounds i32, i32* [[TMP72]], i32 8
-; AVX1-NEXT:    [[TMP81:%.*]] = bitcast i32* [[TMP80]] to <4 x i32>*
-; AVX1-NEXT:    store <4 x i32> [[TMP70]], <4 x i32>* [[TMP81]], align 4
-; AVX1-NEXT:    [[TMP82:%.*]] = getelementptr inbounds i32, i32* [[TMP72]], i32 12
-; AVX1-NEXT:    [[TMP83:%.*]] = bitcast i32* [[TMP82]] to <4 x i32>*
-; AVX1-NEXT:    store <4 x i32> [[TMP71]], <4 x i32>* [[TMP83]], align 4
+; AVX1-NEXT:    [[TMP28:%.*]] = sext <4 x i16> [[STRIDED_VEC15]] to <4 x i32>
+; AVX1-NEXT:    [[TMP29:%.*]] = sext <4 x i16> [[STRIDED_VEC16]] to <4 x i32>
+; AVX1-NEXT:    [[TMP30:%.*]] = sext <4 x i16> [[STRIDED_VEC17]] to <4 x i32>
+; AVX1-NEXT:    [[TMP31:%.*]] = sext <4 x i16> [[STRIDED_VEC18]] to <4 x i32>
+; AVX1-NEXT:    [[TMP32:%.*]] = mul nsw <4 x i32> [[TMP28]], [[TMP16]]
+; AVX1-NEXT:    [[TMP33:%.*]] = mul nsw <4 x i32> [[TMP29]], [[TMP17]]
+; AVX1-NEXT:    [[TMP34:%.*]] = mul nsw <4 x i32> [[TMP30]], [[TMP18]]
+; AVX1-NEXT:    [[TMP35:%.*]] = mul nsw <4 x i32> [[TMP31]], [[TMP19]]
+; AVX1-NEXT:    [[TMP36:%.*]] = sext <4 x i16> [[STRIDED_VEC7]] to <4 x i32>
+; AVX1-NEXT:    [[TMP37:%.*]] = sext <4 x i16> [[STRIDED_VEC8]] to <4 x i32>
+; AVX1-NEXT:    [[TMP38:%.*]] = sext <4 x i16> [[STRIDED_VEC9]] to <4 x i32>
+; AVX1-NEXT:    [[TMP39:%.*]] = sext <4 x i16> [[STRIDED_VEC10]] to <4 x i32>
+; AVX1-NEXT:    [[TMP40:%.*]] = sext <4 x i16> [[STRIDED_VEC19]] to <4 x i32>
+; AVX1-NEXT:    [[TMP41:%.*]] = sext <4 x i16> [[STRIDED_VEC20]] to <4 x i32>
+; AVX1-NEXT:    [[TMP42:%.*]] = sext <4 x i16> [[STRIDED_VEC21]] to <4 x i32>
+; AVX1-NEXT:    [[TMP43:%.*]] = sext <4 x i16> [[STRIDED_VEC22]] to <4 x i32>
+; AVX1-NEXT:    [[TMP44:%.*]] = mul nsw <4 x i32> [[TMP40]], [[TMP36]]
+; AVX1-NEXT:    [[TMP45:%.*]] = mul nsw <4 x i32> [[TMP41]], [[TMP37]]
+; AVX1-NEXT:    [[TMP46:%.*]] = mul nsw <4 x i32> [[TMP42]], [[TMP38]]
+; AVX1-NEXT:    [[TMP47:%.*]] = mul nsw <4 x i32> [[TMP43]], [[TMP39]]
+; AVX1-NEXT:    [[TMP48:%.*]] = add nsw <4 x i32> [[TMP44]], [[TMP32]]
+; AVX1-NEXT:    [[TMP49:%.*]] = add nsw <4 x i32> [[TMP45]], [[TMP33]]
+; AVX1-NEXT:    [[TMP50:%.*]] = add nsw <4 x i32> [[TMP46]], [[TMP34]]
+; AVX1-NEXT:    [[TMP51:%.*]] = add nsw <4 x i32> [[TMP47]], [[TMP35]]
+; AVX1-NEXT:    [[TMP52:%.*]] = getelementptr inbounds i32, ptr [[D1:%.*]], i64 [[TMP0]]
+; AVX1-NEXT:    [[TMP53:%.*]] = getelementptr inbounds i32, ptr [[D1]], i64 [[TMP1]]
+; AVX1-NEXT:    [[TMP54:%.*]] = getelementptr inbounds i32, ptr [[D1]], i64 [[TMP2]]
+; AVX1-NEXT:    [[TMP55:%.*]] = getelementptr inbounds i32, ptr [[D1]], i64 [[TMP3]]
+; AVX1-NEXT:    [[TMP56:%.*]] = getelementptr inbounds i32, ptr [[TMP52]], i32 0
+; AVX1-NEXT:    store <4 x i32> [[TMP48]], ptr [[TMP56]], align 4
+; AVX1-NEXT:    [[TMP57:%.*]] = getelementptr inbounds i32, ptr [[TMP52]], i32 4
+; AVX1-NEXT:    store <4 x i32> [[TMP49]], ptr [[TMP57]], align 4
+; AVX1-NEXT:    [[TMP58:%.*]] = getelementptr inbounds i32, ptr [[TMP52]], i32 8
+; AVX1-NEXT:    store <4 x i32> [[TMP50]], ptr [[TMP58]], align 4
+; AVX1-NEXT:    [[TMP59:%.*]] = getelementptr inbounds i32, ptr [[TMP52]], i32 12
+; AVX1-NEXT:    store <4 x i32> [[TMP51]], ptr [[TMP59]], align 4
 ; AVX1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; AVX1-NEXT:    [[TMP84:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; AVX1-NEXT:    br i1 [[TMP84]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; AVX1-NEXT:    [[TMP60:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; AVX1-NEXT:    br i1 [[TMP60]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; AVX1:       middle.block:
 ; AVX1-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
 ; AVX1-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
@@ -310,25 +289,25 @@ define void @test_muladd(i32* noalias nocapture %d1, i16* noalias nocapture read
 ; AVX1-NEXT:    br label [[FOR_BODY:%.*]]
 ; AVX1:       for.body:
 ; AVX1-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; AVX1-NEXT:    [[TMP85:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 1
-; AVX1-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[S1]], i64 [[TMP85]]
-; AVX1-NEXT:    [[TMP86:%.*]] = load i16, i16* [[ARRAYIDX]], align 2
-; AVX1-NEXT:    [[CONV:%.*]] = sext i16 [[TMP86]] to i32
-; AVX1-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i16, i16* [[S2]], i64 [[TMP85]]
-; AVX1-NEXT:    [[TMP87:%.*]] = load i16, i16* [[ARRAYIDX4]], align 2
-; AVX1-NEXT:    [[CONV5:%.*]] = sext i16 [[TMP87]] to i32
+; AVX1-NEXT:    [[TMP61:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 1
+; AVX1-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[S1]], i64 [[TMP61]]
+; AVX1-NEXT:    [[TMP62:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
+; AVX1-NEXT:    [[CONV:%.*]] = sext i16 [[TMP62]] to i32
+; AVX1-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i16, ptr [[S2]], i64 [[TMP61]]
+; AVX1-NEXT:    [[TMP63:%.*]] = load i16, ptr [[ARRAYIDX4]], align 2
+; AVX1-NEXT:    [[CONV5:%.*]] = sext i16 [[TMP63]] to i32
 ; AVX1-NEXT:    [[MUL6:%.*]] = mul nsw i32 [[CONV5]], [[CONV]]
-; AVX1-NEXT:    [[TMP88:%.*]] = or i64 [[TMP85]], 1
-; AVX1-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, i16* [[S1]], i64 [[TMP88]]
-; AVX1-NEXT:    [[TMP89:%.*]] = load i16, i16* [[ARRAYIDX10]], align 2
-; AVX1-NEXT:    [[CONV11:%.*]] = sext i16 [[TMP89]] to i32
-; AVX1-NEXT:    [[ARRAYIDX15:%.*]] = getelementptr inbounds i16, i16* [[S2]], i64 [[TMP88]]
-; AVX1-NEXT:    [[TMP90:%.*]] = load i16, i16* [[ARRAYIDX15]], align 2
-; AVX1-NEXT:    [[CONV16:%.*]] = sext i16 [[TMP90]] to i32
+; AVX1-NEXT:    [[TMP64:%.*]] = or i64 [[TMP61]], 1
+; AVX1-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, ptr [[S1]], i64 [[TMP64]]
+; AVX1-NEXT:    [[TMP65:%.*]] = load i16, ptr [[ARRAYIDX10]], align 2
+; AVX1-NEXT:    [[CONV11:%.*]] = sext i16 [[TMP65]] to i32
+; AVX1-NEXT:    [[ARRAYIDX15:%.*]] = getelementptr inbounds i16, ptr [[S2]], i64 [[TMP64]]
+; AVX1-NEXT:    [[TMP66:%.*]] = load i16, ptr [[ARRAYIDX15]], align 2
+; AVX1-NEXT:    [[CONV16:%.*]] = sext i16 [[TMP66]] to i32
 ; AVX1-NEXT:    [[MUL17:%.*]] = mul nsw i32 [[CONV16]], [[CONV11]]
 ; AVX1-NEXT:    [[ADD18:%.*]] = add nsw i32 [[MUL17]], [[MUL6]]
-; AVX1-NEXT:    [[ARRAYIDX20:%.*]] = getelementptr inbounds i32, i32* [[D1]], i64 [[INDVARS_IV]]
-; AVX1-NEXT:    store i32 [[ADD18]], i32* [[ARRAYIDX20]], align 4
+; AVX1-NEXT:    [[ARRAYIDX20:%.*]] = getelementptr inbounds i32, ptr [[D1]], i64 [[INDVARS_IV]]
+; AVX1-NEXT:    store i32 [[ADD18]], ptr [[ARRAYIDX20]], align 4
 ; AVX1-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; AVX1-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
 ; AVX1-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
@@ -353,32 +332,29 @@ define void @test_muladd(i32* noalias nocapture %d1, i16* noalias nocapture read
 ; AVX2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; AVX2-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
 ; AVX2-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 1
-; AVX2-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i16, i16* [[S1:%.*]], i64 [[TMP1]]
-; AVX2-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i16, i16* [[TMP2]], i32 0
-; AVX2-NEXT:    [[TMP4:%.*]] = bitcast i16* [[TMP3]] to <16 x i16>*
-; AVX2-NEXT:    [[WIDE_VEC:%.*]] = load <16 x i16>, <16 x i16>* [[TMP4]], align 2
+; AVX2-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i16, ptr [[S1:%.*]], i64 [[TMP1]]
+; AVX2-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i16, ptr [[TMP2]], i32 0
+; AVX2-NEXT:    [[WIDE_VEC:%.*]] = load <16 x i16>, ptr [[TMP3]], align 2
 ; AVX2-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <16 x i16> [[WIDE_VEC]], <16 x i16> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
 ; AVX2-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <16 x i16> [[WIDE_VEC]], <16 x i16> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-; AVX2-NEXT:    [[TMP5:%.*]] = sext <8 x i16> [[STRIDED_VEC]] to <8 x i32>
-; AVX2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i16, i16* [[S2:%.*]], i64 [[TMP1]]
-; AVX2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i16, i16* [[TMP6]], i32 0
-; AVX2-NEXT:    [[TMP8:%.*]] = bitcast i16* [[TMP7]] to <16 x i16>*
-; AVX2-NEXT:    [[WIDE_VEC2:%.*]] = load <16 x i16>, <16 x i16>* [[TMP8]], align 2
+; AVX2-NEXT:    [[TMP4:%.*]] = sext <8 x i16> [[STRIDED_VEC]] to <8 x i32>
+; AVX2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i16, ptr [[S2:%.*]], i64 [[TMP1]]
+; AVX2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[TMP5]], i32 0
+; AVX2-NEXT:    [[WIDE_VEC2:%.*]] = load <16 x i16>, ptr [[TMP6]], align 2
 ; AVX2-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <16 x i16> [[WIDE_VEC2]], <16 x i16> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
 ; AVX2-NEXT:    [[STRIDED_VEC4:%.*]] = shufflevector <16 x i16> [[WIDE_VEC2]], <16 x i16> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-; AVX2-NEXT:    [[TMP9:%.*]] = sext <8 x i16> [[STRIDED_VEC3]] to <8 x i32>
-; AVX2-NEXT:    [[TMP10:%.*]] = mul nsw <8 x i32> [[TMP9]], [[TMP5]]
-; AVX2-NEXT:    [[TMP13:%.*]] = sext <8 x i16> [[STRIDED_VEC1]] to <8 x i32>
-; AVX2-NEXT:    [[TMP15:%.*]] = sext <8 x i16> [[STRIDED_VEC4]] to <8 x i32>
-; AVX2-NEXT:    [[TMP16:%.*]] = mul nsw <8 x i32> [[TMP15]], [[TMP13]]
-; AVX2-NEXT:    [[TMP17:%.*]] = add nsw <8 x i32> [[TMP16]], [[TMP10]]
-; AVX2-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[D1:%.*]], i64 [[TMP0]]
-; AVX2-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i32, i32* [[TMP18]], i32 0
-; AVX2-NEXT:    [[TMP20:%.*]] = bitcast i32* [[TMP19]] to <8 x i32>*
-; AVX2-NEXT:    store <8 x i32> [[TMP17]], <8 x i32>* [[TMP20]], align 4
+; AVX2-NEXT:    [[TMP7:%.*]] = sext <8 x i16> [[STRIDED_VEC3]] to <8 x i32>
+; AVX2-NEXT:    [[TMP8:%.*]] = mul nsw <8 x i32> [[TMP7]], [[TMP4]]
+; AVX2-NEXT:    [[TMP9:%.*]] = sext <8 x i16> [[STRIDED_VEC1]] to <8 x i32>
+; AVX2-NEXT:    [[TMP10:%.*]] = sext <8 x i16> [[STRIDED_VEC4]] to <8 x i32>
+; AVX2-NEXT:    [[TMP11:%.*]] = mul nsw <8 x i32> [[TMP10]], [[TMP9]]
+; AVX2-NEXT:    [[TMP12:%.*]] = add nsw <8 x i32> [[TMP11]], [[TMP8]]
+; AVX2-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[D1:%.*]], i64 [[TMP0]]
+; AVX2-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0
+; AVX2-NEXT:    store <8 x i32> [[TMP12]], ptr [[TMP14]], align 4
 ; AVX2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; AVX2-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; AVX2-NEXT:    br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; AVX2-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; AVX2-NEXT:    br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; AVX2:       middle.block:
 ; AVX2-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
 ; AVX2-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
@@ -387,25 +363,25 @@ define void @test_muladd(i32* noalias nocapture %d1, i16* noalias nocapture read
 ; AVX2-NEXT:    br label [[FOR_BODY:%.*]]
 ; AVX2:       for.body:
 ; AVX2-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; AVX2-NEXT:    [[TMP22:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 1
-; AVX2-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[S1]], i64 [[TMP22]]
-; AVX2-NEXT:    [[TMP23:%.*]] = load i16, i16* [[ARRAYIDX]], align 2
-; AVX2-NEXT:    [[CONV:%.*]] = sext i16 [[TMP23]] to i32
-; AVX2-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i16, i16* [[S2]], i64 [[TMP22]]
-; AVX2-NEXT:    [[TMP24:%.*]] = load i16, i16* [[ARRAYIDX4]], align 2
-; AVX2-NEXT:    [[CONV5:%.*]] = sext i16 [[TMP24]] to i32
+; AVX2-NEXT:    [[TMP16:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 1
+; AVX2-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[S1]], i64 [[TMP16]]
+; AVX2-NEXT:    [[TMP17:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
+; AVX2-NEXT:    [[CONV:%.*]] = sext i16 [[TMP17]] to i32
+; AVX2-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i16, ptr [[S2]], i64 [[TMP16]]
+; AVX2-NEXT:    [[TMP18:%.*]] = load i16, ptr [[ARRAYIDX4]], align 2
+; AVX2-NEXT:    [[CONV5:%.*]] = sext i16 [[TMP18]] to i32
 ; AVX2-NEXT:    [[MUL6:%.*]] = mul nsw i32 [[CONV5]], [[CONV]]
-; AVX2-NEXT:    [[TMP25:%.*]] = or i64 [[TMP22]], 1
-; AVX2-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, i16* [[S1]], i64 [[TMP25]]
-; AVX2-NEXT:    [[TMP26:%.*]] = load i16, i16* [[ARRAYIDX10]], align 2
-; AVX2-NEXT:    [[CONV11:%.*]] = sext i16 [[TMP26]] to i32
-; AVX2-NEXT:    [[ARRAYIDX15:%.*]] = getelementptr inbounds i16, i16* [[S2]], i64 [[TMP25]]
-; AVX2-NEXT:    [[TMP27:%.*]] = load i16, i16* [[ARRAYIDX15]], align 2
-; AVX2-NEXT:    [[CONV16:%.*]] = sext i16 [[TMP27]] to i32
+; AVX2-NEXT:    [[TMP19:%.*]] = or i64 [[TMP16]], 1
+; AVX2-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, ptr [[S1]], i64 [[TMP19]]
+; AVX2-NEXT:    [[TMP20:%.*]] = load i16, ptr [[ARRAYIDX10]], align 2
+; AVX2-NEXT:    [[CONV11:%.*]] = sext i16 [[TMP20]] to i32
+; AVX2-NEXT:    [[ARRAYIDX15:%.*]] = getelementptr inbounds i16, ptr [[S2]], i64 [[TMP19]]
+; AVX2-NEXT:    [[TMP21:%.*]] = load i16, ptr [[ARRAYIDX15]], align 2
+; AVX2-NEXT:    [[CONV16:%.*]] = sext i16 [[TMP21]] to i32
 ; AVX2-NEXT:    [[MUL17:%.*]] = mul nsw i32 [[CONV16]], [[CONV11]]
 ; AVX2-NEXT:    [[ADD18:%.*]] = add nsw i32 [[MUL17]], [[MUL6]]
-; AVX2-NEXT:    [[ARRAYIDX20:%.*]] = getelementptr inbounds i32, i32* [[D1]], i64 [[INDVARS_IV]]
-; AVX2-NEXT:    store i32 [[ADD18]], i32* [[ARRAYIDX20]], align 4
+; AVX2-NEXT:    [[ARRAYIDX20:%.*]] = getelementptr inbounds i32, ptr [[D1]], i64 [[INDVARS_IV]]
+; AVX2-NEXT:    store i32 [[ADD18]], ptr [[ARRAYIDX20]], align 4
 ; AVX2-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; AVX2-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
 ; AVX2-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
@@ -425,24 +401,24 @@ for.body.preheader:
 for.body:
   %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
   %0 = shl nuw nsw i64 %indvars.iv, 1
-  %arrayidx = getelementptr inbounds i16, i16* %s1, i64 %0
-  %1 = load i16, i16* %arrayidx, align 2
+  %arrayidx = getelementptr inbounds i16, ptr %s1, i64 %0
+  %1 = load i16, ptr %arrayidx, align 2
   %conv = sext i16 %1 to i32
-  %arrayidx4 = getelementptr inbounds i16, i16* %s2, i64 %0
-  %2 = load i16, i16* %arrayidx4, align 2
+  %arrayidx4 = getelementptr inbounds i16, ptr %s2, i64 %0
+  %2 = load i16, ptr %arrayidx4, align 2
   %conv5 = sext i16 %2 to i32
   %mul6 = mul nsw i32 %conv5, %conv
   %3 = or i64 %0, 1
-  %arrayidx10 = getelementptr inbounds i16, i16* %s1, i64 %3
-  %4 = load i16, i16* %arrayidx10, align 2
+  %arrayidx10 = getelementptr inbounds i16, ptr %s1, i64 %3
+  %4 = load i16, ptr %arrayidx10, align 2
   %conv11 = sext i16 %4 to i32
-  %arrayidx15 = getelementptr inbounds i16, i16* %s2, i64 %3
-  %5 = load i16, i16* %arrayidx15, align 2
+  %arrayidx15 = getelementptr inbounds i16, ptr %s2, i64 %3
+  %5 = load i16, ptr %arrayidx15, align 2
   %conv16 = sext i16 %5 to i32
   %mul17 = mul nsw i32 %conv16, %conv11
   %add18 = add nsw i32 %mul17, %mul6
-  %arrayidx20 = getelementptr inbounds i32, i32* %d1, i64 %indvars.iv
-  store i32 %add18, i32* %arrayidx20, align 4
+  %arrayidx20 = getelementptr inbounds i32, ptr %d1, i64 %indvars.iv
+  store i32 %add18, ptr %arrayidx20, align 4
   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
   %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
   br i1 %exitcond.not, label %for.end.loopexit, label %for.body

diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr48340.ll b/llvm/test/Transforms/LoopVectorize/X86/pr48340.ll
index c1ae5c879e6e..b77ac10925e4 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/pr48340.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/pr48340.ll
@@ -7,46 +7,122 @@ target triple = "x86_64-unknown-linux-gnu"
 %0 = type { i32 }
 %1 = type { i64 }
 
-define void @foo(i64* %p, i64* %p.last) unnamed_addr #0 {
+define void @foo(ptr %p, ptr %p.last) unnamed_addr #0 {
 ; CHECK-LABEL: @foo(
-; CHECK: vector.body:
-; CHECK:         [[WIDE_MASKED_GATHER:%.*]] = call <4 x %0*> @llvm.masked.gather.v4p0s_s.v4p0p0s_s.0(<4 x %0**> [[TMP11:%.*]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x %0*> poison)
-; CHECK-NEXT:    [[WIDE_MASKED_GATHER5:%.*]] = call <4 x %0*> @llvm.masked.gather.v4p0s_s.v4p0p0s_s.0(<4 x %0**> [[TMP12:%.*]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x %0*> poison)
-; CHECK-NEXT:    [[WIDE_MASKED_GATHER6:%.*]] = call <4 x %0*> @llvm.masked.gather.v4p0s_s.v4p0p0s_s.0(<4 x %0**> [[TMP13:%.*]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x %0*> poison)
-; CHECK-NEXT:    [[WIDE_MASKED_GATHER7:%.*]] = call <4 x %0*> @llvm.masked.gather.v4p0s_s.v4p0p0s_s.0(<4 x %0**> [[TMP14:%.*]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x %0*> poison)
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[P3:%.*]] = ptrtoint ptr [[P:%.*]] to i64
+; CHECK-NEXT:    [[P_LAST1:%.*]] = ptrtoint ptr [[P_LAST:%.*]] to i64
+; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[P_LAST1]], -1024
+; CHECK-NEXT:    [[TMP1:%.*]] = sub i64 [[TMP0]], [[P3]]
+; CHECK-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP1]], 10
+; CHECK-NEXT:    [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP3]], 16
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP3]], 16
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[TMP4:%.*]] = mul i64 [[N_VEC]], 1024
+; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP4]]
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[POINTER_PHI:%.*]] = phi ptr [ [[P]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <4 x i64> <i64 0, i64 1024, i64 2048, i64 3072>
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <4 x i64> <i64 4096, i64 5120, i64 6144, i64 7168>
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <4 x i64> <i64 8192, i64 9216, i64 10240, i64 11264>
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <4 x i64> <i64 12288, i64 13312, i64 14336, i64 15360>
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <4 x ptr> @llvm.masked.gather.v4p0.v4p0(<4 x ptr> [[TMP5]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x ptr> poison)
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER4:%.*]] = call <4 x ptr> @llvm.masked.gather.v4p0.v4p0(<4 x ptr> [[TMP6]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x ptr> poison)
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER5:%.*]] = call <4 x ptr> @llvm.masked.gather.v4p0.v4p0(<4 x ptr> [[TMP7]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x ptr> poison)
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER6:%.*]] = call <4 x ptr> @llvm.masked.gather.v4p0.v4p0(<4 x ptr> [[TMP8]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x ptr> poison)
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-NEXT:    [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 16384
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[P]], [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[P2:%.*]] = phi ptr [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[P_INC:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[P_INC]] = getelementptr inbounds i64, ptr [[P2]], i64 128
+; CHECK-NEXT:    [[V:%.*]] = load ptr, ptr [[P2]], align 8
+; CHECK-NEXT:    [[B:%.*]] = icmp eq ptr [[P_INC]], [[P_LAST]]
+; CHECK-NEXT:    br i1 [[B]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
 ;
 entry:
   br label %loop
 
 loop:
-  %p2 = phi i64* [ %p, %entry ], [ %p.inc, %loop ]
-  %p.inc = getelementptr inbounds i64, i64* %p2, i64 128
-  %p3 = bitcast i64* %p2 to %0**
-  %v = load %0*, %0** %p3, align 8
-  %b = icmp eq i64* %p.inc, %p.last
+  %p2 = phi ptr [ %p, %entry ], [ %p.inc, %loop ]
+  %p.inc = getelementptr inbounds i64, ptr %p2, i64 128
+  %v = load ptr, ptr %p2, align 8
+  %b = icmp eq ptr %p.inc, %p.last
   br i1 %b, label %exit, label %loop
 
 exit:
   ret void
 }
 
-define void @bar(i64* %p, i64* %p.last) unnamed_addr #0 {
+define void @bar(ptr %p, ptr %p.last) unnamed_addr #0 {
 ; CHECK-LABEL: @bar(
-; CHECK: vector.body:
-; CHECK:         [[WIDE_MASKED_GATHER:%.*]] = call <4 x %1*> @llvm.masked.gather.v4p0s_s.v4p0p0s_s.1(<4 x %1**> [[TMP11:%.*]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x %1*> poison)
-; CHECK-NEXT:    [[WIDE_MASKED_GATHER5:%.*]] = call <4 x %1*> @llvm.masked.gather.v4p0s_s.v4p0p0s_s.1(<4 x %1**> [[TMP12:%.*]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x %1*> poison)
-; CHECK-NEXT:    [[WIDE_MASKED_GATHER6:%.*]] = call <4 x %1*> @llvm.masked.gather.v4p0s_s.v4p0p0s_s.1(<4 x %1**> [[TMP13:%.*]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x %1*> poison)
-; CHECK-NEXT:    [[WIDE_MASKED_GATHER7:%.*]] = call <4 x %1*> @llvm.masked.gather.v4p0s_s.v4p0p0s_s.1(<4 x %1**> [[TMP14:%.*]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x %1*> poison)
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[P3:%.*]] = ptrtoint ptr [[P:%.*]] to i64
+; CHECK-NEXT:    [[P_LAST1:%.*]] = ptrtoint ptr [[P_LAST:%.*]] to i64
+; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[P_LAST1]], -1024
+; CHECK-NEXT:    [[TMP1:%.*]] = sub i64 [[TMP0]], [[P3]]
+; CHECK-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP1]], 10
+; CHECK-NEXT:    [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP3]], 16
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP3]], 16
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[TMP4:%.*]] = mul i64 [[N_VEC]], 1024
+; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP4]]
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[POINTER_PHI:%.*]] = phi ptr [ [[P]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <4 x i64> <i64 0, i64 1024, i64 2048, i64 3072>
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <4 x i64> <i64 4096, i64 5120, i64 6144, i64 7168>
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <4 x i64> <i64 8192, i64 9216, i64 10240, i64 11264>
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <4 x i64> <i64 12288, i64 13312, i64 14336, i64 15360>
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <4 x ptr> @llvm.masked.gather.v4p0.v4p0(<4 x ptr> [[TMP5]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x ptr> poison)
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER4:%.*]] = call <4 x ptr> @llvm.masked.gather.v4p0.v4p0(<4 x ptr> [[TMP6]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x ptr> poison)
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER5:%.*]] = call <4 x ptr> @llvm.masked.gather.v4p0.v4p0(<4 x ptr> [[TMP7]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x ptr> poison)
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER6:%.*]] = call <4 x ptr> @llvm.masked.gather.v4p0.v4p0(<4 x ptr> [[TMP8]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x ptr> poison)
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-NEXT:    [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 16384
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[P]], [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[P2:%.*]] = phi ptr [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[P_INC:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[P_INC]] = getelementptr inbounds i64, ptr [[P2]], i64 128
+; CHECK-NEXT:    [[V:%.*]] = load ptr, ptr [[P2]], align 8
+; CHECK-NEXT:    [[B:%.*]] = icmp eq ptr [[P_INC]], [[P_LAST]]
+; CHECK-NEXT:    br i1 [[B]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
 ;
 entry:
   br label %loop
 
 loop:
-  %p2 = phi i64* [ %p, %entry ], [ %p.inc, %loop ]
-  %p.inc = getelementptr inbounds i64, i64* %p2, i64 128
-  %p3 = bitcast i64* %p2 to %1**
-  %v = load %1*, %1** %p3, align 8
-  %b = icmp eq i64* %p.inc, %p.last
+  %p2 = phi ptr [ %p, %entry ], [ %p.inc, %loop ]
+  %p.inc = getelementptr inbounds i64, ptr %p2, i64 128
+  %v = load ptr, ptr %p2, align 8
+  %b = icmp eq ptr %p.inc, %p.last
   br i1 %b, label %exit, label %loop
 
 exit:

diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr54634.ll b/llvm/test/Transforms/LoopVectorize/X86/pr54634.ll
index 9e65096e5fa6..60c30a5b2ae7 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/pr54634.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/pr54634.ll
@@ -3,138 +3,131 @@
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128-ni:10:11:12:13"
 target triple = "x86_64-unknown-linux-gnu"
 
-@jlplt_ijl_alloc_array_1d_10294_got = external dso_local local_unnamed_addr global void ()*
+@jlplt_ijl_alloc_array_1d_10294_got = external dso_local local_unnamed_addr global ptr
 
-define {} addrspace(10)* @japi1_vect_42283({} addrspace(10)** nocapture readonly %0, i32 %1) local_unnamed_addr #0 {
+define ptr addrspace(10) @japi1_vect_42283(ptr nocapture readonly %0, i32 %1) local_unnamed_addr #0 {
 ; CHECK-LABEL: @japi1_vect_42283(
 ; CHECK-NEXT:  top:
 ; CHECK-NEXT:    [[TMP2:%.*]] = sext i32 [[TMP1:%.*]] to i64
-; CHECK-NEXT:    [[TMP3:%.*]] = load atomic {} addrspace(10)* ({} addrspace(10)*, i64)*, {} addrspace(10)* ({} addrspace(10)*, i64)** bitcast (void ()** @jlplt_ijl_alloc_array_1d_10294_got to {} addrspace(10)* ({} addrspace(10)*, i64)**) unordered, align 8
-; CHECK-NEXT:    [[TMP4:%.*]] = tail call {} addrspace(10)* [[TMP3]]({} addrspace(10)* null, i64 0)
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast {} addrspace(10)** [[TMP0:%.*]] to { {} addrspace(10)*, i64 } addrspace(10)**
-; CHECK-NEXT:    [[TMP6:%.*]] = load { {} addrspace(10)*, i64 } addrspace(10)*, { {} addrspace(10)*, i64 } addrspace(10)** [[TMP5]], align 8, !tbaa [[TBAA0:![0-9]+]]
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast {} addrspace(10)* [[TMP4]] to { {} addrspace(10)*, i64 } addrspace(13)* addrspace(10)*
-; CHECK-NEXT:    [[TMP8:%.*]] = addrspacecast { {} addrspace(10)*, i64 } addrspace(13)* addrspace(10)* [[TMP7]] to { {} addrspace(10)*, i64 } addrspace(13)* addrspace(11)*
-; CHECK-NEXT:    [[TMP9:%.*]] = load { {} addrspace(10)*, i64 } addrspace(13)*, { {} addrspace(10)*, i64 } addrspace(13)* addrspace(11)* [[TMP8]], align 8, !tbaa [[TBAA5:![0-9]+]]
-; CHECK-NEXT:    [[TMP10:%.*]] = bitcast { {} addrspace(10)*, i64 } addrspace(13)* [[TMP9]] to i8 addrspace(13)*
-; CHECK-NEXT:    [[DOTELT:%.*]] = getelementptr inbounds { {} addrspace(10)*, i64 }, { {} addrspace(10)*, i64 } addrspace(10)* [[TMP6]], i64 0, i32 0
-; CHECK-NEXT:    [[DOTUNPACK:%.*]] = load {} addrspace(10)*, {} addrspace(10)* addrspace(10)* [[DOTELT]], align 8, !tbaa [[TBAA8:![0-9]+]]
-; CHECK-NEXT:    [[DOTELT1:%.*]] = getelementptr inbounds { {} addrspace(10)*, i64 }, { {} addrspace(10)*, i64 } addrspace(10)* [[TMP6]], i64 0, i32 1
-; CHECK-NEXT:    [[DOTUNPACK2:%.*]] = load i64, i64 addrspace(10)* [[DOTELT1]], align 8, !tbaa [[TBAA8]]
-; CHECK-NEXT:    [[TMP11:%.*]] = add nsw i64 [[TMP2]], 1
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP11]], 28
+; CHECK-NEXT:    [[TMP3:%.*]] = load atomic ptr, ptr @jlplt_ijl_alloc_array_1d_10294_got unordered, align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = tail call ptr addrspace(10) [[TMP3]](ptr addrspace(10) null, i64 0)
+; CHECK-NEXT:    [[TMP5:%.*]] = load ptr addrspace(10), ptr [[TMP0:%.*]], align 8, !tbaa [[TBAA0:![0-9]+]]
+; CHECK-NEXT:    [[TMP6:%.*]] = addrspacecast ptr addrspace(10) [[TMP4]] to ptr addrspace(11)
+; CHECK-NEXT:    [[TMP7:%.*]] = load ptr addrspace(13), ptr addrspace(11) [[TMP6]], align 8, !tbaa [[TBAA5:![0-9]+]]
+; CHECK-NEXT:    [[DOTELT:%.*]] = getelementptr inbounds { ptr addrspace(10), i64 }, ptr addrspace(10) [[TMP5]], i64 0, i32 0
+; CHECK-NEXT:    [[DOTUNPACK:%.*]] = load ptr addrspace(10), ptr addrspace(10) [[DOTELT]], align 8, !tbaa [[TBAA8:![0-9]+]]
+; CHECK-NEXT:    [[DOTELT1:%.*]] = getelementptr inbounds { ptr addrspace(10), i64 }, ptr addrspace(10) [[TMP5]], i64 0, i32 1
+; CHECK-NEXT:    [[DOTUNPACK2:%.*]] = load i64, ptr addrspace(10) [[DOTELT1]], align 8, !tbaa [[TBAA8]]
+; CHECK-NEXT:    [[TMP8:%.*]] = add nsw i64 [[TMP2]], 1
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP8]], 28
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
 ; CHECK:       vector.scevcheck:
 ; CHECK-NEXT:    [[MUL:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 16, i64 [[TMP2]])
 ; CHECK-NEXT:    [[MUL_RESULT:%.*]] = extractvalue { i64, i1 } [[MUL]], 0
 ; CHECK-NEXT:    [[MUL_OVERFLOW:%.*]] = extractvalue { i64, i1 } [[MUL]], 1
-; CHECK-NEXT:    [[TMP12:%.*]] = sub i64 0, [[MUL_RESULT]]
-; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr i8, i8 addrspace(13)* [[TMP10]], i64 [[MUL_RESULT]]
-; CHECK-NEXT:    [[TMP14:%.*]] = icmp ult i8 addrspace(13)* [[TMP13]], [[TMP10]]
-; CHECK-NEXT:    [[TMP15:%.*]] = or i1 [[TMP14]], [[MUL_OVERFLOW]]
-; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr { {} addrspace(10)*, i64 }, { {} addrspace(10)*, i64 } addrspace(13)* [[TMP9]], i64 0, i32 1
-; CHECK-NEXT:    [[SCEVGEP1:%.*]] = bitcast i64 addrspace(13)* [[SCEVGEP]] to { {} addrspace(10)*, i64 } addrspace(13)*
-; CHECK-NEXT:    [[MUL2:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 16, i64 [[TMP2]])
-; CHECK-NEXT:    [[MUL_RESULT3:%.*]] = extractvalue { i64, i1 } [[MUL2]], 0
-; CHECK-NEXT:    [[MUL_OVERFLOW4:%.*]] = extractvalue { i64, i1 } [[MUL2]], 1
-; CHECK-NEXT:    [[SCEVGEP15:%.*]] = bitcast { {} addrspace(10)*, i64 } addrspace(13)* [[SCEVGEP1]] to i8 addrspace(13)*
-; CHECK-NEXT:    [[TMP16:%.*]] = sub i64 0, [[MUL_RESULT3]]
-; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr i8, i8 addrspace(13)* [[SCEVGEP15]], i64 [[MUL_RESULT3]]
-; CHECK-NEXT:    [[TMP18:%.*]] = icmp ult i8 addrspace(13)* [[TMP17]], [[SCEVGEP15]]
-; CHECK-NEXT:    [[TMP19:%.*]] = or i1 [[TMP18]], [[MUL_OVERFLOW4]]
-; CHECK-NEXT:    [[TMP20:%.*]] = or i1 [[TMP15]], [[TMP19]]
-; CHECK-NEXT:    br i1 [[TMP20]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT:    [[TMP9:%.*]] = sub i64 0, [[MUL_RESULT]]
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr addrspace(13) [[TMP7]], i64 [[MUL_RESULT]]
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp ult ptr addrspace(13) [[TMP10]], [[TMP7]]
+; CHECK-NEXT:    [[TMP12:%.*]] = or i1 [[TMP11]], [[MUL_OVERFLOW]]
+; CHECK-NEXT:    [[UGLYGEP:%.*]] = getelementptr i8, ptr addrspace(13) [[TMP7]], i64 8
+; CHECK-NEXT:    [[MUL1:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 16, i64 [[TMP2]])
+; CHECK-NEXT:    [[MUL_RESULT2:%.*]] = extractvalue { i64, i1 } [[MUL1]], 0
+; CHECK-NEXT:    [[MUL_OVERFLOW3:%.*]] = extractvalue { i64, i1 } [[MUL1]], 1
+; CHECK-NEXT:    [[TMP13:%.*]] = sub i64 0, [[MUL_RESULT2]]
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr addrspace(13) [[UGLYGEP]], i64 [[MUL_RESULT2]]
+; CHECK-NEXT:    [[TMP15:%.*]] = icmp ult ptr addrspace(13) [[TMP14]], [[UGLYGEP]]
+; CHECK-NEXT:    [[TMP16:%.*]] = or i1 [[TMP15]], [[MUL_OVERFLOW3]]
+; CHECK-NEXT:    [[TMP17:%.*]] = or i1 [[TMP12]], [[TMP16]]
+; CHECK-NEXT:    br i1 [[TMP17]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP11]], 16
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP11]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x {} addrspace(10)*> poison, {} addrspace(10)* [[DOTUNPACK]], i32 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x {} addrspace(10)*> [[BROADCAST_SPLATINSERT]], <4 x {} addrspace(10)*> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT9:%.*]] = insertelement <4 x {} addrspace(10)*> poison, {} addrspace(10)* [[DOTUNPACK]], i32 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT10:%.*]] = shufflevector <4 x {} addrspace(10)*> [[BROADCAST_SPLATINSERT9]], <4 x {} addrspace(10)*> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT11:%.*]] = insertelement <4 x {} addrspace(10)*> poison, {} addrspace(10)* [[DOTUNPACK]], i32 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT12:%.*]] = shufflevector <4 x {} addrspace(10)*> [[BROADCAST_SPLATINSERT11]], <4 x {} addrspace(10)*> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT13:%.*]] = insertelement <4 x {} addrspace(10)*> poison, {} addrspace(10)* [[DOTUNPACK]], i32 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT14:%.*]] = shufflevector <4 x {} addrspace(10)*> [[BROADCAST_SPLATINSERT13]], <4 x {} addrspace(10)*> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP8]], 16
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP8]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x ptr addrspace(10)> poison, ptr addrspace(10) [[DOTUNPACK]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x ptr addrspace(10)> [[BROADCAST_SPLATINSERT]], <4 x ptr addrspace(10)> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <4 x ptr addrspace(10)> poison, ptr addrspace(10) [[DOTUNPACK]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT8:%.*]] = shufflevector <4 x ptr addrspace(10)> [[BROADCAST_SPLATINSERT7]], <4 x ptr addrspace(10)> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT9:%.*]] = insertelement <4 x ptr addrspace(10)> poison, ptr addrspace(10) [[DOTUNPACK]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT10:%.*]] = shufflevector <4 x ptr addrspace(10)> [[BROADCAST_SPLATINSERT9]], <4 x ptr addrspace(10)> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT11:%.*]] = insertelement <4 x ptr addrspace(10)> poison, ptr addrspace(10) [[DOTUNPACK]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT12:%.*]] = shufflevector <4 x ptr addrspace(10)> [[BROADCAST_SPLATINSERT11]], <4 x ptr addrspace(10)> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT13:%.*]] = insertelement <4 x i64> poison, i64 [[DOTUNPACK2]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT14:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT13]], <4 x i64> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT15:%.*]] = insertelement <4 x i64> poison, i64 [[DOTUNPACK2]], i32 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT16:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT15]], <4 x i64> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT17:%.*]] = insertelement <4 x i64> poison, i64 [[DOTUNPACK2]], i32 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT18:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT17]], <4 x i64> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT19:%.*]] = insertelement <4 x i64> poison, i64 [[DOTUNPACK2]], i32 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT20:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT19]], <4 x i64> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT21:%.*]] = insertelement <4 x i64> poison, i64 [[DOTUNPACK2]], i32 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT22:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT21]], <4 x i64> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
-; CHECK-NEXT:    [[STEP_ADD6:%.*]] = add <4 x i64> [[STEP_ADD]], <i64 4, i64 4, i64 4, i64 4>
-; CHECK-NEXT:    [[STEP_ADD7:%.*]] = add <4 x i64> [[STEP_ADD6]], <i64 4, i64 4, i64 4, i64 4>
-; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds { {} addrspace(10)*, i64 }, { {} addrspace(10)*, i64 } addrspace(13)* [[TMP9]], <4 x i64> [[VEC_IND]], i32 0
-; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds { {} addrspace(10)*, i64 }, { {} addrspace(10)*, i64 } addrspace(13)* [[TMP9]], <4 x i64> [[STEP_ADD]], i32 0
-; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds { {} addrspace(10)*, i64 }, { {} addrspace(10)*, i64 } addrspace(13)* [[TMP9]], <4 x i64> [[STEP_ADD6]], i32 0
-; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr inbounds { {} addrspace(10)*, i64 }, { {} addrspace(10)*, i64 } addrspace(13)* [[TMP9]], <4 x i64> [[STEP_ADD7]], i32 0
-; CHECK-NEXT:    call void @llvm.masked.scatter.v4p10sl_s.v4p13p10sl_s(<4 x {} addrspace(10)*> [[BROADCAST_SPLAT]], <4 x {} addrspace(10)* addrspace(13)*> [[TMP21]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>), !tbaa [[TBAA10:![0-9]+]]
-; CHECK-NEXT:    call void @llvm.masked.scatter.v4p10sl_s.v4p13p10sl_s(<4 x {} addrspace(10)*> [[BROADCAST_SPLAT10]], <4 x {} addrspace(10)* addrspace(13)*> [[TMP22]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>), !tbaa [[TBAA10]]
-; CHECK-NEXT:    call void @llvm.masked.scatter.v4p10sl_s.v4p13p10sl_s(<4 x {} addrspace(10)*> [[BROADCAST_SPLAT12]], <4 x {} addrspace(10)* addrspace(13)*> [[TMP23]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>), !tbaa [[TBAA10]]
-; CHECK-NEXT:    call void @llvm.masked.scatter.v4p10sl_s.v4p13p10sl_s(<4 x {} addrspace(10)*> [[BROADCAST_SPLAT14]], <4 x {} addrspace(10)* addrspace(13)*> [[TMP24]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>), !tbaa [[TBAA10]]
-; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr inbounds { {} addrspace(10)*, i64 }, { {} addrspace(10)*, i64 } addrspace(13)* [[TMP9]], <4 x i64> [[VEC_IND]], i32 1
-; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr inbounds { {} addrspace(10)*, i64 }, { {} addrspace(10)*, i64 } addrspace(13)* [[TMP9]], <4 x i64> [[STEP_ADD]], i32 1
-; CHECK-NEXT:    [[TMP27:%.*]] = getelementptr inbounds { {} addrspace(10)*, i64 }, { {} addrspace(10)*, i64 } addrspace(13)* [[TMP9]], <4 x i64> [[STEP_ADD6]], i32 1
-; CHECK-NEXT:    [[TMP28:%.*]] = getelementptr inbounds { {} addrspace(10)*, i64 }, { {} addrspace(10)*, i64 } addrspace(13)* [[TMP9]], <4 x i64> [[STEP_ADD7]], i32 1
-; CHECK-NEXT:    call void @llvm.masked.scatter.v4i64.v4p13i64(<4 x i64> [[BROADCAST_SPLAT16]], <4 x i64 addrspace(13)*> [[TMP25]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>), !tbaa [[TBAA10]]
-; CHECK-NEXT:    call void @llvm.masked.scatter.v4i64.v4p13i64(<4 x i64> [[BROADCAST_SPLAT18]], <4 x i64 addrspace(13)*> [[TMP26]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>), !tbaa [[TBAA10]]
-; CHECK-NEXT:    call void @llvm.masked.scatter.v4i64.v4p13i64(<4 x i64> [[BROADCAST_SPLAT20]], <4 x i64 addrspace(13)*> [[TMP27]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>), !tbaa [[TBAA10]]
-; CHECK-NEXT:    call void @llvm.masked.scatter.v4i64.v4p13i64(<4 x i64> [[BROADCAST_SPLAT22]], <4 x i64 addrspace(13)*> [[TMP28]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>), !tbaa [[TBAA10]]
+; CHECK-NEXT:    [[STEP_ADD4:%.*]] = add <4 x i64> [[STEP_ADD]], <i64 4, i64 4, i64 4, i64 4>
+; CHECK-NEXT:    [[STEP_ADD5:%.*]] = add <4 x i64> [[STEP_ADD4]], <i64 4, i64 4, i64 4, i64 4>
+; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds { ptr addrspace(10), i64 }, ptr addrspace(13) [[TMP7]], <4 x i64> [[VEC_IND]], i32 0
+; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds { ptr addrspace(10), i64 }, ptr addrspace(13) [[TMP7]], <4 x i64> [[STEP_ADD]], i32 0
+; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds { ptr addrspace(10), i64 }, ptr addrspace(13) [[TMP7]], <4 x i64> [[STEP_ADD4]], i32 0
+; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds { ptr addrspace(10), i64 }, ptr addrspace(13) [[TMP7]], <4 x i64> [[STEP_ADD5]], i32 0
+; CHECK-NEXT:    call void @llvm.masked.scatter.v4p10.v4p13(<4 x ptr addrspace(10)> [[BROADCAST_SPLAT]], <4 x ptr addrspace(13)> [[TMP18]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>), !tbaa [[TBAA10:![0-9]+]]
+; CHECK-NEXT:    call void @llvm.masked.scatter.v4p10.v4p13(<4 x ptr addrspace(10)> [[BROADCAST_SPLAT8]], <4 x ptr addrspace(13)> [[TMP19]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>), !tbaa [[TBAA10]]
+; CHECK-NEXT:    call void @llvm.masked.scatter.v4p10.v4p13(<4 x ptr addrspace(10)> [[BROADCAST_SPLAT10]], <4 x ptr addrspace(13)> [[TMP20]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>), !tbaa [[TBAA10]]
+; CHECK-NEXT:    call void @llvm.masked.scatter.v4p10.v4p13(<4 x ptr addrspace(10)> [[BROADCAST_SPLAT12]], <4 x ptr addrspace(13)> [[TMP21]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>), !tbaa [[TBAA10]]
+; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds { ptr addrspace(10), i64 }, ptr addrspace(13) [[TMP7]], <4 x i64> [[VEC_IND]], i32 1
+; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds { ptr addrspace(10), i64 }, ptr addrspace(13) [[TMP7]], <4 x i64> [[STEP_ADD]], i32 1
+; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr inbounds { ptr addrspace(10), i64 }, ptr addrspace(13) [[TMP7]], <4 x i64> [[STEP_ADD4]], i32 1
+; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr inbounds { ptr addrspace(10), i64 }, ptr addrspace(13) [[TMP7]], <4 x i64> [[STEP_ADD5]], i32 1
+; CHECK-NEXT:    call void @llvm.masked.scatter.v4i64.v4p13(<4 x i64> [[BROADCAST_SPLAT14]], <4 x ptr addrspace(13)> [[TMP22]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>), !tbaa [[TBAA10]]
+; CHECK-NEXT:    call void @llvm.masked.scatter.v4i64.v4p13(<4 x i64> [[BROADCAST_SPLAT16]], <4 x ptr addrspace(13)> [[TMP23]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>), !tbaa [[TBAA10]]
+; CHECK-NEXT:    call void @llvm.masked.scatter.v4i64.v4p13(<4 x i64> [[BROADCAST_SPLAT18]], <4 x ptr addrspace(13)> [[TMP24]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>), !tbaa [[TBAA10]]
+; CHECK-NEXT:    call void @llvm.masked.scatter.v4i64.v4p13(<4 x i64> [[BROADCAST_SPLAT20]], <4 x ptr addrspace(13)> [[TMP25]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>), !tbaa [[TBAA10]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD7]], <i64 4, i64 4, i64 4, i64 4>
-; CHECK-NEXT:    [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD5]], <i64 4, i64 4, i64 4, i64 4>
+; CHECK-NEXT:    [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP11]], [[N_VEC]]
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP8]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[L44:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[TOP:%.*]] ], [ 0, [[VECTOR_SCEVCHECK]] ]
 ; CHECK-NEXT:    br label [[L26:%.*]]
 ; CHECK:       L26:
-; CHECK-NEXT:    [[VALUE_PHI5:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[TMP30:%.*]], [[L26]] ]
-; CHECK-NEXT:    [[DOTREPACK:%.*]] = getelementptr inbounds { {} addrspace(10)*, i64 }, { {} addrspace(10)*, i64 } addrspace(13)* [[TMP9]], i64 [[VALUE_PHI5]], i32 0
-; CHECK-NEXT:    store {} addrspace(10)* [[DOTUNPACK]], {} addrspace(10)* addrspace(13)* [[DOTREPACK]], align 8, !tbaa [[TBAA10]]
-; CHECK-NEXT:    [[DOTREPACK4:%.*]] = getelementptr inbounds { {} addrspace(10)*, i64 }, { {} addrspace(10)*, i64 } addrspace(13)* [[TMP9]], i64 [[VALUE_PHI5]], i32 1
-; CHECK-NEXT:    store i64 [[DOTUNPACK2]], i64 addrspace(13)* [[DOTREPACK4]], align 8, !tbaa [[TBAA10]]
-; CHECK-NEXT:    [[TMP30]] = add i64 [[VALUE_PHI5]], 1
+; CHECK-NEXT:    [[VALUE_PHI5:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[TMP27:%.*]], [[L26]] ]
+; CHECK-NEXT:    [[DOTREPACK:%.*]] = getelementptr inbounds { ptr addrspace(10), i64 }, ptr addrspace(13) [[TMP7]], i64 [[VALUE_PHI5]], i32 0
+; CHECK-NEXT:    store ptr addrspace(10) [[DOTUNPACK]], ptr addrspace(13) [[DOTREPACK]], align 8, !tbaa [[TBAA10]]
+; CHECK-NEXT:    [[DOTREPACK4:%.*]] = getelementptr inbounds { ptr addrspace(10), i64 }, ptr addrspace(13) [[TMP7]], i64 [[VALUE_PHI5]], i32 1
+; CHECK-NEXT:    store i64 [[DOTUNPACK2]], ptr addrspace(13) [[DOTREPACK4]], align 8, !tbaa [[TBAA10]]
+; CHECK-NEXT:    [[TMP27]] = add i64 [[VALUE_PHI5]], 1
 ; CHECK-NEXT:    [[DOTNOT:%.*]] = icmp eq i64 [[VALUE_PHI5]], [[TMP2]]
 ; CHECK-NEXT:    br i1 [[DOTNOT]], label [[L44]], label [[L26]], !llvm.loop [[LOOP14:![0-9]+]]
 ; CHECK:       L44:
-; CHECK-NEXT:    ret {} addrspace(10)* null
+; CHECK-NEXT:    ret ptr addrspace(10) null
 ;
 top:
   %2 = sext i32 %1 to i64
-  %3 = load atomic {} addrspace(10)* ({} addrspace(10)*, i64)*, {} addrspace(10)* ({} addrspace(10)*, i64)** bitcast (void ()** @jlplt_ijl_alloc_array_1d_10294_got to {} addrspace(10)* ({} addrspace(10)*, i64)**) unordered, align 8
-  %4 = tail call {} addrspace(10)* %3({} addrspace(10)* null, i64 0)
-  %5 = bitcast {} addrspace(10)** %0 to { {} addrspace(10)*, i64 } addrspace(10)**
-  %6 = load { {} addrspace(10)*, i64 } addrspace(10)*, { {} addrspace(10)*, i64 } addrspace(10)** %5, align 8, !tbaa !0
-  %7 = bitcast {} addrspace(10)* %4 to { {} addrspace(10)*, i64 } addrspace(13)* addrspace(10)*
-  %8 = addrspacecast { {} addrspace(10)*, i64 } addrspace(13)* addrspace(10)* %7 to { {} addrspace(10)*, i64 } addrspace(13)* addrspace(11)*
-  %9 = load { {} addrspace(10)*, i64 } addrspace(13)*, { {} addrspace(10)*, i64 } addrspace(13)* addrspace(11)* %8, align 8, !tbaa !5
-  %.elt = getelementptr inbounds { {} addrspace(10)*, i64 }, { {} addrspace(10)*, i64 } addrspace(10)* %6, i64 0, i32 0
-  %.unpack = load {} addrspace(10)*, {} addrspace(10)* addrspace(10)* %.elt, align 8, !tbaa !8
-  %.elt1 = getelementptr inbounds { {} addrspace(10)*, i64 }, { {} addrspace(10)*, i64 } addrspace(10)* %6, i64 0, i32 1
-  %.unpack2 = load i64, i64 addrspace(10)* %.elt1, align 8, !tbaa !8
+  %3 = load atomic ptr, ptr @jlplt_ijl_alloc_array_1d_10294_got unordered, align 8
+  %4 = tail call ptr addrspace(10) %3(ptr addrspace(10) null, i64 0)
+  %5 = load ptr addrspace(10), ptr %0, align 8, !tbaa !0
+  %6 = addrspacecast ptr addrspace(10) %4 to ptr addrspace(11)
+  %7 = load ptr addrspace(13), ptr addrspace(11) %6, align 8, !tbaa !5
+  %.elt = getelementptr inbounds { ptr addrspace(10), i64 }, ptr addrspace(10) %5, i64 0, i32 0
+  %.unpack = load ptr addrspace(10), ptr addrspace(10) %.elt, align 8, !tbaa !8
+  %.elt1 = getelementptr inbounds { ptr addrspace(10), i64 }, ptr addrspace(10) %5, i64 0, i32 1
+  %.unpack2 = load i64, ptr addrspace(10) %.elt1, align 8, !tbaa !8
   br label %L26
 
 L26:                                              ; preds = %L26, %top
-  %value_phi5 = phi i64 [ 0, %top ], [ %10, %L26 ]
-  %.repack = getelementptr inbounds { {} addrspace(10)*, i64 }, { {} addrspace(10)*, i64 } addrspace(13)* %9, i64 %value_phi5, i32 0
-  store {} addrspace(10)* %.unpack, {} addrspace(10)* addrspace(13)* %.repack, align 8, !tbaa !10
-  %.repack4 = getelementptr inbounds { {} addrspace(10)*, i64 }, { {} addrspace(10)*, i64 } addrspace(13)* %9, i64 %value_phi5, i32 1
-  store i64 %.unpack2, i64 addrspace(13)* %.repack4, align 8, !tbaa !10
-  %10 = add i64 %value_phi5, 1
+  %value_phi5 = phi i64 [ 0, %top ], [ %8, %L26 ]
+  %.repack = getelementptr inbounds { ptr addrspace(10), i64 }, ptr addrspace(13) %7, i64 %value_phi5, i32 0
+  store ptr addrspace(10) %.unpack, ptr addrspace(13) %.repack, align 8, !tbaa !10
+  %.repack4 = getelementptr inbounds { ptr addrspace(10), i64 }, ptr addrspace(13) %7, i64 %value_phi5, i32 1
+  store i64 %.unpack2, ptr addrspace(13) %.repack4, align 8, !tbaa !10
+  %8 = add i64 %value_phi5, 1
   %.not = icmp eq i64 %value_phi5, %2
   br i1 %.not, label %L44, label %L26
 
 L44:                                              ; preds = %L26
-  ret {} addrspace(10)* null
+  ret ptr addrspace(10) null
 }
 
 attributes #0 = { "target-cpu"="skylake-avx512" "target-features"="+xsaves,+xsavec,+prfchw,+lzcnt,+sahf,+pku,+avx512vl,+avx512bw,+avx512cd,+clwb,+clflushopt,+adx,+avx512dq,+avx512f,+bmi2,+avx2,+bmi,+fsgsbase,+f16c,+avx,+xsave,+aes,+popcnt,+movbe,+sse4.2,+sse4.1,+cx16,+fma,+ssse3,+pclmul,+sse3,-rdrnd,-rtm,-rdseed,-avx512ifma,-avx512pf,-avx512er,-sha,-prefetchwt1,-avx512vbmi,-waitpkg,-avx512vbmi2,-shstk,-gfni,-vaes,-vpclmulqdq,-avx512vnni,-avx512bitalg,-avx512vpopcntdq,-rdpid,-cldemote,-movdiri,-movdir64b,-enqcmd,-avx512vp2intersect,-serialize,-tsxldtrk,-pconfig,-amx-bf16,-amx-tile,-amx-int8,-sse4a,-xop,-lwp,-fma4,-tbm,-mwaitx,-xsaveopt,-clzero,-wbnoinvd,-avx512bf16,-ptwrite,+sse2,+mmx,+fxsr,+64bit,+cx8" }

diff --git a/llvm/test/Transforms/LoopVectorize/X86/reduction-fastmath.ll b/llvm/test/Transforms/LoopVectorize/X86/reduction-fastmath.ll
index aac35b578aa6..f9116c06cdd6 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/reduction-fastmath.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/reduction-fastmath.ll
@@ -4,7 +4,7 @@
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
-define float @reduction_sum_float_ieee(i32 %n, float* %array) {
+define float @reduction_sum_float_ieee(i32 %n, ptr %array) {
 ; CHECK-LABEL: @reduction_sum_float_ieee(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[ENTRY_COND:%.*]] = icmp ne i32 0, 4096
@@ -14,8 +14,8 @@ define float @reduction_sum_float_ieee(i32 %n, float* %array) {
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[IDX:%.*]] = phi i32 [ [[IDX_INC:%.*]], [[LOOP]] ], [ 0, [[LOOP_PREHEADER]] ]
 ; CHECK-NEXT:    [[SUM:%.*]] = phi float [ [[SUM_INC:%.*]], [[LOOP]] ], [ 0.000000e+00, [[LOOP_PREHEADER]] ]
-; CHECK-NEXT:    [[ADDRESS:%.*]] = getelementptr float, float* [[ARRAY:%.*]], i32 [[IDX]]
-; CHECK-NEXT:    [[VALUE:%.*]] = load float, float* [[ADDRESS]], align 4
+; CHECK-NEXT:    [[ADDRESS:%.*]] = getelementptr float, ptr [[ARRAY:%.*]], i32 [[IDX]]
+; CHECK-NEXT:    [[VALUE:%.*]] = load float, ptr [[ADDRESS]], align 4
 ; CHECK-NEXT:    [[SUM_INC]] = fadd float [[SUM]], [[VALUE]]
 ; CHECK-NEXT:    [[IDX_INC]] = add i32 [[IDX]], 1
 ; CHECK-NEXT:    [[BE_COND:%.*]] = icmp ne i32 [[IDX_INC]], 4096
@@ -34,8 +34,8 @@ entry:
 loop:
   %idx = phi i32 [ 0, %entry ], [ %idx.inc, %loop ]
   %sum = phi float [ 0.000000e+00, %entry ], [ %sum.inc, %loop ]
-  %address = getelementptr float, float* %array, i32 %idx
-  %value = load float, float* %address
+  %address = getelementptr float, ptr %array, i32 %idx
+  %value = load float, ptr %address
   %sum.inc = fadd float %sum, %value
   %idx.inc = add i32 %idx, 1
   %be.cond = icmp ne i32 %idx.inc, 4096
@@ -46,7 +46,7 @@ loop.exit:
   ret float %sum.lcssa
 }
 
-define float @reduction_sum_float_fastmath(i32 %n, float* %array) {
+define float @reduction_sum_float_fastmath(i32 %n, ptr %array) {
 ; CHECK-LABEL: @reduction_sum_float_fastmath(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[ENTRY_COND:%.*]] = icmp ne i32 0, 4096
@@ -57,43 +57,41 @@ define float @reduction_sum_float_fastmath(i32 %n, float* %array) {
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x float> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x float> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x float> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x float> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr float, float* [[ARRAY:%.*]], i32 [[TMP0]]
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr float, float* [[ARRAY]], i32 [[TMP1]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr float, float* [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast float* [[TMP4]] to <4 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP5]], align 4
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr float, float* [[TMP2]], i32 4
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast float* [[TMP6]] to <4 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x float>, <4 x float>* [[TMP7]], align 4
-; CHECK-NEXT:    [[TMP8]] = fadd fast <4 x float> [[VEC_PHI]], [[WIDE_LOAD]]
-; CHECK-NEXT:    [[TMP9]] = fadd fast <4 x float> [[VEC_PHI1]], [[WIDE_LOAD2]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr float, ptr [[ARRAY:%.*]], i32 [[TMP0]]
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr float, ptr [[ARRAY]], i32 [[TMP1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr float, ptr [[TMP2]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP4]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr float, ptr [[TMP2]], i32 4
+; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x float>, ptr [[TMP5]], align 4
+; CHECK-NEXT:    [[TMP6]] = fadd fast <4 x float> [[VEC_PHI]], [[WIDE_LOAD]]
+; CHECK-NEXT:    [[TMP7]] = fadd fast <4 x float> [[VEC_PHI1]], [[WIDE_LOAD2]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
-; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], 4096
-; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]]
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], 4096
+; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[BIN_RDX:%.*]] = fadd fast <4 x float> [[TMP9]], [[TMP8]]
-; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[BIN_RDX]])
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = fadd fast <4 x float> [[TMP7]], [[TMP6]]
+; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[BIN_RDX]])
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 4096, 4096
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[LOOP_EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[LOOP_PREHEADER]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[LOOP_PREHEADER]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[LOOP_PREHEADER]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[IDX:%.*]] = phi i32 [ [[IDX_INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; CHECK-NEXT:    [[SUM:%.*]] = phi float [ [[SUM_INC:%.*]], [[LOOP]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[ADDRESS:%.*]] = getelementptr float, float* [[ARRAY]], i32 [[IDX]]
-; CHECK-NEXT:    [[VALUE:%.*]] = load float, float* [[ADDRESS]], align 4
+; CHECK-NEXT:    [[ADDRESS:%.*]] = getelementptr float, ptr [[ARRAY]], i32 [[IDX]]
+; CHECK-NEXT:    [[VALUE:%.*]] = load float, ptr [[ADDRESS]], align 4
 ; CHECK-NEXT:    [[SUM_INC]] = fadd fast float [[SUM]], [[VALUE]]
 ; CHECK-NEXT:    [[IDX_INC]] = add i32 [[IDX]], 1
 ; CHECK-NEXT:    [[BE_COND:%.*]] = icmp ne i32 [[IDX_INC]], 4096
-; CHECK-NEXT:    br i1 [[BE_COND]], label [[LOOP]], label [[LOOP_EXIT_LOOPEXIT]], [[LOOP2:!llvm.loop !.*]]
+; CHECK-NEXT:    br i1 [[BE_COND]], label [[LOOP]], label [[LOOP_EXIT_LOOPEXIT]], !llvm.loop [[LOOP2:![0-9]+]]
 ; CHECK:       loop.exit.loopexit:
-; CHECK-NEXT:    [[SUM_INC_LCSSA:%.*]] = phi float [ [[SUM_INC]], [[LOOP]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[SUM_INC_LCSSA:%.*]] = phi float [ [[SUM_INC]], [[LOOP]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[LOOP_EXIT]]
 ; CHECK:       loop.exit:
 ; CHECK-NEXT:    [[SUM_LCSSA:%.*]] = phi float [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[SUM_INC_LCSSA]], [[LOOP_EXIT_LOOPEXIT]] ]
@@ -106,8 +104,8 @@ entry:
 loop:
   %idx = phi i32 [ 0, %entry ], [ %idx.inc, %loop ]
   %sum = phi float [ 0.000000e+00, %entry ], [ %sum.inc, %loop ]
-  %address = getelementptr float, float* %array, i32 %idx
-  %value = load float, float* %address
+  %address = getelementptr float, ptr %array, i32 %idx
+  %value = load float, ptr %address
   %sum.inc = fadd fast float %sum, %value
   %idx.inc = add i32 %idx, 1
   %be.cond = icmp ne i32 %idx.inc, 4096
@@ -118,7 +116,7 @@ loop.exit:
   ret float %sum.lcssa
 }
 
-define float @reduction_sum_float_only_reassoc(i32 %n, float* %array) {
+define float @reduction_sum_float_only_reassoc(i32 %n, ptr %array) {
 ; CHECK-LABEL: @reduction_sum_float_only_reassoc(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[ENTRY_COND:%.*]] = icmp ne i32 0, 4096
@@ -129,43 +127,41 @@ define float @reduction_sum_float_only_reassoc(i32 %n, float* %array) {
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x float> [ <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x float> [ <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x float> [ <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x float> [ <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr float, float* [[ARRAY:%.*]], i32 [[TMP0]]
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr float, float* [[ARRAY]], i32 [[TMP1]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr float, float* [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast float* [[TMP4]] to <4 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP5]], align 4
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr float, float* [[TMP2]], i32 4
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast float* [[TMP6]] to <4 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x float>, <4 x float>* [[TMP7]], align 4
-; CHECK-NEXT:    [[TMP8]] = fadd reassoc <4 x float> [[VEC_PHI]], [[WIDE_LOAD]]
-; CHECK-NEXT:    [[TMP9]] = fadd reassoc <4 x float> [[VEC_PHI1]], [[WIDE_LOAD2]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr float, ptr [[ARRAY:%.*]], i32 [[TMP0]]
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr float, ptr [[ARRAY]], i32 [[TMP1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr float, ptr [[TMP2]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP4]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr float, ptr [[TMP2]], i32 4
+; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x float>, ptr [[TMP5]], align 4
+; CHECK-NEXT:    [[TMP6]] = fadd reassoc <4 x float> [[VEC_PHI]], [[WIDE_LOAD]]
+; CHECK-NEXT:    [[TMP7]] = fadd reassoc <4 x float> [[VEC_PHI1]], [[WIDE_LOAD2]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
-; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], 4096
-; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP4:!llvm.loop !.*]]
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], 4096
+; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[BIN_RDX:%.*]] = fadd reassoc <4 x float> [[TMP9]], [[TMP8]]
-; CHECK-NEXT:    [[TMP11:%.*]] = call reassoc float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[BIN_RDX]])
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = fadd reassoc <4 x float> [[TMP7]], [[TMP6]]
+; CHECK-NEXT:    [[TMP9:%.*]] = call reassoc float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[BIN_RDX]])
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 4096, 4096
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[LOOP_EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[LOOP_PREHEADER]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ -0.000000e+00, [[LOOP_PREHEADER]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ -0.000000e+00, [[LOOP_PREHEADER]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[IDX:%.*]] = phi i32 [ [[IDX_INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; CHECK-NEXT:    [[SUM:%.*]] = phi float [ [[SUM_INC:%.*]], [[LOOP]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[ADDRESS:%.*]] = getelementptr float, float* [[ARRAY]], i32 [[IDX]]
-; CHECK-NEXT:    [[VALUE:%.*]] = load float, float* [[ADDRESS]], align 4
+; CHECK-NEXT:    [[ADDRESS:%.*]] = getelementptr float, ptr [[ARRAY]], i32 [[IDX]]
+; CHECK-NEXT:    [[VALUE:%.*]] = load float, ptr [[ADDRESS]], align 4
 ; CHECK-NEXT:    [[SUM_INC]] = fadd reassoc float [[SUM]], [[VALUE]]
 ; CHECK-NEXT:    [[IDX_INC]] = add i32 [[IDX]], 1
 ; CHECK-NEXT:    [[BE_COND:%.*]] = icmp ne i32 [[IDX_INC]], 4096
-; CHECK-NEXT:    br i1 [[BE_COND]], label [[LOOP]], label [[LOOP_EXIT_LOOPEXIT]], [[LOOP5:!llvm.loop !.*]]
+; CHECK-NEXT:    br i1 [[BE_COND]], label [[LOOP]], label [[LOOP_EXIT_LOOPEXIT]], !llvm.loop [[LOOP5:![0-9]+]]
 ; CHECK:       loop.exit.loopexit:
-; CHECK-NEXT:    [[SUM_INC_LCSSA:%.*]] = phi float [ [[SUM_INC]], [[LOOP]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[SUM_INC_LCSSA:%.*]] = phi float [ [[SUM_INC]], [[LOOP]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[LOOP_EXIT]]
 ; CHECK:       loop.exit:
 ; CHECK-NEXT:    [[SUM_LCSSA:%.*]] = phi float [ -0.000000e+00, [[ENTRY:%.*]] ], [ [[SUM_INC_LCSSA]], [[LOOP_EXIT_LOOPEXIT]] ]
@@ -178,8 +174,8 @@ entry:
 loop:
   %idx = phi i32 [ 0, %entry ], [ %idx.inc, %loop ]
   %sum = phi float [ -0.000000e+00, %entry ], [ %sum.inc, %loop ]
-  %address = getelementptr float, float* %array, i32 %idx
-  %value = load float, float* %address
+  %address = getelementptr float, ptr %array, i32 %idx
+  %value = load float, ptr %address
   %sum.inc = fadd reassoc float %sum, %value
   %idx.inc = add i32 %idx, 1
   %be.cond = icmp ne i32 %idx.inc, 4096
@@ -190,7 +186,7 @@ loop.exit:
   ret float %sum.lcssa
 }
 
-define float @reduction_sum_float_only_reassoc_and_contract(i32 %n, float* %array) {
+define float @reduction_sum_float_only_reassoc_and_contract(i32 %n, ptr %array) {
 ; CHECK-LABEL: @reduction_sum_float_only_reassoc_and_contract(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[ENTRY_COND:%.*]] = icmp ne i32 0, 4096
@@ -201,43 +197,41 @@ define float @reduction_sum_float_only_reassoc_and_contract(i32 %n, float* %arra
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x float> [ <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x float> [ <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x float> [ <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x float> [ <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr float, float* [[ARRAY:%.*]], i32 [[TMP0]]
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr float, float* [[ARRAY]], i32 [[TMP1]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr float, float* [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast float* [[TMP4]] to <4 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP5]], align 4
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr float, float* [[TMP2]], i32 4
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast float* [[TMP6]] to <4 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x float>, <4 x float>* [[TMP7]], align 4
-; CHECK-NEXT:    [[TMP8]] = fadd reassoc contract <4 x float> [[VEC_PHI]], [[WIDE_LOAD]]
-; CHECK-NEXT:    [[TMP9]] = fadd reassoc contract <4 x float> [[VEC_PHI1]], [[WIDE_LOAD2]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr float, ptr [[ARRAY:%.*]], i32 [[TMP0]]
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr float, ptr [[ARRAY]], i32 [[TMP1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr float, ptr [[TMP2]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP4]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr float, ptr [[TMP2]], i32 4
+; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x float>, ptr [[TMP5]], align 4
+; CHECK-NEXT:    [[TMP6]] = fadd reassoc contract <4 x float> [[VEC_PHI]], [[WIDE_LOAD]]
+; CHECK-NEXT:    [[TMP7]] = fadd reassoc contract <4 x float> [[VEC_PHI1]], [[WIDE_LOAD2]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
-; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], 4096
-; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP6:!llvm.loop !.*]]
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], 4096
+; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[BIN_RDX:%.*]] = fadd reassoc contract <4 x float> [[TMP9]], [[TMP8]]
-; CHECK-NEXT:    [[TMP11:%.*]] = call reassoc contract float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[BIN_RDX]])
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = fadd reassoc contract <4 x float> [[TMP7]], [[TMP6]]
+; CHECK-NEXT:    [[TMP9:%.*]] = call reassoc contract float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[BIN_RDX]])
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 4096, 4096
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[LOOP_EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[LOOP_PREHEADER]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ -0.000000e+00, [[LOOP_PREHEADER]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ -0.000000e+00, [[LOOP_PREHEADER]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[IDX:%.*]] = phi i32 [ [[IDX_INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; CHECK-NEXT:    [[SUM:%.*]] = phi float [ [[SUM_INC:%.*]], [[LOOP]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[ADDRESS:%.*]] = getelementptr float, float* [[ARRAY]], i32 [[IDX]]
-; CHECK-NEXT:    [[VALUE:%.*]] = load float, float* [[ADDRESS]], align 4
+; CHECK-NEXT:    [[ADDRESS:%.*]] = getelementptr float, ptr [[ARRAY]], i32 [[IDX]]
+; CHECK-NEXT:    [[VALUE:%.*]] = load float, ptr [[ADDRESS]], align 4
 ; CHECK-NEXT:    [[SUM_INC]] = fadd reassoc contract float [[SUM]], [[VALUE]]
 ; CHECK-NEXT:    [[IDX_INC]] = add i32 [[IDX]], 1
 ; CHECK-NEXT:    [[BE_COND:%.*]] = icmp ne i32 [[IDX_INC]], 4096
-; CHECK-NEXT:    br i1 [[BE_COND]], label [[LOOP]], label [[LOOP_EXIT_LOOPEXIT]], [[LOOP7:!llvm.loop !.*]]
+; CHECK-NEXT:    br i1 [[BE_COND]], label [[LOOP]], label [[LOOP_EXIT_LOOPEXIT]], !llvm.loop [[LOOP7:![0-9]+]]
 ; CHECK:       loop.exit.loopexit:
-; CHECK-NEXT:    [[SUM_INC_LCSSA:%.*]] = phi float [ [[SUM_INC]], [[LOOP]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[SUM_INC_LCSSA:%.*]] = phi float [ [[SUM_INC]], [[LOOP]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[LOOP_EXIT]]
 ; CHECK:       loop.exit:
 ; CHECK-NEXT:    [[SUM_LCSSA:%.*]] = phi float [ -0.000000e+00, [[ENTRY:%.*]] ], [ [[SUM_INC_LCSSA]], [[LOOP_EXIT_LOOPEXIT]] ]
@@ -250,8 +244,8 @@ entry:
 loop:
   %idx = phi i32 [ 0, %entry ], [ %idx.inc, %loop ]
   %sum = phi float [ -0.000000e+00, %entry ], [ %sum.inc, %loop ]
-  %address = getelementptr float, float* %array, i32 %idx
-  %value = load float, float* %address
+  %address = getelementptr float, ptr %array, i32 %idx
+  %value = load float, ptr %address
   %sum.inc = fadd reassoc contract float %sum, %value
   %idx.inc = add i32 %idx, 1
   %be.cond = icmp ne i32 %idx.inc, 4096
@@ -265,7 +259,7 @@ loop.exit:
 ; New instructions should have the same FMF as the original code.
 ; Note that the select inherits FMF from its fcmp condition.
 
-define float @PR35538(float* nocapture readonly %a, i32 %N) #0 {
+define float @PR35538(ptr nocapture readonly %a, i32 %N) #0 {
 ; CHECK-LABEL: @PR35538(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[CMP12:%.*]] = icmp sgt i32 [[N:%.*]], 0
@@ -280,37 +274,35 @@ define float @PR35538(float* nocapture readonly %a, i32 %N) #0 {
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x float> [ <float -1.000000e+00, float -1.000000e+00, float -1.000000e+00, float -1.000000e+00>, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x float> [ <float -1.000000e+00, float -1.000000e+00, float -1.000000e+00, float -1.000000e+00>, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x float> [ <float -1.000000e+00, float -1.000000e+00, float -1.000000e+00, float -1.000000e+00>, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x float> [ <float -1.000000e+00, float -1.000000e+00, float -1.000000e+00, float -1.000000e+00>, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast float* [[TMP4]] to <4 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP5]], align 4
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 4
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast float* [[TMP6]] to <4 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x float>, <4 x float>* [[TMP7]], align 4
-; CHECK-NEXT:    [[TMP8:%.*]] = fcmp nnan ninf nsz oge <4 x float> [[WIDE_LOAD]], [[VEC_PHI]]
-; CHECK-NEXT:    [[TMP9:%.*]] = fcmp nnan ninf nsz oge <4 x float> [[WIDE_LOAD2]], [[VEC_PHI1]]
-; CHECK-NEXT:    [[TMP10]] = select <4 x i1> [[TMP8]], <4 x float> [[WIDE_LOAD]], <4 x float> [[VEC_PHI]]
-; CHECK-NEXT:    [[TMP11]] = select <4 x i1> [[TMP9]], <4 x float> [[WIDE_LOAD2]], <4 x float> [[VEC_PHI1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP4]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 4
+; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x float>, ptr [[TMP5]], align 4
+; CHECK-NEXT:    [[TMP6:%.*]] = fcmp nnan ninf nsz oge <4 x float> [[WIDE_LOAD]], [[VEC_PHI]]
+; CHECK-NEXT:    [[TMP7:%.*]] = fcmp nnan ninf nsz oge <4 x float> [[WIDE_LOAD2]], [[VEC_PHI1]]
+; CHECK-NEXT:    [[TMP8]] = select <4 x i1> [[TMP6]], <4 x float> [[WIDE_LOAD]], <4 x float> [[VEC_PHI]]
+; CHECK-NEXT:    [[TMP9]] = select <4 x i1> [[TMP7]], <4 x float> [[WIDE_LOAD2]], <4 x float> [[VEC_PHI1]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP8:!llvm.loop !.*]]
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[RDX_MINMAX_CMP:%.*]] = fcmp nnan ninf nsz ogt <4 x float> [[TMP10]], [[TMP11]]
-; CHECK-NEXT:    [[RDX_MINMAX_SELECT:%.*]] = select nnan ninf nsz <4 x i1> [[RDX_MINMAX_CMP]], <4 x float> [[TMP10]], <4 x float> [[TMP11]]
-; CHECK-NEXT:    [[TMP13:%.*]] = call nnan ninf nsz float @llvm.vector.reduce.fmax.v4f32(<4 x float> [[RDX_MINMAX_SELECT]])
+; CHECK-NEXT:    [[RDX_MINMAX_CMP:%.*]] = fcmp nnan ninf nsz ogt <4 x float> [[TMP8]], [[TMP9]]
+; CHECK-NEXT:    [[RDX_MINMAX_SELECT:%.*]] = select nnan ninf nsz <4 x i1> [[RDX_MINMAX_CMP]], <4 x float> [[TMP8]], <4 x float> [[TMP9]]
+; CHECK-NEXT:    [[TMP11:%.*]] = call nnan ninf nsz float @llvm.vector.reduce.fmax.v4f32(<4 x float> [[RDX_MINMAX_SELECT]])
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_LR_PH]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ -1.000000e+00, [[FOR_BODY_LR_PH]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ -1.000000e+00, [[FOR_BODY_LR_PH]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.cond.cleanup.loopexit:
-; CHECK-NEXT:    [[MAX_0__LCSSA:%.*]] = phi float [ [[MAX_0_:%.*]], [[FOR_BODY]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[MAX_0__LCSSA:%.*]] = phi float [ [[MAX_0_:%.*]], [[FOR_BODY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
 ; CHECK:       for.cond.cleanup:
 ; CHECK-NEXT:    [[MAX_0_LCSSA:%.*]] = phi float [ -1.000000e+00, [[ENTRY:%.*]] ], [ [[MAX_0__LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
@@ -318,13 +310,13 @@ define float @PR35538(float* nocapture readonly %a, i32 %N) #0 {
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
 ; CHECK-NEXT:    [[MAX_013:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[MAX_0_]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP14:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[CMP1_INV:%.*]] = fcmp nnan ninf nsz oge float [[TMP14]], [[MAX_013]]
-; CHECK-NEXT:    [[MAX_0_]] = select i1 [[CMP1_INV]], float [[TMP14]], float [[MAX_013]]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP12:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[CMP1_INV:%.*]] = fcmp nnan ninf nsz oge float [[TMP12]], [[MAX_013]]
+; CHECK-NEXT:    [[MAX_0_]] = select i1 [[CMP1_INV]], float [[TMP12]], float [[MAX_013]]
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], [[LOOP9:!llvm.loop !.*]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
 ;
 entry:
   %cmp12 = icmp sgt i32 %N, 0
@@ -341,8 +333,8 @@ for.cond.cleanup:
 for.body:
   %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ]
   %max.013 = phi float [ -1.000000e+00, %for.body.lr.ph ], [ %max.0., %for.body ]
-  %arrayidx = getelementptr inbounds float, float* %a, i64 %indvars.iv
-  %0 = load float, float* %arrayidx, align 4
+  %arrayidx = getelementptr inbounds float, ptr %a, i64 %indvars.iv
+  %0 = load float, ptr %arrayidx, align 4
   %cmp1.inv = fcmp nnan ninf nsz oge float %0, %max.013
   %max.0. = select i1 %cmp1.inv, float %0, float %max.013
   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
@@ -352,7 +344,7 @@ for.body:
 
 ; Same as above, but this time the select already has matching FMF with its condition.
 
-define float @PR35538_more_FMF(float* nocapture readonly %a, i32 %N) #0 {
+define float @PR35538_more_FMF(ptr nocapture readonly %a, i32 %N) #0 {
 ; CHECK-LABEL: @PR35538_more_FMF(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[CMP12:%.*]] = icmp sgt i32 [[N:%.*]], 0
@@ -367,37 +359,35 @@ define float @PR35538_more_FMF(float* nocapture readonly %a, i32 %N) #0 {
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x float> [ <float -1.000000e+00, float -1.000000e+00, float -1.000000e+00, float -1.000000e+00>, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x float> [ <float -1.000000e+00, float -1.000000e+00, float -1.000000e+00, float -1.000000e+00>, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x float> [ <float -1.000000e+00, float -1.000000e+00, float -1.000000e+00, float -1.000000e+00>, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x float> [ <float -1.000000e+00, float -1.000000e+00, float -1.000000e+00, float -1.000000e+00>, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast float* [[TMP4]] to <4 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP5]], align 4
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 4
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast float* [[TMP6]] to <4 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x float>, <4 x float>* [[TMP7]], align 4
-; CHECK-NEXT:    [[TMP8:%.*]] = fcmp nnan ninf oge <4 x float> [[WIDE_LOAD]], [[VEC_PHI]]
-; CHECK-NEXT:    [[TMP9:%.*]] = fcmp nnan ninf oge <4 x float> [[WIDE_LOAD2]], [[VEC_PHI1]]
-; CHECK-NEXT:    [[TMP10]] = select <4 x i1> [[TMP8]], <4 x float> [[WIDE_LOAD]], <4 x float> [[VEC_PHI]]
-; CHECK-NEXT:    [[TMP11]] = select <4 x i1> [[TMP9]], <4 x float> [[WIDE_LOAD2]], <4 x float> [[VEC_PHI1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP4]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 4
+; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x float>, ptr [[TMP5]], align 4
+; CHECK-NEXT:    [[TMP6:%.*]] = fcmp nnan ninf oge <4 x float> [[WIDE_LOAD]], [[VEC_PHI]]
+; CHECK-NEXT:    [[TMP7:%.*]] = fcmp nnan ninf oge <4 x float> [[WIDE_LOAD2]], [[VEC_PHI1]]
+; CHECK-NEXT:    [[TMP8]] = select <4 x i1> [[TMP6]], <4 x float> [[WIDE_LOAD]], <4 x float> [[VEC_PHI]]
+; CHECK-NEXT:    [[TMP9]] = select <4 x i1> [[TMP7]], <4 x float> [[WIDE_LOAD2]], <4 x float> [[VEC_PHI1]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP10:!llvm.loop !.*]]
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[RDX_MINMAX_CMP:%.*]] = fcmp nnan ninf ogt <4 x float> [[TMP10]], [[TMP11]]
-; CHECK-NEXT:    [[RDX_MINMAX_SELECT:%.*]] = select nnan ninf <4 x i1> [[RDX_MINMAX_CMP]], <4 x float> [[TMP10]], <4 x float> [[TMP11]]
-; CHECK-NEXT:    [[TMP13:%.*]] = call nnan ninf float @llvm.vector.reduce.fmax.v4f32(<4 x float> [[RDX_MINMAX_SELECT]])
+; CHECK-NEXT:    [[RDX_MINMAX_CMP:%.*]] = fcmp nnan ninf ogt <4 x float> [[TMP8]], [[TMP9]]
+; CHECK-NEXT:    [[RDX_MINMAX_SELECT:%.*]] = select nnan ninf <4 x i1> [[RDX_MINMAX_CMP]], <4 x float> [[TMP8]], <4 x float> [[TMP9]]
+; CHECK-NEXT:    [[TMP11:%.*]] = call nnan ninf float @llvm.vector.reduce.fmax.v4f32(<4 x float> [[RDX_MINMAX_SELECT]])
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_LR_PH]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ -1.000000e+00, [[FOR_BODY_LR_PH]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ -1.000000e+00, [[FOR_BODY_LR_PH]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.cond.cleanup.loopexit:
-; CHECK-NEXT:    [[MAX_0__LCSSA:%.*]] = phi float [ [[MAX_0_:%.*]], [[FOR_BODY]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[MAX_0__LCSSA:%.*]] = phi float [ [[MAX_0_:%.*]], [[FOR_BODY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
 ; CHECK:       for.cond.cleanup:
 ; CHECK-NEXT:    [[MAX_0_LCSSA:%.*]] = phi float [ -1.000000e+00, [[ENTRY:%.*]] ], [ [[MAX_0__LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
@@ -405,13 +395,13 @@ define float @PR35538_more_FMF(float* nocapture readonly %a, i32 %N) #0 {
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
 ; CHECK-NEXT:    [[MAX_013:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[MAX_0_]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP14:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[CMP1_INV:%.*]] = fcmp nnan ninf oge float [[TMP14]], [[MAX_013]]
-; CHECK-NEXT:    [[MAX_0_]] = select nnan ninf i1 [[CMP1_INV]], float [[TMP14]], float [[MAX_013]]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP12:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[CMP1_INV:%.*]] = fcmp nnan ninf oge float [[TMP12]], [[MAX_013]]
+; CHECK-NEXT:    [[MAX_0_]] = select nnan ninf i1 [[CMP1_INV]], float [[TMP12]], float [[MAX_013]]
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], [[LOOP11:!llvm.loop !.*]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
 ;
 entry:
   %cmp12 = icmp sgt i32 %N, 0
@@ -428,8 +418,8 @@ for.cond.cleanup:
 for.body:
   %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ]
   %max.013 = phi float [ -1.000000e+00, %for.body.lr.ph ], [ %max.0., %for.body ]
-  %arrayidx = getelementptr inbounds float, float* %a, i64 %indvars.iv
-  %0 = load float, float* %arrayidx, align 4
+  %arrayidx = getelementptr inbounds float, ptr %a, i64 %indvars.iv
+  %0 = load float, ptr %arrayidx, align 4
   %cmp1.inv = fcmp nnan ninf oge float %0, %max.013
   %max.0. = select nnan ninf i1 %cmp1.inv, float %0, float %max.013
   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1

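The hunks above show the mechanical shape of the whole conversion: an opaque pointer has no pointee type, so the bitcast from float* to <4 x float>* that fed each wide load is simply deleted, and every %TMPn value after a deleted cast shifts down by one, which is why the regenerated CHECK lines renumber. A minimal before/after sketch (illustrative IR with made-up value names, not copied from any single test):

    ; with typed pointers
    %addr = getelementptr float, float* %array, i32 %idx
    %cast = bitcast float* %addr to <4 x float>*
    %wide = load <4 x float>, <4 x float>* %cast, align 4

    ; with opaque pointers the bitcast disappears
    %addr = getelementptr float, ptr %array, i32 %idx
    %wide = load <4 x float>, ptr %addr, align 4
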
diff --git a/llvm/test/Transforms/LoopVectorize/X86/strided_load_cost.ll b/llvm/test/Transforms/LoopVectorize/X86/strided_load_cost.ll
index d6c43e61e09b..e27e44b4fd91 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/strided_load_cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/strided_load_cost.ll
@@ -8,7 +8,7 @@ target triple = "x86_64-unknown-linux-gnu"
 ; even if it contains scalarized load (gather on AVX2)
 
 ; Function Attrs: norecurse nounwind readonly uwtable
-define i32 @matrix_row_col([100 x i32]* nocapture readonly %data, i32 %i, i32 %j) local_unnamed_addr #0 {
+define i32 @matrix_row_col(ptr nocapture readonly %data, i32 %i, i32 %j) local_unnamed_addr #0 {
 ; CHECK-LABEL: @matrix_row_col(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[IDXPROM:%.*]] = sext i32 [[I:%.*]] to i64
@@ -18,7 +18,7 @@ define i32 @matrix_row_col([100 x i32]* nocapture readonly %data, i32 %i, i32 %j
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP37:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP36:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 2
@@ -27,59 +27,58 @@ define i32 @matrix_row_col([100 x i32]* nocapture readonly %data, i32 %i, i32 %j
 ; CHECK-NEXT:    [[TMP5:%.*]] = add i64 [[INDEX]], 5
 ; CHECK-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 6
 ; CHECK-NEXT:    [[TMP7:%.*]] = add i64 [[INDEX]], 7
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [100 x i32], [100 x i32]* [[DATA:%.*]], i64 [[IDXPROM]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP8]], i32 0
-; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <8 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP10]], align 4, !tbaa [[TBAA1:![0-9]+]]
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [100 x i32], [100 x i32]* [[DATA]], i64 [[TMP0]], i64 [[IDXPROM5]]
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [100 x i32], [100 x i32]* [[DATA]], i64 [[TMP1]], i64 [[IDXPROM5]]
-; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [100 x i32], [100 x i32]* [[DATA]], i64 [[TMP2]], i64 [[IDXPROM5]]
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [100 x i32], [100 x i32]* [[DATA]], i64 [[TMP3]], i64 [[IDXPROM5]]
-; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [100 x i32], [100 x i32]* [[DATA]], i64 [[TMP4]], i64 [[IDXPROM5]]
-; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [100 x i32], [100 x i32]* [[DATA]], i64 [[TMP5]], i64 [[IDXPROM5]]
-; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [100 x i32], [100 x i32]* [[DATA]], i64 [[TMP6]], i64 [[IDXPROM5]]
-; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [100 x i32], [100 x i32]* [[DATA]], i64 [[TMP7]], i64 [[IDXPROM5]]
-; CHECK-NEXT:    [[TMP19:%.*]] = load i32, i32* [[TMP11]], align 4, !tbaa [[TBAA1]]
-; CHECK-NEXT:    [[TMP20:%.*]] = load i32, i32* [[TMP12]], align 4, !tbaa [[TBAA1]]
-; CHECK-NEXT:    [[TMP21:%.*]] = load i32, i32* [[TMP13]], align 4, !tbaa [[TBAA1]]
-; CHECK-NEXT:    [[TMP22:%.*]] = load i32, i32* [[TMP14]], align 4, !tbaa [[TBAA1]]
-; CHECK-NEXT:    [[TMP23:%.*]] = load i32, i32* [[TMP15]], align 4, !tbaa [[TBAA1]]
-; CHECK-NEXT:    [[TMP24:%.*]] = load i32, i32* [[TMP16]], align 4, !tbaa [[TBAA1]]
-; CHECK-NEXT:    [[TMP25:%.*]] = load i32, i32* [[TMP17]], align 4, !tbaa [[TBAA1]]
-; CHECK-NEXT:    [[TMP26:%.*]] = load i32, i32* [[TMP18]], align 4, !tbaa [[TBAA1]]
-; CHECK-NEXT:    [[TMP27:%.*]] = insertelement <8 x i32> poison, i32 [[TMP19]], i32 0
-; CHECK-NEXT:    [[TMP28:%.*]] = insertelement <8 x i32> [[TMP27]], i32 [[TMP20]], i32 1
-; CHECK-NEXT:    [[TMP29:%.*]] = insertelement <8 x i32> [[TMP28]], i32 [[TMP21]], i32 2
-; CHECK-NEXT:    [[TMP30:%.*]] = insertelement <8 x i32> [[TMP29]], i32 [[TMP22]], i32 3
-; CHECK-NEXT:    [[TMP31:%.*]] = insertelement <8 x i32> [[TMP30]], i32 [[TMP23]], i32 4
-; CHECK-NEXT:    [[TMP32:%.*]] = insertelement <8 x i32> [[TMP31]], i32 [[TMP24]], i32 5
-; CHECK-NEXT:    [[TMP33:%.*]] = insertelement <8 x i32> [[TMP32]], i32 [[TMP25]], i32 6
-; CHECK-NEXT:    [[TMP34:%.*]] = insertelement <8 x i32> [[TMP33]], i32 [[TMP26]], i32 7
-; CHECK-NEXT:    [[TMP35:%.*]] = mul nsw <8 x i32> [[TMP34]], [[WIDE_LOAD]]
-; CHECK-NEXT:    [[TMP36:%.*]] = add <8 x i32> [[VEC_PHI]], <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT:    [[TMP37]] = add <8 x i32> [[TMP36]], [[TMP35]]
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA:%.*]], i64 [[IDXPROM]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[TMP9]], align 4, !tbaa [[TBAA1:![0-9]+]]
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP0]], i64 [[IDXPROM5]]
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP1]], i64 [[IDXPROM5]]
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP2]], i64 [[IDXPROM5]]
+; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP3]], i64 [[IDXPROM5]]
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP4]], i64 [[IDXPROM5]]
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP5]], i64 [[IDXPROM5]]
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP6]], i64 [[IDXPROM5]]
+; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP7]], i64 [[IDXPROM5]]
+; CHECK-NEXT:    [[TMP18:%.*]] = load i32, ptr [[TMP10]], align 4, !tbaa [[TBAA1]]
+; CHECK-NEXT:    [[TMP19:%.*]] = load i32, ptr [[TMP11]], align 4, !tbaa [[TBAA1]]
+; CHECK-NEXT:    [[TMP20:%.*]] = load i32, ptr [[TMP12]], align 4, !tbaa [[TBAA1]]
+; CHECK-NEXT:    [[TMP21:%.*]] = load i32, ptr [[TMP13]], align 4, !tbaa [[TBAA1]]
+; CHECK-NEXT:    [[TMP22:%.*]] = load i32, ptr [[TMP14]], align 4, !tbaa [[TBAA1]]
+; CHECK-NEXT:    [[TMP23:%.*]] = load i32, ptr [[TMP15]], align 4, !tbaa [[TBAA1]]
+; CHECK-NEXT:    [[TMP24:%.*]] = load i32, ptr [[TMP16]], align 4, !tbaa [[TBAA1]]
+; CHECK-NEXT:    [[TMP25:%.*]] = load i32, ptr [[TMP17]], align 4, !tbaa [[TBAA1]]
+; CHECK-NEXT:    [[TMP26:%.*]] = insertelement <8 x i32> poison, i32 [[TMP18]], i32 0
+; CHECK-NEXT:    [[TMP27:%.*]] = insertelement <8 x i32> [[TMP26]], i32 [[TMP19]], i32 1
+; CHECK-NEXT:    [[TMP28:%.*]] = insertelement <8 x i32> [[TMP27]], i32 [[TMP20]], i32 2
+; CHECK-NEXT:    [[TMP29:%.*]] = insertelement <8 x i32> [[TMP28]], i32 [[TMP21]], i32 3
+; CHECK-NEXT:    [[TMP30:%.*]] = insertelement <8 x i32> [[TMP29]], i32 [[TMP22]], i32 4
+; CHECK-NEXT:    [[TMP31:%.*]] = insertelement <8 x i32> [[TMP30]], i32 [[TMP23]], i32 5
+; CHECK-NEXT:    [[TMP32:%.*]] = insertelement <8 x i32> [[TMP31]], i32 [[TMP24]], i32 6
+; CHECK-NEXT:    [[TMP33:%.*]] = insertelement <8 x i32> [[TMP32]], i32 [[TMP25]], i32 7
+; CHECK-NEXT:    [[TMP34:%.*]] = mul nsw <8 x i32> [[TMP33]], [[WIDE_LOAD]]
+; CHECK-NEXT:    [[TMP35:%.*]] = add <8 x i32> [[VEC_PHI]], <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
+; CHECK-NEXT:    [[TMP36]] = add <8 x i32> [[TMP35]], [[TMP34]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT:    [[TMP38:%.*]] = icmp eq i64 [[INDEX_NEXT]], 96
-; CHECK-NEXT:    br i1 [[TMP38]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT:    [[TMP37:%.*]] = icmp eq i64 [[INDEX_NEXT]], 96
+; CHECK-NEXT:    br i1 [[TMP37]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP39:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP37]])
+; CHECK-NEXT:    [[TMP38:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP36]])
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 100, 96
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 96, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP39]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP38]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.cond.cleanup:
-; CHECK-NEXT:    [[ADD7_LCSSA:%.*]] = phi i32 [ [[ADD7:%.*]], [[FOR_BODY]] ], [ [[TMP39]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[ADD7_LCSSA:%.*]] = phi i32 [ [[ADD7:%.*]], [[FOR_BODY]] ], [ [[TMP38]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    ret i32 [[ADD7_LCSSA]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
 ; CHECK-NEXT:    [[SUM_015:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD7]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [100 x i32], [100 x i32]* [[DATA]], i64 [[IDXPROM]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP40:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4, !tbaa [[TBAA1]]
-; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [100 x i32], [100 x i32]* [[DATA]], i64 [[INDVARS_IV]], i64 [[IDXPROM5]]
-; CHECK-NEXT:    [[TMP41:%.*]] = load i32, i32* [[ARRAYIDX6]], align 4, !tbaa [[TBAA1]]
-; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP41]], [[TMP40]]
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[IDXPROM]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP39:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4, !tbaa [[TBAA1]]
+; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[INDVARS_IV]], i64 [[IDXPROM5]]
+; CHECK-NEXT:    [[TMP40:%.*]] = load i32, ptr [[ARRAYIDX6]], align 4, !tbaa [[TBAA1]]
+; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP40]], [[TMP39]]
 ; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[SUM_015]], 4
 ; CHECK-NEXT:    [[ADD7]] = add i32 [[ADD]], [[MUL]]
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
@@ -98,11 +97,11 @@ entry:
   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
   %sum.015 = phi i32 [ 0, %entry ], [ %add7, %for.body ]
   ; first consecutive load as vector load
-  %arrayidx2 = getelementptr inbounds [100 x i32], [100 x i32]* %data, i64 %idxprom, i64 %indvars.iv
-  %0 = load i32, i32* %arrayidx2, align 4, !tbaa !1
+  %arrayidx2 = getelementptr inbounds [100 x i32], ptr %data, i64 %idxprom, i64 %indvars.iv
+  %0 = load i32, ptr %arrayidx2, align 4, !tbaa !1
   ; second strided load scalarized
-  %arrayidx6 = getelementptr inbounds [100 x i32], [100 x i32]* %data, i64 %indvars.iv, i64 %idxprom5
-  %1 = load i32, i32* %arrayidx6, align 4, !tbaa !1
+  %arrayidx6 = getelementptr inbounds [100 x i32], ptr %data, i64 %indvars.iv, i64 %idxprom5
+  %1 = load i32, ptr %arrayidx6, align 4, !tbaa !1
   %mul = mul nsw i32 %1, %0
   %add = add i32 %sum.015, 4
   %add7 = add i32 %add, %mul

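In strided_load_cost.ll the indexing itself is untouched by the conversion: getelementptr carries its source element type as an explicit first operand, so the two-dimensional array access keeps its [100 x i32] type and only the pointer operand becomes an opaque ptr. Roughly (illustrative operand names):

    ; typed pointers
    %g = getelementptr inbounds [100 x i32], [100 x i32]* %data, i64 %row, i64 %col
    ; opaque pointers
    %g = getelementptr inbounds [100 x i32], ptr %data, i64 %row, i64 %col
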
diff --git a/llvm/test/Transforms/LoopVectorize/X86/tail_loop_folding.ll b/llvm/test/Transforms/LoopVectorize/X86/tail_loop_folding.ll
index 103226e4af9c..d5a649697a03 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/tail_loop_folding.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/tail_loop_folding.ll
@@ -5,7 +5,7 @@
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
-define dso_local void @tail_folding_enabled(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) local_unnamed_addr #0 {
+define dso_local void @tail_folding_enabled(ptr noalias nocapture %A, ptr noalias nocapture readonly %B, ptr noalias nocapture readonly %C) local_unnamed_addr #0 {
 ; CHECK-LABEL: @tail_folding_enabled(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
@@ -18,22 +18,19 @@ define dso_local void @tail_folding_enabled(i32* noalias nocapture %A, i32* noal
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer
 ; CHECK-NEXT:    [[VEC_IV:%.*]] = add <8 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
 ; CHECK-NEXT:    [[TMP1:%.*]] = icmp ule <8 x i64> [[VEC_IV]], <i64 429, i64 429, i64 429, i64 429, i64 429, i64 429, i64 429, i64 429>
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <8 x i32>*
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP4]], i32 4, <8 x i1> [[TMP1]], <8 x i32> poison)
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i32 0
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <8 x i32>*
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP7]], i32 4, <8 x i1> [[TMP1]], <8 x i32> poison)
-; CHECK-NEXT:    [[TMP8:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD1]], [[WIDE_MASKED_LOAD]]
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP9]], i32 0
-; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <8 x i32>*
-; CHECK-NEXT:    call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP8]], <8 x i32>* [[TMP11]], i32 4, <8 x i1> [[TMP1]])
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 0
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[TMP3]], i32 4, <8 x i1> [[TMP1]], <8 x i32> poison)
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[C:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[TMP5]], i32 4, <8 x i1> [[TMP1]], <8 x i32> poison)
+; CHECK-NEXT:    [[TMP6:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD1]], [[WIDE_MASKED_LOAD]]
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
+; CHECK-NEXT:    call void @llvm.masked.store.v8i32.p0(<8 x i32> [[TMP6]], ptr [[TMP8]], i32 4, <8 x i1> [[TMP1]])
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 8
-; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 432
-; CHECK-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 432
+; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -43,16 +40,16 @@ define dso_local void @tail_folding_enabled(i32* noalias nocapture %A, i32* noal
 ; CHECK-NEXT:    ret void
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP13:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP14:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4
-; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP14]], [[TMP13]]
-; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    store i32 [[ADD]], i32* [[ARRAYIDX4]], align 4
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP10:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP11:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP11]], [[TMP10]]
+; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    store i32 [[ADD]], ptr [[ARRAYIDX4]], align 4
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 430
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
 ;
 entry:
   br label %for.body
@@ -62,13 +59,13 @@ for.cond.cleanup:
 
 for.body:
   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
-  %arrayidx = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
-  %0 = load i32, i32* %arrayidx, align 4
-  %arrayidx2 = getelementptr inbounds i32, i32* %C, i64 %indvars.iv
-  %1 = load i32, i32* %arrayidx2, align 4
+  %arrayidx = getelementptr inbounds i32, ptr %B, i64 %indvars.iv
+  %0 = load i32, ptr %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, ptr %C, i64 %indvars.iv
+  %1 = load i32, ptr %arrayidx2, align 4
   %add = add nsw i32 %1, %0
-  %arrayidx4 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
-  store i32 %add, i32* %arrayidx4, align 4
+  %arrayidx4 = getelementptr inbounds i32, ptr %A, i64 %indvars.iv
+  store i32 %add, ptr %arrayidx4, align 4
   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
   %exitcond = icmp eq i64 %indvars.iv.next, 430
   br i1 %exitcond, label %for.cond.cleanup, label %for.body, !llvm.loop !6
@@ -76,7 +73,7 @@ for.body:
 
 ; Marking function as optsize turns tail folding on, as if explicit tail folding
 ; flag was enabled.
-define dso_local void @tail_folding_disabled(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) local_unnamed_addr #0 {
+define dso_local void @tail_folding_disabled(ptr noalias nocapture %A, ptr noalias nocapture readonly %B, ptr noalias nocapture readonly %C) local_unnamed_addr #0 {
 ; CHECK-LABEL: @tail_folding_disabled(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
@@ -89,22 +86,19 @@ define dso_local void @tail_folding_disabled(i32* noalias nocapture %A, i32* noa
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer
 ; CHECK-NEXT:    [[VEC_IV:%.*]] = add <8 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
 ; CHECK-NEXT:    [[TMP1:%.*]] = icmp ule <8 x i64> [[VEC_IV]], <i64 429, i64 429, i64 429, i64 429, i64 429, i64 429, i64 429, i64 429>
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <8 x i32>*
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP4]], i32 4, <8 x i1> [[TMP1]], <8 x i32> poison)
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i32 0
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <8 x i32>*
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP7]], i32 4, <8 x i1> [[TMP1]], <8 x i32> poison)
-; CHECK-NEXT:    [[TMP8:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD1]], [[WIDE_MASKED_LOAD]]
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP9]], i32 0
-; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <8 x i32>*
-; CHECK-NEXT:    call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP8]], <8 x i32>* [[TMP11]], i32 4, <8 x i1> [[TMP1]])
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 0
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[TMP3]], i32 4, <8 x i1> [[TMP1]], <8 x i32> poison)
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[C:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[TMP5]], i32 4, <8 x i1> [[TMP1]], <8 x i32> poison)
+; CHECK-NEXT:    [[TMP6:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD1]], [[WIDE_MASKED_LOAD]]
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
+; CHECK-NEXT:    call void @llvm.masked.store.v8i32.p0(<8 x i32> [[TMP6]], ptr [[TMP8]], i32 4, <8 x i1> [[TMP1]])
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 8
-; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 432
-; CHECK-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 432
+; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -114,13 +108,13 @@ define dso_local void @tail_folding_disabled(i32* noalias nocapture %A, i32* noa
 ; CHECK-NEXT:    ret void
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP13:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP14:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4
-; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP14]], [[TMP13]]
-; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    store i32 [[ADD]], i32* [[ARRAYIDX4]], align 4
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP10:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP11:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP11]], [[TMP10]]
+; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    store i32 [[ADD]], ptr [[ARRAYIDX4]], align 4
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 430
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
@@ -133,13 +127,13 @@ for.cond.cleanup:
 
 for.body:
   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
-  %arrayidx = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
-  %0 = load i32, i32* %arrayidx, align 4
-  %arrayidx2 = getelementptr inbounds i32, i32* %C, i64 %indvars.iv
-  %1 = load i32, i32* %arrayidx2, align 4
+  %arrayidx = getelementptr inbounds i32, ptr %B, i64 %indvars.iv
+  %0 = load i32, ptr %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, ptr %C, i64 %indvars.iv
+  %1 = load i32, ptr %arrayidx2, align 4
   %add = add nsw i32 %1, %0
-  %arrayidx4 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
-  store i32 %add, i32* %arrayidx4, align 4
+  %arrayidx4 = getelementptr inbounds i32, ptr %A, i64 %indvars.iv
+  store i32 %add, ptr %arrayidx4, align 4
   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
   %exitcond = icmp eq i64 %indvars.iv.next, 430
   br i1 %exitcond, label %for.cond.cleanup, label %for.body, !llvm.loop !10
@@ -155,7 +149,7 @@ for.body:
 ; }
 ;
 
-define i32 @reduction_i32(i32* nocapture readonly %A, i32* nocapture readonly %B, i32 %N) #0 {
+define i32 @reduction_i32(ptr nocapture readonly %A, ptr nocapture readonly %B, i32 %N) #0 {
 ; CHECK-LABEL: @reduction_i32(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[N:%.*]], -1
@@ -172,48 +166,46 @@ define i32 @reduction_i32(i32* nocapture readonly %A, i32* nocapture readonly %B
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 0
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i64> poison, i64 [[INDEX]], i32 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT1]], <8 x i64> poison, <8 x i32> zeroinitializer
 ; CHECK-NEXT:    [[VEC_IV:%.*]] = add <8 x i64> [[BROADCAST_SPLAT2]], <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
 ; CHECK-NEXT:    [[TMP4:%.*]] = icmp ule <8 x i64> [[VEC_IV]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP3]]
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i32 0
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <8 x i32>*
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP7]], i32 4, <8 x i1> [[TMP4]], <8 x i32> poison)
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[TMP3]]
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP8]], i32 0
-; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <8 x i32>*
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD3:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP10]], i32 4, <8 x i1> [[TMP4]], <8 x i32> poison)
-; CHECK-NEXT:    [[TMP11:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD3]], [[WIDE_MASKED_LOAD]]
-; CHECK-NEXT:    [[TMP12]] = add <8 x i32> [[TMP11]], [[VEC_PHI]]
-; CHECK-NEXT:    [[TMP13:%.*]] = select <8 x i1> [[TMP4]], <8 x i32> [[TMP12]], <8 x i32> [[VEC_PHI]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[TMP6]], i32 4, <8 x i1> [[TMP4]], <8 x i32> poison)
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD3:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[TMP8]], i32 4, <8 x i1> [[TMP4]], <8 x i32> poison)
+; CHECK-NEXT:    [[TMP9:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD3]], [[WIDE_MASKED_LOAD]]
+; CHECK-NEXT:    [[TMP10]] = add <8 x i32> [[TMP9]], [[VEC_PHI]]
+; CHECK-NEXT:    [[TMP11:%.*]] = select <8 x i1> [[TMP4]], <8 x i32> [[TMP10]], <8 x i32> [[VEC_PHI]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 8
-; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP13]])
+; CHECK-NEXT:    [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP11]])
 ; CHECK-NEXT:    br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP15]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; CHECK-NEXT:    [[SUM_0:%.*]] = phi i32 [ [[SUM_1:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT:    [[ARRAYIDXA:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP16:%.*]] = load i32, i32* [[ARRAYIDXA]], align 4
-; CHECK-NEXT:    [[ARRAYIDXB:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP17:%.*]] = load i32, i32* [[ARRAYIDXB]], align 4
-; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP17]], [[TMP16]]
+; CHECK-NEXT:    [[ARRAYIDXA:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP14:%.*]] = load i32, ptr [[ARRAYIDXA]], align 4
+; CHECK-NEXT:    [[ARRAYIDXB:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP15:%.*]] = load i32, ptr [[ARRAYIDXB]], align 4
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP15]], [[TMP14]]
 ; CHECK-NEXT:    [[SUM_1]] = add nuw nsw i32 [[ADD]], [[SUM_0]]
 ; CHECK-NEXT:    [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]]
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
 ; CHECK:       for.cond.cleanup:
-; CHECK-NEXT:    [[SUM_1_LCSSA:%.*]] = phi i32 [ [[SUM_1]], [[FOR_BODY]] ], [ [[TMP15]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[SUM_1_LCSSA:%.*]] = phi i32 [ [[SUM_1]], [[FOR_BODY]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    ret i32 [[SUM_1_LCSSA]]
 ;
 entry:
@@ -223,10 +215,10 @@ for.body:
   %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
   %sum.0 = phi i32 [ %sum.1, %for.body ], [ 0, %entry ]
   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %arrayidxA = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
-  %0 = load i32, i32* %arrayidxA, align 4
-  %arrayidxB = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
-  %1 = load i32, i32* %arrayidxB, align 4
+  %arrayidxA = getelementptr inbounds i32, ptr %A, i64 %indvars.iv
+  %0 = load i32, ptr %arrayidxA, align 4
+  %arrayidxB = getelementptr inbounds i32, ptr %B, i64 %indvars.iv
+  %1 = load i32, ptr %arrayidxB, align 4
   %add = add nsw i32 %1, %0
   %sum.1 = add nuw nsw i32 %add, %sum.0
   %lftr.wideiv = trunc i64 %indvars.iv.next to i32
@@ -237,12 +229,6 @@ for.cond.cleanup:
   ret i32 %sum.1
 }
 
-; CHECK:      [[LOOP0]] = distinct !{[[LOOP0]], [[MD_IS_VEC:![0-9]+]]}
-; CHECK-NEXT: [[MD_IS_VEC:![0-9]+]] = !{!"llvm.loop.isvectorized", i32 1}
-; CHECK-NEXT: [[LOOP3]] = distinct !{[[LOOP3]], [[MD_RT_UNROLL_DIS:![0-9]+]], [[MD_IS_VEC]]}
-; CHECK-NEXT: [[MD_RT_UNROLL_DIS]] = !{!"llvm.loop.unroll.runtime.disable"}
-; CHECK-NEXT: [[LOOP4]] = distinct !{[[LOOP4]], [[MD_IS_VEC]]}
-; CHECK-NEXT: [[LOOP5]] = distinct !{[[LOOP5]], [[MD_RT_UNROLL_DIS]], [[MD_IS_VEC]]}
 
 attributes #0 = { nounwind optsize uwtable "target-cpu"="core-avx2" "target-features"="+avx,+avx2" }
 

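In tail_loop_folding.ll the conversion also renames the masked load/store intrinsics: they are overloaded on their pointer argument, and with a bare ptr the mangled suffix collapses from p0v8i32 to p0. The matching declarations would read as follows (written out here for reference, not part of the patch):

    declare <8 x i32> @llvm.masked.load.v8i32.p0(ptr, i32, <8 x i1>, <8 x i32>)
    declare void @llvm.masked.store.v8i32.p0(<8 x i32>, ptr, i32, <8 x i1>)
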
diff --git a/llvm/test/Transforms/LoopVectorize/X86/uniform_mem_op.ll b/llvm/test/Transforms/LoopVectorize/X86/uniform_mem_op.ll
index d49047c5ff76..090d5d1371a3 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/uniform_mem_op.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/uniform_mem_op.ll
@@ -8,7 +8,7 @@
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
-define i32 @uniform_load(i32* align(4) %addr) {
+define i32 @uniform_load(ptr align(4) %addr) {
 ; CHECK-LABEL: @uniform_load(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
@@ -16,7 +16,7 @@ define i32 @uniform_load(i32* align(4) %addr) {
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[ADDR:%.*]], align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ADDR:%.*]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
 ; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
 ; CHECK-NEXT:    br i1 [[TMP1]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -28,7 +28,7 @@ define i32 @uniform_load(i32* align(4) %addr) {
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[LOAD:%.*]] = load i32, i32* [[ADDR]], align 4
+; CHECK-NEXT:    [[LOAD:%.*]] = load i32, ptr [[ADDR]], align 4
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV]], 4096
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
@@ -41,7 +41,7 @@ entry:
 
 for.body:
   %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ]
-  %load = load i32, i32* %addr
+  %load = load i32, ptr %addr
   %iv.next = add nuw nsw i64 %iv, 1
   %exitcond = icmp eq i64 %iv, 4096
   br i1 %exitcond, label %loopexit, label %for.body
@@ -50,7 +50,7 @@ loopexit:
   ret i32 %load
 }
 
-define i32 @uniform_load2(i32* align(4) %addr) {
+define i32 @uniform_load2(ptr align(4) %addr) {
 ; CHECK-LABEL: @uniform_load2(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
@@ -62,7 +62,7 @@ define i32 @uniform_load2(i32* align(4) %addr) {
 ; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[ADDR:%.*]], align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ADDR:%.*]], align 4
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT8:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i32 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT9:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT8]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP1]] = add <4 x i32> [[VEC_PHI]], [[BROADCAST_SPLAT9]]
@@ -86,7 +86,7 @@ define i32 @uniform_load2(i32* align(4) %addr) {
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; CHECK-NEXT:    [[ACCUM:%.*]] = phi i32 [ [[ACCUM_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[LOAD:%.*]] = load i32, i32* [[ADDR]], align 4
+; CHECK-NEXT:    [[LOAD:%.*]] = load i32, ptr [[ADDR]], align 4
 ; CHECK-NEXT:    [[ACCUM_NEXT]] = add i32 [[ACCUM]], [[LOAD]]
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV]], 4096
@@ -101,7 +101,7 @@ entry:
 for.body:
   %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ]
   %accum = phi i32 [%accum.next, %for.body], [0, %entry]
-  %load = load i32, i32* %addr
+  %load = load i32, ptr %addr
   %accum.next = add i32 %accum, %load
   %iv.next = add nuw nsw i64 %iv, 1
   %exitcond = icmp eq i64 %iv, 4096
@@ -111,7 +111,7 @@ loopexit:
   ret i32 %accum.next
 }
 
-define i32 @uniform_address(i32* align(4) %addr, i32 %byte_offset) {
+define i32 @uniform_address(ptr align(4) %addr, i32 %byte_offset) {
 ; CHECK-LABEL: @uniform_address(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
@@ -123,14 +123,14 @@ define i32 @uniform_address(i32* align(4) %addr, i32 %byte_offset) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = udiv i32 [[BYTE_OFFSET]], 4
 ; CHECK-NEXT:    [[TMP2:%.*]] = udiv i32 [[BYTE_OFFSET]], 4
 ; CHECK-NEXT:    [[TMP3:%.*]] = udiv i32 [[BYTE_OFFSET]], 4
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i32, i32* [[ADDR:%.*]], i32 [[TMP0]]
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i32, i32* [[ADDR]], i32 [[TMP1]]
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i32, i32* [[ADDR]], i32 [[TMP2]]
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i32, i32* [[ADDR]], i32 [[TMP3]]
-; CHECK-NEXT:    [[TMP8:%.*]] = load i32, i32* [[TMP4]], align 4
-; CHECK-NEXT:    [[TMP9:%.*]] = load i32, i32* [[TMP5]], align 4
-; CHECK-NEXT:    [[TMP10:%.*]] = load i32, i32* [[TMP6]], align 4
-; CHECK-NEXT:    [[TMP11:%.*]] = load i32, i32* [[TMP7]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i32, ptr [[ADDR:%.*]], i32 [[TMP0]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i32, ptr [[ADDR]], i32 [[TMP1]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i32, ptr [[ADDR]], i32 [[TMP2]]
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i32, ptr [[ADDR]], i32 [[TMP3]]
+; CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr [[TMP4]], align 4
+; CHECK-NEXT:    [[TMP9:%.*]] = load i32, ptr [[TMP5]], align 4
+; CHECK-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP6]], align 4
+; CHECK-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP7]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
 ; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
 ; CHECK-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
@@ -143,8 +143,8 @@ define i32 @uniform_address(i32* align(4) %addr, i32 %byte_offset) {
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; CHECK-NEXT:    [[OFFSET:%.*]] = udiv i32 [[BYTE_OFFSET]], 4
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i32, i32* [[ADDR]], i32 [[OFFSET]]
-; CHECK-NEXT:    [[LOAD:%.*]] = load i32, i32* [[GEP]], align 4
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i32, ptr [[ADDR]], i32 [[OFFSET]]
+; CHECK-NEXT:    [[LOAD:%.*]] = load i32, ptr [[GEP]], align 4
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV]], 4096
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
@@ -158,8 +158,8 @@ entry:
 for.body:
   %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ]
   %offset = udiv i32 %byte_offset, 4
-  %gep = getelementptr i32, i32* %addr, i32 %offset
-  %load = load i32, i32* %gep
+  %gep = getelementptr i32, ptr %addr, i32 %offset
+  %load = load i32, ptr %gep
   %iv.next = add nuw nsw i64 %iv, 1
   %exitcond = icmp eq i64 %iv, 4096
   br i1 %exitcond, label %loopexit, label %for.body
@@ -170,7 +170,7 @@ loopexit:
 
 
 
-define void @uniform_store_uniform_value(i32* align(4) %addr) {
+define void @uniform_store_uniform_value(ptr align(4) %addr) {
 ; CHECK-LABEL: @uniform_store_uniform_value(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
@@ -178,7 +178,7 @@ define void @uniform_store_uniform_value(i32* align(4) %addr) {
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    store i32 0, i32* [[ADDR:%.*]], align 4
+; CHECK-NEXT:    store i32 0, ptr [[ADDR:%.*]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
 ; CHECK-NEXT:    [[TMP0:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
 ; CHECK-NEXT:    br i1 [[TMP0]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
@@ -190,7 +190,7 @@ define void @uniform_store_uniform_value(i32* align(4) %addr) {
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    store i32 0, i32* [[ADDR]], align 4
+; CHECK-NEXT:    store i32 0, ptr [[ADDR]], align 4
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV]], 4096
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
@@ -202,7 +202,7 @@ entry:
 
 for.body:
   %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ]
-  store i32 0, i32* %addr
+  store i32 0, ptr %addr
   %iv.next = add nuw nsw i64 %iv, 1
   %exitcond = icmp eq i64 %iv, 4096
   br i1 %exitcond, label %loopexit, label %for.body
@@ -211,15 +211,15 @@ loopexit:
   ret void
 }
 
-define void @uniform_store_varying_value(i32* align(4) %addr) {
+define void @uniform_store_varying_value(ptr align(4) %addr) {
 ; CHECK-LABEL: @uniform_store_varying_value(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = trunc i64 [[INDEX]] to i32
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = trunc i64 [[OFFSET_IDX]] to i32
 ; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[TMP0]], 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = add i32 [[TMP0]], 1
 ; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[TMP0]], 2
@@ -236,8 +236,8 @@ define void @uniform_store_varying_value(i32* align(4) %addr) {
 ; CHECK-NEXT:    [[TMP14:%.*]] = add i32 [[TMP0]], 13
 ; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP0]], 14
 ; CHECK-NEXT:    [[TMP16:%.*]] = add i32 [[TMP0]], 15
-; CHECK-NEXT:    store i32 [[TMP16]], i32* [[ADDR:%.*]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-NEXT:    store i32 [[TMP16]], ptr [[ADDR:%.*]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[OFFSET_IDX]], 16
 ; CHECK-NEXT:    [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
 ; CHECK-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
 ; CHECK:       middle.block:
@@ -249,7 +249,7 @@ define void @uniform_store_varying_value(i32* align(4) %addr) {
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; CHECK-NEXT:    [[IV_I32:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT:    store i32 [[IV_I32]], i32* [[ADDR]], align 4
+; CHECK-NEXT:    store i32 [[IV_I32]], ptr [[ADDR]], align 4
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV]], 4096
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
@@ -262,7 +262,7 @@ entry:
 for.body:
   %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ]
   %iv.i32 = trunc i64 %iv to i32
-  store i32 %iv.i32, i32* %addr
+  store i32 %iv.i32, ptr %addr
   %iv.next = add nuw nsw i64 %iv, 1
   %exitcond = icmp eq i64 %iv, 4096
   br i1 %exitcond, label %loopexit, label %for.body
@@ -271,15 +271,15 @@ loopexit:
   ret void
 }
 
-define void @uniform_rw(i32* align(4) %addr) {
+define void @uniform_rw(ptr align(4) %addr) {
 ; CHECK-LABEL: @uniform_rw(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[LOAD:%.*]] = load i32, i32* [[ADDR:%.*]], align 4
+; CHECK-NEXT:    [[LOAD:%.*]] = load i32, ptr [[ADDR:%.*]], align 4
 ; CHECK-NEXT:    [[INC:%.*]] = add i32 [[LOAD]], 1
-; CHECK-NEXT:    store i32 [[INC]], i32* [[ADDR]], align 4
+; CHECK-NEXT:    store i32 [[INC]], ptr [[ADDR]], align 4
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV]], 4096
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LOOPEXIT:%.*]], label [[FOR_BODY]]
@@ -291,9 +291,9 @@ entry:
 
 for.body:
   %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ]
-  %load = load i32, i32* %addr
+  %load = load i32, ptr %addr
   %inc = add i32 %load, 1
-  store i32 %inc, i32* %addr
+  store i32 %inc, ptr %addr
   %iv.next = add nuw nsw i64 %iv, 1
   %exitcond = icmp eq i64 %iv, 4096
   br i1 %exitcond, label %loopexit, label %for.body
@@ -302,27 +302,23 @@ loopexit:
   ret void
 }
 
-define void @uniform_copy(i32* %A, i32* %B) {
+define void @uniform_copy(ptr %A, ptr %B) {
 ; CHECK-LABEL: @uniform_copy(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[B1:%.*]] = bitcast i32* [[B:%.*]] to i8*
-; CHECK-NEXT:    [[A3:%.*]] = bitcast i32* [[A:%.*]] to i8*
 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
 ; CHECK:       vector.memcheck:
-; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i32, i32* [[B]], i64 1
-; CHECK-NEXT:    [[SCEVGEP2:%.*]] = bitcast i32* [[SCEVGEP]] to i8*
-; CHECK-NEXT:    [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[A]], i64 1
-; CHECK-NEXT:    [[SCEVGEP45:%.*]] = bitcast i32* [[SCEVGEP4]] to i8*
-; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ult i8* [[B1]], [[SCEVGEP45]]
-; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ult i8* [[A3]], [[SCEVGEP2]]
+; CHECK-NEXT:    [[UGLYGEP:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 4
+; CHECK-NEXT:    [[UGLYGEP1:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 4
+; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ult ptr [[B]], [[UGLYGEP1]]
+; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ult ptr [[A]], [[UGLYGEP]]
 ; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
 ; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[A]], align 4, !alias.scope !12
-; CHECK-NEXT:    store i32 [[TMP0]], i32* [[B]], align 4, !alias.scope !15, !noalias !12
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A]], align 4, !alias.scope !12
+; CHECK-NEXT:    store i32 [[TMP0]], ptr [[B]], align 4, !alias.scope !15, !noalias !12
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
 ; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
 ; CHECK-NEXT:    br i1 [[TMP1]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
@@ -334,8 +330,8 @@ define void @uniform_copy(i32* %A, i32* %B) {
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[LOAD:%.*]] = load i32, i32* [[A]], align 4
-; CHECK-NEXT:    store i32 [[LOAD]], i32* [[B]], align 4
+; CHECK-NEXT:    [[LOAD:%.*]] = load i32, ptr [[A]], align 4
+; CHECK-NEXT:    store i32 [[LOAD]], ptr [[B]], align 4
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV]], 4096
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
@@ -347,8 +343,8 @@ entry:
 
 for.body:
   %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ]
-  %load = load i32, i32* %A
-  store i32 %load, i32* %B
+  %load = load i32, ptr %A
+  store i32 %load, ptr %B
   %iv.next = add nuw nsw i64 %iv, 1
   %exitcond = icmp eq i64 %iv, 4096
   br i1 %exitcond, label %loopexit, label %for.body
@@ -358,16 +354,15 @@ loopexit:
 }
 
 
-declare void @init(i32*)
+declare void @init(ptr)
 
 ;; Count the number of bits set in a bit vector -- the key point of relevance is
 ;; that the byte load is uniform across 8 iterations at a time.
-define i32 @test_count_bits(i8* %test_base) {
+define i32 @test_count_bits(ptr %test_base) {
 ; CHECK-LABEL: @test_count_bits(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[ALLOCA:%.*]] = alloca [4096 x i32], align 4
-; CHECK-NEXT:    [[BASE:%.*]] = bitcast [4096 x i32]* [[ALLOCA]] to i32*
-; CHECK-NEXT:    call void @init(i32* [[BASE]])
+; CHECK-NEXT:    call void @init(ptr [[ALLOCA]])
 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
@@ -393,26 +388,26 @@ define i32 @test_count_bits(i8* %test_base) {
 ; CHECK-NEXT:    [[TMP13:%.*]] = udiv i64 [[TMP5]], 8
 ; CHECK-NEXT:    [[TMP14:%.*]] = udiv i64 [[TMP6]], 8
 ; CHECK-NEXT:    [[TMP15:%.*]] = udiv i64 [[TMP7]], 8
-; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i8, i8* [[TEST_BASE:%.*]], i64 [[TMP8]]
-; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i8, i8* [[TEST_BASE]], i64 [[TMP9]]
-; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, i8* [[TEST_BASE]], i64 [[TMP10]]
-; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i8, i8* [[TEST_BASE]], i64 [[TMP11]]
-; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i8, i8* [[TEST_BASE]], i64 [[TMP12]]
-; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i8, i8* [[TEST_BASE]], i64 [[TMP13]]
-; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i8, i8* [[TEST_BASE]], i64 [[TMP14]]
-; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i8, i8* [[TEST_BASE]], i64 [[TMP15]]
-; CHECK-NEXT:    [[TMP24:%.*]] = load i8, i8* [[TMP16]], align 1
-; CHECK-NEXT:    [[TMP25:%.*]] = load i8, i8* [[TMP17]], align 1
-; CHECK-NEXT:    [[TMP26:%.*]] = load i8, i8* [[TMP18]], align 1
-; CHECK-NEXT:    [[TMP27:%.*]] = load i8, i8* [[TMP19]], align 1
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[TEST_BASE:%.*]], i64 [[TMP8]]
+; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TEST_BASE]], i64 [[TMP9]]
+; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[TEST_BASE]], i64 [[TMP10]]
+; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TEST_BASE]], i64 [[TMP11]]
+; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[TEST_BASE]], i64 [[TMP12]]
+; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[TEST_BASE]], i64 [[TMP13]]
+; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TEST_BASE]], i64 [[TMP14]]
+; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i8, ptr [[TEST_BASE]], i64 [[TMP15]]
+; CHECK-NEXT:    [[TMP24:%.*]] = load i8, ptr [[TMP16]], align 1
+; CHECK-NEXT:    [[TMP25:%.*]] = load i8, ptr [[TMP17]], align 1
+; CHECK-NEXT:    [[TMP26:%.*]] = load i8, ptr [[TMP18]], align 1
+; CHECK-NEXT:    [[TMP27:%.*]] = load i8, ptr [[TMP19]], align 1
 ; CHECK-NEXT:    [[TMP28:%.*]] = insertelement <4 x i8> poison, i8 [[TMP24]], i32 0
 ; CHECK-NEXT:    [[TMP29:%.*]] = insertelement <4 x i8> [[TMP28]], i8 [[TMP25]], i32 1
 ; CHECK-NEXT:    [[TMP30:%.*]] = insertelement <4 x i8> [[TMP29]], i8 [[TMP26]], i32 2
 ; CHECK-NEXT:    [[TMP31:%.*]] = insertelement <4 x i8> [[TMP30]], i8 [[TMP27]], i32 3
-; CHECK-NEXT:    [[TMP32:%.*]] = load i8, i8* [[TMP20]], align 1
-; CHECK-NEXT:    [[TMP33:%.*]] = load i8, i8* [[TMP21]], align 1
-; CHECK-NEXT:    [[TMP34:%.*]] = load i8, i8* [[TMP22]], align 1
-; CHECK-NEXT:    [[TMP35:%.*]] = load i8, i8* [[TMP23]], align 1
+; CHECK-NEXT:    [[TMP32:%.*]] = load i8, ptr [[TMP20]], align 1
+; CHECK-NEXT:    [[TMP33:%.*]] = load i8, ptr [[TMP21]], align 1
+; CHECK-NEXT:    [[TMP34:%.*]] = load i8, ptr [[TMP22]], align 1
+; CHECK-NEXT:    [[TMP35:%.*]] = load i8, ptr [[TMP23]], align 1
 ; CHECK-NEXT:    [[TMP36:%.*]] = insertelement <4 x i8> poison, i8 [[TMP32]], i32 0
 ; CHECK-NEXT:    [[TMP37:%.*]] = insertelement <4 x i8> [[TMP36]], i8 [[TMP33]], i32 1
 ; CHECK-NEXT:    [[TMP38:%.*]] = insertelement <4 x i8> [[TMP37]], i8 [[TMP34]], i32 2
@@ -447,8 +442,8 @@ define i32 @test_count_bits(i8* %test_base) {
 ; CHECK-NEXT:    [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[LOOP]] ]
 ; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
 ; CHECK-NEXT:    [[BYTE:%.*]] = udiv i64 [[IV]], 8
-; CHECK-NEXT:    [[TEST_ADDR:%.*]] = getelementptr inbounds i8, i8* [[TEST_BASE]], i64 [[BYTE]]
-; CHECK-NEXT:    [[EARLYCND:%.*]] = load i8, i8* [[TEST_ADDR]], align 1
+; CHECK-NEXT:    [[TEST_ADDR:%.*]] = getelementptr inbounds i8, ptr [[TEST_BASE]], i64 [[BYTE]]
+; CHECK-NEXT:    [[EARLYCND:%.*]] = load i8, ptr [[TEST_ADDR]], align 1
 ; CHECK-NEXT:    [[BIT:%.*]] = urem i64 [[IV]], 8
 ; CHECK-NEXT:    [[BIT_TRUNC:%.*]] = trunc i64 [[BIT]] to i8
 ; CHECK-NEXT:    [[MASK:%.*]] = lshr i8 [[EARLYCND]], [[BIT_TRUNC]]
@@ -463,16 +458,15 @@ define i32 @test_count_bits(i8* %test_base) {
 ;
 entry:
   %alloca = alloca [4096 x i32]
-  %base = bitcast [4096 x i32]* %alloca to i32*
-  call void @init(i32* %base)
+  call void @init(ptr %alloca)
   br label %loop
 loop:
   %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
   %accum = phi i32 [ 0, %entry ], [ %accum.next, %loop ]
   %iv.next = add i64 %iv, 1
   %byte = udiv i64 %iv, 8
-  %test_addr = getelementptr inbounds i8, i8* %test_base, i64 %byte
-  %earlycnd = load i8, i8* %test_addr
+  %test_addr = getelementptr inbounds i8, ptr %test_base, i64 %byte
+  %earlycnd = load i8, ptr %test_addr
   %bit = urem i64 %iv, 8
   %bit.trunc = trunc i64 %bit to i8
   %mask = lshr i8 %earlycnd, %bit.trunc
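
A note on reading the vector body matched above: for any k, iterations %iv = 8k .. 8k+7 all compute the same %byte = udiv i64 %iv, 8 = k, for example

    udiv(16,8) = udiv(17,8) = ... = udiv(23,8) = 2

so %test_addr is invariant across each run of 8 iterations but not across the whole trip count. The load is therefore only piecewise-uniform, which is why the CHECK lines show eight scalar loads (TMP24..TMP35) feeding two <4 x i8> insertelement chains (apparently VF 4 with interleave count 2) rather than a single broadcast.
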
@@ -501,7 +495,7 @@ define i32 @uniform_load_global() {
 ; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* @GAddr, align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr @GAddr, align 4
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT8:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i32 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT9:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT8]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP1]] = add <4 x i32> [[VEC_PHI]], [[BROADCAST_SPLAT9]]
@@ -525,7 +519,7 @@ define i32 @uniform_load_global() {
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; CHECK-NEXT:    [[ACCUM:%.*]] = phi i32 [ [[ACCUM_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[LOAD:%.*]] = load i32, i32* @GAddr, align 4
+; CHECK-NEXT:    [[LOAD:%.*]] = load i32, ptr @GAddr, align 4
 ; CHECK-NEXT:    [[ACCUM_NEXT]] = add i32 [[ACCUM]], [[LOAD]]
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV]], 4096
@@ -540,7 +534,7 @@ entry:
 for.body:
   %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ]
   %accum = phi i32 [%accum.next, %for.body], [0, %entry]
-  %load = load i32, i32* @GAddr
+  %load = load i32, ptr @GAddr
   %accum.next = add i32 %accum, %load
   %iv.next = add nuw nsw i64 %iv, 1
   %exitcond = icmp eq i64 %iv, 4096
@@ -563,7 +557,7 @@ define i32 @uniform_load_constexpr() {
 ; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* getelementptr (i32, i32* @GAddr, i64 5), align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr getelementptr (i32, ptr @GAddr, i64 5), align 4
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT8:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i32 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT9:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT8]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP1]] = add <4 x i32> [[VEC_PHI]], [[BROADCAST_SPLAT9]]
@@ -587,7 +581,7 @@ define i32 @uniform_load_constexpr() {
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; CHECK-NEXT:    [[ACCUM:%.*]] = phi i32 [ [[ACCUM_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[LOAD:%.*]] = load i32, i32* getelementptr (i32, i32* @GAddr, i64 5), align 4
+; CHECK-NEXT:    [[LOAD:%.*]] = load i32, ptr getelementptr (i32, ptr @GAddr, i64 5), align 4
 ; CHECK-NEXT:    [[ACCUM_NEXT]] = add i32 [[ACCUM]], [[LOAD]]
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV]], 4096
@@ -602,7 +596,7 @@ entry:
 for.body:                                         ; preds = %for.body, %entry
   %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ]
   %accum = phi i32 [ %accum.next, %for.body ], [ 0, %entry ]
-  %load = load i32, i32* getelementptr (i32, i32* @GAddr, i64 5), align 4
+  %load = load i32, ptr getelementptr (i32, ptr @GAddr, i64 5), align 4
   %accum.next = add i32 %accum, %load
   %iv.next = add nuw nsw i64 %iv, 1
   %exitcond = icmp eq i64 %iv, 4096

diff  --git a/llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll b/llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll
index 7cfaedef3248..64378d501e59 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll
@@ -7,7 +7,7 @@ target triple = "x86_64-unknown-linux"
 ;
 ; The source code for the test:
 ;
-; void foo(float* restrict A, float* restrict B)
+; void foo(ptr restrict A, ptr restrict B)
 ; {
 ;     for (int i = 0; i < 20; ++i) A[i] += B[i];
 ; }
@@ -16,7 +16,7 @@ target triple = "x86_64-unknown-linux"
 ;
 ; This loop will be vectorized even though the trip count is below the threshold, because vectorization is explicitly forced in metadata.
 ;
-define void @vectorized(float* noalias nocapture %A, float* noalias nocapture readonly %B) {
+define void @vectorized(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) {
 ; CHECK-LABEL: @vectorized(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
@@ -25,20 +25,17 @@ define void @vectorized(float* noalias nocapture %A, float* noalias nocapture re
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, float* [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float* [[TMP2]] to <8 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x float>, <8 x float>* [[TMP3]], align 4, !llvm.access.group !0
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast float* [[TMP5]] to <8 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <8 x float>, <8 x float>* [[TMP6]], align 4, !llvm.access.group !0
-; CHECK-NEXT:    [[TMP7:%.*]] = fadd fast <8 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]]
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast float* [[TMP5]] to <8 x float>*
-; CHECK-NEXT:    store <8 x float> [[TMP7]], <8 x float>* [[TMP8]], align 4, !llvm.access.group !0
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x float>, ptr [[TMP2]], align 4, !llvm.access.group [[ACC_GRP0:![0-9]+]]
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP3]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <8 x float>, ptr [[TMP4]], align 4, !llvm.access.group [[ACC_GRP0]]
+; CHECK-NEXT:    [[TMP5:%.*]] = fadd fast <8 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]]
+; CHECK-NEXT:    store <8 x float> [[TMP5]], ptr [[TMP4]], align 4, !llvm.access.group [[ACC_GRP0]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
-; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP1:!llvm.loop !.*]]
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
+; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP1:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 20, 16
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -47,15 +44,15 @@ define void @vectorized(float* noalias nocapture %A, float* noalias nocapture re
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP10:%.*]] = load float, float* [[ARRAYIDX]], align 4, !llvm.access.group !0
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP11:%.*]] = load float, float* [[ARRAYIDX2]], align 4, !llvm.access.group !0
-; CHECK-NEXT:    [[ADD:%.*]] = fadd fast float [[TMP10]], [[TMP11]]
-; CHECK-NEXT:    store float [[ADD]], float* [[ARRAYIDX2]], align 4, !llvm.access.group !0
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP7:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP0]]
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP8:%.*]] = load float, ptr [[ARRAYIDX2]], align 4, !llvm.access.group [[ACC_GRP0]]
+; CHECK-NEXT:    [[ADD:%.*]] = fadd fast float [[TMP7]], [[TMP8]]
+; CHECK-NEXT:    store float [[ADD]], ptr [[ARRAYIDX2]], align 4, !llvm.access.group [[ACC_GRP0]]
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 20
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], [[LOOP4:!llvm.loop !.*]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK:       for.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -64,12 +61,12 @@ entry:
 
 for.body:
   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
-  %arrayidx = getelementptr inbounds float, float* %B, i64 %indvars.iv
-  %0 = load float, float* %arrayidx, align 4, !llvm.access.group !11
-  %arrayidx2 = getelementptr inbounds float, float* %A, i64 %indvars.iv
-  %1 = load float, float* %arrayidx2, align 4, !llvm.access.group !11
+  %arrayidx = getelementptr inbounds float, ptr %B, i64 %indvars.iv
+  %0 = load float, ptr %arrayidx, align 4, !llvm.access.group !11
+  %arrayidx2 = getelementptr inbounds float, ptr %A, i64 %indvars.iv
+  %1 = load float, ptr %arrayidx2, align 4, !llvm.access.group !11
   %add = fadd fast float %0, %1
-  store float %add, float* %arrayidx2, align 4, !llvm.access.group !11
+  store float %add, ptr %arrayidx2, align 4, !llvm.access.group !11
   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
   %exitcond = icmp eq i64 %indvars.iv.next, 20
   br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !1
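
The bare !llvm.loop !1 on the back edge above is what forces vectorization of this short loop; the metadata nodes themselves are defined at the bottom of the test file, outside the excerpted hunks. A minimal forcing annotation would look like the sketch below -- the node numbers are illustrative, and the test's real !1 may carry further hints (e.g. a width):

    !1 = distinct !{!1, !2}
    !2 = !{!"llvm.loop.vectorize.enable", i1 true}

Note also that the conversion was a purely textual rewrite of the IR, so even the C-style signature in the comment at the top of this file now reads "ptr restrict"; the corresponding C declaration is of course still float* restrict.
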
@@ -86,7 +83,7 @@ for.end:
 ; This loop will be vectorized as the trip count is below the threshold but no
 ; scalar iterations are needed thanks to folding its tail.
 ;
-define void @vectorized1(float* noalias nocapture %A, float* noalias nocapture readonly %B) {
+define void @vectorized1(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) {
 ; CHECK-LABEL: @vectorized1(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
@@ -97,22 +94,19 @@ define void @vectorized1(float* noalias nocapture %A, float* noalias nocapture r
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[INDEX]], i32 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT:    [[INDUCTION:%.*]] = add <8 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
-; CHECK-NEXT:    [[TMP1:%.*]] = icmp ule <8 x i64> [[INDUCTION]], <i64 19, i64 19, i64 19, i64 19, i64 19, i64 19, i64 19, i64 19>
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast float* [[TMP3]] to <8 x float>*
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP4]], i32 4, <8 x i1> [[TMP1]], <8 x float> poison), !llvm.access.group !6
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP5]], i32 0
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast float* [[TMP6]] to <8 x float>*
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD1:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP7]], i32 4, <8 x i1> [[TMP1]], <8 x float> poison), !llvm.access.group !6
-; CHECK-NEXT:    [[TMP8:%.*]] = fadd fast <8 x float> [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_LOAD1]]
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast float* [[TMP6]] to <8 x float>*
-; CHECK-NEXT:    call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP8]], <8 x float>* [[TMP9]], i32 4, <8 x i1> [[TMP1]]), !llvm.access.group !6
+; CHECK-NEXT:    [[VEC_IV:%.*]] = add <8 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ule <8 x i64> [[VEC_IV]], <i64 19, i64 19, i64 19, i64 19, i64 19, i64 19, i64 19, i64 19>
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 0
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0(ptr [[TMP3]], i32 4, <8 x i1> [[TMP1]], <8 x float> poison), !llvm.access.group [[ACC_GRP6:![0-9]+]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP4]], i32 0
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD1:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0(ptr [[TMP5]], i32 4, <8 x i1> [[TMP1]], <8 x float> poison), !llvm.access.group [[ACC_GRP6]]
+; CHECK-NEXT:    [[TMP6:%.*]] = fadd fast <8 x float> [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_LOAD1]]
+; CHECK-NEXT:    call void @llvm.masked.store.v8f32.p0(<8 x float> [[TMP6]], ptr [[TMP5]], i32 4, <8 x i1> [[TMP1]]), !llvm.access.group [[ACC_GRP6]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 8
-; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 24
-; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP7:!llvm.loop !.*]]
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 24
+; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -120,15 +114,15 @@ define void @vectorized1(float* noalias nocapture %A, float* noalias nocapture r
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP11:%.*]] = load float, float* [[ARRAYIDX]], align 4, !llvm.access.group !6
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP12:%.*]] = load float, float* [[ARRAYIDX2]], align 4, !llvm.access.group !6
-; CHECK-NEXT:    [[ADD:%.*]] = fadd fast float [[TMP11]], [[TMP12]]
-; CHECK-NEXT:    store float [[ADD]], float* [[ARRAYIDX2]], align 4, !llvm.access.group !6
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP8:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP6]]
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP9:%.*]] = load float, ptr [[ARRAYIDX2]], align 4, !llvm.access.group [[ACC_GRP6]]
+; CHECK-NEXT:    [[ADD:%.*]] = fadd fast float [[TMP8]], [[TMP9]]
+; CHECK-NEXT:    store float [[ADD]], ptr [[ARRAYIDX2]], align 4, !llvm.access.group [[ACC_GRP6]]
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 20
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], [[LOOP9:!llvm.loop !.*]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
 ; CHECK:       for.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -137,12 +131,12 @@ entry:
 
 for.body:
   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
-  %arrayidx = getelementptr inbounds float, float* %B, i64 %indvars.iv
-  %0 = load float, float* %arrayidx, align 4, !llvm.access.group !13
-  %arrayidx2 = getelementptr inbounds float, float* %A, i64 %indvars.iv
-  %1 = load float, float* %arrayidx2, align 4, !llvm.access.group !13
+  %arrayidx = getelementptr inbounds float, ptr %B, i64 %indvars.iv
+  %0 = load float, ptr %arrayidx, align 4, !llvm.access.group !13
+  %arrayidx2 = getelementptr inbounds float, ptr %A, i64 %indvars.iv
+  %1 = load float, ptr %arrayidx2, align 4, !llvm.access.group !13
   %add = fadd fast float %0, %1
-  store float %add, float* %arrayidx2, align 4, !llvm.access.group !13
+  store float %add, ptr %arrayidx2, align 4, !llvm.access.group !13
   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
   %exitcond = icmp eq i64 %indvars.iv.next, 20
   br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !3
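
The tail folding promised in the comment above is visible directly in the CHECK lines: with VF 8 and a trip count of 20, the vector trip count is rounded up to ceil(20/8) * 8 = 24, and VEC_IV is compared against a splat of 19 to build a per-lane mask for the llvm.masked.load/llvm.masked.store calls. In the last vector iteration the lanes cover indices 16..23, so

    VEC_IV = <16,17,18,19,20,21,22,23>
    mask   = icmp ule VEC_IV, 19 = <1,1,1,1,0,0,0,0>

masks off the four out-of-range lanes. Every original iteration is handled in the vector body, hence middle.block branches on a constant true straight to for.end and the scalar epilogue is dead.
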
@@ -158,7 +152,7 @@ for.end:
 ; This loop will be vectorized as the trip count is below the threshold but no
 ; scalar iterations are needed.
 ;
-define void @vectorized2(float* noalias nocapture %A, float* noalias nocapture readonly %B) {
+define void @vectorized2(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) {
 ; CHECK-LABEL: @vectorized2(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
@@ -167,20 +161,17 @@ define void @vectorized2(float* noalias nocapture %A, float* noalias nocapture r
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, float* [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float* [[TMP2]] to <8 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x float>, <8 x float>* [[TMP3]], align 4, !llvm.access.group !6
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast float* [[TMP5]] to <8 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <8 x float>, <8 x float>* [[TMP6]], align 4, !llvm.access.group !6
-; CHECK-NEXT:    [[TMP7:%.*]] = fadd fast <8 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]]
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast float* [[TMP5]] to <8 x float>*
-; CHECK-NEXT:    store <8 x float> [[TMP7]], <8 x float>* [[TMP8]], align 4, !llvm.access.group !6
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x float>, ptr [[TMP2]], align 4, !llvm.access.group [[ACC_GRP6]]
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP3]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <8 x float>, ptr [[TMP4]], align 4, !llvm.access.group [[ACC_GRP6]]
+; CHECK-NEXT:    [[TMP5:%.*]] = fadd fast <8 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]]
+; CHECK-NEXT:    store <8 x float> [[TMP5]], ptr [[TMP4]], align 4, !llvm.access.group [[ACC_GRP6]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
-; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP10:!llvm.loop !.*]]
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
+; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 16, 16
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -189,15 +180,15 @@ define void @vectorized2(float* noalias nocapture %A, float* noalias nocapture r
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP10:%.*]] = load float, float* [[ARRAYIDX]], align 4, !llvm.access.group !6
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP11:%.*]] = load float, float* [[ARRAYIDX2]], align 4, !llvm.access.group !6
-; CHECK-NEXT:    [[ADD:%.*]] = fadd fast float [[TMP10]], [[TMP11]]
-; CHECK-NEXT:    store float [[ADD]], float* [[ARRAYIDX2]], align 4, !llvm.access.group !6
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP7:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP6]]
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP8:%.*]] = load float, ptr [[ARRAYIDX2]], align 4, !llvm.access.group [[ACC_GRP6]]
+; CHECK-NEXT:    [[ADD:%.*]] = fadd fast float [[TMP7]], [[TMP8]]
+; CHECK-NEXT:    store float [[ADD]], ptr [[ARRAYIDX2]], align 4, !llvm.access.group [[ACC_GRP6]]
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 16
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], [[LOOP11:!llvm.loop !.*]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
 ; CHECK:       for.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -206,12 +197,12 @@ entry:
 
 for.body:
   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
-  %arrayidx = getelementptr inbounds float, float* %B, i64 %indvars.iv
-  %0 = load float, float* %arrayidx, align 4, !llvm.access.group !13
-  %arrayidx2 = getelementptr inbounds float, float* %A, i64 %indvars.iv
-  %1 = load float, float* %arrayidx2, align 4, !llvm.access.group !13
+  %arrayidx = getelementptr inbounds float, ptr %B, i64 %indvars.iv
+  %0 = load float, ptr %arrayidx, align 4, !llvm.access.group !13
+  %arrayidx2 = getelementptr inbounds float, ptr %A, i64 %indvars.iv
+  %1 = load float, ptr %arrayidx2, align 4, !llvm.access.group !13
   %add = fadd fast float %0, %1
-  store float %add, float* %arrayidx2, align 4, !llvm.access.group !13
+  store float %add, ptr %arrayidx2, align 4, !llvm.access.group !13
   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
   %exitcond = icmp eq i64 %indvars.iv.next, 16
   br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !4
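
vectorized2 is the complementary case: its trip count of 16 is an exact multiple of VF 8 (2 * 8 = 16), so neither a mask nor a scalar remainder is needed; middle.block accordingly tests the tautological icmp eq i64 16, 16 before branching to for.end.
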

diff  --git a/llvm/test/Transforms/LoopVectorize/X86/x86-predication.ll b/llvm/test/Transforms/LoopVectorize/X86/x86-predication.ll
index c0e375481dc1..a63a7915346e 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/x86-predication.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/x86-predication.ll
@@ -9,7 +9,7 @@ target triple = "x86_64-apple-macosx10.8.0"
 ; can be vectorized with predication; scalarizing it would cause its pointer
 ; operand to become non-uniform.
 ;
-define i32 @predicated_sdiv_masked_load(i32* %a, i32* %b, i32 %x, i1 %c) {
+define i32 @predicated_sdiv_masked_load(ptr %a, ptr %b, i32 %x, i1 %c) {
 ; CHECK-LABEL: @predicated_sdiv_masked_load(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i1> poison, i1 [[C:%.*]], i32 0
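
The PRED_SDIV_IF / PRED_SDIV_CONTINUE blocks matched in the hunks below implement predicated scalarization: sdiv cannot be executed speculatively for masked-off lanes (division by zero or INT_MIN / -1 is immediate UB), so each lane is guarded by its mask bit, divided as a scalar, and reassembled via insertelement and phi. Distilled to the first lane, the pattern is roughly:

      %m0 = extractelement <2 x i1> %mask, i32 0
      br i1 %m0, label %pred.if, label %pred.continue
    pred.if:
      %e0 = extractelement <2 x i32> %wide.masked.load, i32 0
      %d0 = sdiv i32 %e0, %x
      %v0 = insertelement <2 x i32> poison, i32 %d0, i32 0
      br label %pred.continue
    pred.continue:
      %r0 = phi <2 x i32> [ poison, %vector.body ], [ %v0, %pred.if ]

Here %mask, %wide.masked.load and %x are placeholders for the BROADCAST_SPLAT, WIDE_MASKED_LOAD and X values in the CHECK lines. The load from %b, by contrast, stays a vector llvm.masked.load, keeping its pointer operand uniform -- the property the comment at the top of this test is about.
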
@@ -17,54 +17,52 @@ define i32 @predicated_sdiv_masked_load(i32* %a, i32* %b, i32 %x, i1 %c) {
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_SDIV_CONTINUE2:%.*]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <2 x i32> [ zeroinitializer, [[ENTRY]] ], [ [[TMP19:%.*]], [[PRED_SDIV_CONTINUE2]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <2 x i32> [ zeroinitializer, [[ENTRY]] ], [ [[TMP17:%.*]], [[PRED_SDIV_CONTINUE2]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <2 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i32>, <2 x i32>* [[TMP3]], align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i32, i32* [[B:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i32, i32* [[TMP4]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <2 x i32>*
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* [[TMP6]], i32 4, <2 x i1> [[BROADCAST_SPLAT]], <2 x i32> poison)
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT]], i32 0
-; CHECK-NEXT:    br i1 [[TMP7]], label [[PRED_SDIV_IF:%.*]], label [[PRED_SDIV_CONTINUE:%.*]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP2]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i32, ptr [[B:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i32, ptr [[TMP3]], i32 0
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr [[TMP4]], i32 4, <2 x i1> [[BROADCAST_SPLAT]], <2 x i32> poison)
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT]], i32 0
+; CHECK-NEXT:    br i1 [[TMP5]], label [[PRED_SDIV_IF:%.*]], label [[PRED_SDIV_CONTINUE:%.*]]
 ; CHECK:       pred.sdiv.if:
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x i32> [[WIDE_MASKED_LOAD]], i32 0
-; CHECK-NEXT:    [[TMP9:%.*]] = sdiv i32 [[TMP8]], [[X:%.*]]
-; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <2 x i32> poison, i32 [[TMP9]], i32 0
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i32> [[WIDE_MASKED_LOAD]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = sdiv i32 [[TMP6]], [[X:%.*]]
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x i32> poison, i32 [[TMP7]], i32 0
 ; CHECK-NEXT:    br label [[PRED_SDIV_CONTINUE]]
 ; CHECK:       pred.sdiv.continue:
-; CHECK-NEXT:    [[TMP11:%.*]] = phi <2 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP10]], [[PRED_SDIV_IF]] ]
-; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT]], i32 1
-; CHECK-NEXT:    br i1 [[TMP12]], label [[PRED_SDIV_IF1:%.*]], label [[PRED_SDIV_CONTINUE2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = phi <2 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP8]], [[PRED_SDIV_IF]] ]
+; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT]], i32 1
+; CHECK-NEXT:    br i1 [[TMP10]], label [[PRED_SDIV_IF1:%.*]], label [[PRED_SDIV_CONTINUE2]]
 ; CHECK:       pred.sdiv.if1:
-; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <2 x i32> [[WIDE_MASKED_LOAD]], i32 1
-; CHECK-NEXT:    [[TMP14:%.*]] = sdiv i32 [[TMP13]], [[X]]
-; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <2 x i32> [[TMP11]], i32 [[TMP14]], i32 1
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <2 x i32> [[WIDE_MASKED_LOAD]], i32 1
+; CHECK-NEXT:    [[TMP12:%.*]] = sdiv i32 [[TMP11]], [[X]]
+; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <2 x i32> [[TMP9]], i32 [[TMP12]], i32 1
 ; CHECK-NEXT:    br label [[PRED_SDIV_CONTINUE2]]
 ; CHECK:       pred.sdiv.continue2:
-; CHECK-NEXT:    [[TMP16:%.*]] = phi <2 x i32> [ [[TMP11]], [[PRED_SDIV_CONTINUE]] ], [ [[TMP15]], [[PRED_SDIV_IF1]] ]
-; CHECK-NEXT:    [[TMP17:%.*]] = add nsw <2 x i32> [[TMP16]], [[WIDE_LOAD]]
-; CHECK-NEXT:    [[TMP18:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT]], <i1 true, i1 true>
-; CHECK-NEXT:    [[PREDPHI:%.*]] = select <2 x i1> [[BROADCAST_SPLAT]], <2 x i32> [[TMP17]], <2 x i32> [[WIDE_LOAD]]
-; CHECK-NEXT:    [[TMP19]] = add <2 x i32> [[VEC_PHI]], [[PREDPHI]]
+; CHECK-NEXT:    [[TMP14:%.*]] = phi <2 x i32> [ [[TMP9]], [[PRED_SDIV_CONTINUE]] ], [ [[TMP13]], [[PRED_SDIV_IF1]] ]
+; CHECK-NEXT:    [[TMP15:%.*]] = add nsw <2 x i32> [[TMP14]], [[WIDE_LOAD]]
+; CHECK-NEXT:    [[TMP16:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT]], <i1 true, i1 true>
+; CHECK-NEXT:    [[PREDPHI:%.*]] = select <2 x i1> [[BROADCAST_SPLAT]], <2 x i32> [[TMP15]], <2 x i32> [[WIDE_LOAD]]
+; CHECK-NEXT:    [[TMP17]] = add <2 x i32> [[VEC_PHI]], [[PREDPHI]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000
-; CHECK-NEXT:    br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000
+; CHECK-NEXT:    br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[TMP19]])
+; CHECK-NEXT:    [[TMP19:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[TMP17]])
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 10000, 10000
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_INC:%.*]] ], [ 10000, [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    [[R:%.*]] = phi i32 [ [[T7:%.*]], [[FOR_INC]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    [[T0:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[I]]
-; CHECK-NEXT:    [[T1:%.*]] = load i32, i32* [[T0]], align 4
+; CHECK-NEXT:    [[R:%.*]] = phi i32 [ [[T7:%.*]], [[FOR_INC]] ], [ [[TMP19]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[T0:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[I]]
+; CHECK-NEXT:    [[T1:%.*]] = load i32, ptr [[T0]], align 4
 ; CHECK-NEXT:    br i1 [[C]], label [[IF_THEN:%.*]], label [[FOR_INC]]
 ; CHECK:       if.then:
-; CHECK-NEXT:    [[T2:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[I]]
-; CHECK-NEXT:    [[T3:%.*]] = load i32, i32* [[T2]], align 4
+; CHECK-NEXT:    [[T2:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[I]]
+; CHECK-NEXT:    [[T3:%.*]] = load i32, ptr [[T2]], align 4
 ; CHECK-NEXT:    [[T4:%.*]] = sdiv i32 [[T3]], [[X]]
 ; CHECK-NEXT:    [[T5:%.*]] = add nsw i32 [[T4]], [[T1]]
 ; CHECK-NEXT:    br label [[FOR_INC]]
@@ -75,7 +73,7 @@ define i32 @predicated_sdiv_masked_load(i32* %a, i32* %b, i32 %x, i1 %c) {
 ; CHECK-NEXT:    [[COND:%.*]] = icmp eq i64 [[I_NEXT]], 10000
 ; CHECK-NEXT:    br i1 [[COND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
 ; CHECK:       for.end:
-; CHECK-NEXT:    [[T8:%.*]] = phi i32 [ [[T7]], [[FOR_INC]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[T8:%.*]] = phi i32 [ [[T7]], [[FOR_INC]] ], [ [[TMP19]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    ret i32 [[T8]]
 ;
 ; SINK-GATHER-LABEL: @predicated_sdiv_masked_load(
@@ -87,112 +85,110 @@ define i32 @predicated_sdiv_masked_load(i32* %a, i32* %b, i32 %x, i1 %c) {
 ; SINK-GATHER-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; SINK-GATHER:       vector.body:
 ; SINK-GATHER-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_SDIV_CONTINUE14:%.*]] ]
-; SINK-GATHER-NEXT:    [[VEC_PHI:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP49:%.*]], [[PRED_SDIV_CONTINUE14]] ]
+; SINK-GATHER-NEXT:    [[VEC_PHI:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP47:%.*]], [[PRED_SDIV_CONTINUE14]] ]
 ; SINK-GATHER-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; SINK-GATHER-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP0]]
-; SINK-GATHER-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0
-; SINK-GATHER-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <8 x i32>*
-; SINK-GATHER-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP3]], align 4
-; SINK-GATHER-NEXT:    [[TMP4:%.*]] = getelementptr i32, i32* [[B:%.*]], i64 [[TMP0]]
-; SINK-GATHER-NEXT:    [[TMP5:%.*]] = getelementptr i32, i32* [[TMP4]], i32 0
-; SINK-GATHER-NEXT:    [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <8 x i32>*
-; SINK-GATHER-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP6]], i32 4, <8 x i1> [[BROADCAST_SPLAT]], <8 x i32> poison)
-; SINK-GATHER-NEXT:    [[TMP7:%.*]] = extractelement <8 x i1> [[BROADCAST_SPLAT]], i32 0
-; SINK-GATHER-NEXT:    br i1 [[TMP7]], label [[PRED_SDIV_IF:%.*]], label [[PRED_SDIV_CONTINUE:%.*]]
+; SINK-GATHER-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP0]]
+; SINK-GATHER-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
+; SINK-GATHER-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[TMP2]], align 4
+; SINK-GATHER-NEXT:    [[TMP3:%.*]] = getelementptr i32, ptr [[B:%.*]], i64 [[TMP0]]
+; SINK-GATHER-NEXT:    [[TMP4:%.*]] = getelementptr i32, ptr [[TMP3]], i32 0
+; SINK-GATHER-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[TMP4]], i32 4, <8 x i1> [[BROADCAST_SPLAT]], <8 x i32> poison)
+; SINK-GATHER-NEXT:    [[TMP5:%.*]] = extractelement <8 x i1> [[BROADCAST_SPLAT]], i32 0
+; SINK-GATHER-NEXT:    br i1 [[TMP5]], label [[PRED_SDIV_IF:%.*]], label [[PRED_SDIV_CONTINUE:%.*]]
 ; SINK-GATHER:       pred.sdiv.if:
-; SINK-GATHER-NEXT:    [[TMP8:%.*]] = extractelement <8 x i32> [[WIDE_MASKED_LOAD]], i32 0
-; SINK-GATHER-NEXT:    [[TMP9:%.*]] = sdiv i32 [[TMP8]], [[X:%.*]]
-; SINK-GATHER-NEXT:    [[TMP10:%.*]] = insertelement <8 x i32> poison, i32 [[TMP9]], i32 0
+; SINK-GATHER-NEXT:    [[TMP6:%.*]] = extractelement <8 x i32> [[WIDE_MASKED_LOAD]], i32 0
+; SINK-GATHER-NEXT:    [[TMP7:%.*]] = sdiv i32 [[TMP6]], [[X:%.*]]
+; SINK-GATHER-NEXT:    [[TMP8:%.*]] = insertelement <8 x i32> poison, i32 [[TMP7]], i32 0
 ; SINK-GATHER-NEXT:    br label [[PRED_SDIV_CONTINUE]]
 ; SINK-GATHER:       pred.sdiv.continue:
-; SINK-GATHER-NEXT:    [[TMP11:%.*]] = phi <8 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP10]], [[PRED_SDIV_IF]] ]
-; SINK-GATHER-NEXT:    [[TMP12:%.*]] = extractelement <8 x i1> [[BROADCAST_SPLAT]], i32 1
-; SINK-GATHER-NEXT:    br i1 [[TMP12]], label [[PRED_SDIV_IF1:%.*]], label [[PRED_SDIV_CONTINUE2:%.*]]
+; SINK-GATHER-NEXT:    [[TMP9:%.*]] = phi <8 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP8]], [[PRED_SDIV_IF]] ]
+; SINK-GATHER-NEXT:    [[TMP10:%.*]] = extractelement <8 x i1> [[BROADCAST_SPLAT]], i32 1
+; SINK-GATHER-NEXT:    br i1 [[TMP10]], label [[PRED_SDIV_IF1:%.*]], label [[PRED_SDIV_CONTINUE2:%.*]]
 ; SINK-GATHER:       pred.sdiv.if1:
-; SINK-GATHER-NEXT:    [[TMP13:%.*]] = extractelement <8 x i32> [[WIDE_MASKED_LOAD]], i32 1
-; SINK-GATHER-NEXT:    [[TMP14:%.*]] = sdiv i32 [[TMP13]], [[X]]
-; SINK-GATHER-NEXT:    [[TMP15:%.*]] = insertelement <8 x i32> [[TMP11]], i32 [[TMP14]], i32 1
+; SINK-GATHER-NEXT:    [[TMP11:%.*]] = extractelement <8 x i32> [[WIDE_MASKED_LOAD]], i32 1
+; SINK-GATHER-NEXT:    [[TMP12:%.*]] = sdiv i32 [[TMP11]], [[X]]
+; SINK-GATHER-NEXT:    [[TMP13:%.*]] = insertelement <8 x i32> [[TMP9]], i32 [[TMP12]], i32 1
 ; SINK-GATHER-NEXT:    br label [[PRED_SDIV_CONTINUE2]]
 ; SINK-GATHER:       pred.sdiv.continue2:
-; SINK-GATHER-NEXT:    [[TMP16:%.*]] = phi <8 x i32> [ [[TMP11]], [[PRED_SDIV_CONTINUE]] ], [ [[TMP15]], [[PRED_SDIV_IF1]] ]
-; SINK-GATHER-NEXT:    [[TMP17:%.*]] = extractelement <8 x i1> [[BROADCAST_SPLAT]], i32 2
-; SINK-GATHER-NEXT:    br i1 [[TMP17]], label [[PRED_SDIV_IF3:%.*]], label [[PRED_SDIV_CONTINUE4:%.*]]
+; SINK-GATHER-NEXT:    [[TMP14:%.*]] = phi <8 x i32> [ [[TMP9]], [[PRED_SDIV_CONTINUE]] ], [ [[TMP13]], [[PRED_SDIV_IF1]] ]
+; SINK-GATHER-NEXT:    [[TMP15:%.*]] = extractelement <8 x i1> [[BROADCAST_SPLAT]], i32 2
+; SINK-GATHER-NEXT:    br i1 [[TMP15]], label [[PRED_SDIV_IF3:%.*]], label [[PRED_SDIV_CONTINUE4:%.*]]
 ; SINK-GATHER:       pred.sdiv.if3:
-; SINK-GATHER-NEXT:    [[TMP18:%.*]] = extractelement <8 x i32> [[WIDE_MASKED_LOAD]], i32 2
-; SINK-GATHER-NEXT:    [[TMP19:%.*]] = sdiv i32 [[TMP18]], [[X]]
-; SINK-GATHER-NEXT:    [[TMP20:%.*]] = insertelement <8 x i32> [[TMP16]], i32 [[TMP19]], i32 2
+; SINK-GATHER-NEXT:    [[TMP16:%.*]] = extractelement <8 x i32> [[WIDE_MASKED_LOAD]], i32 2
+; SINK-GATHER-NEXT:    [[TMP17:%.*]] = sdiv i32 [[TMP16]], [[X]]
+; SINK-GATHER-NEXT:    [[TMP18:%.*]] = insertelement <8 x i32> [[TMP14]], i32 [[TMP17]], i32 2
 ; SINK-GATHER-NEXT:    br label [[PRED_SDIV_CONTINUE4]]
 ; SINK-GATHER:       pred.sdiv.continue4:
-; SINK-GATHER-NEXT:    [[TMP21:%.*]] = phi <8 x i32> [ [[TMP16]], [[PRED_SDIV_CONTINUE2]] ], [ [[TMP20]], [[PRED_SDIV_IF3]] ]
-; SINK-GATHER-NEXT:    [[TMP22:%.*]] = extractelement <8 x i1> [[BROADCAST_SPLAT]], i32 3
-; SINK-GATHER-NEXT:    br i1 [[TMP22]], label [[PRED_SDIV_IF5:%.*]], label [[PRED_SDIV_CONTINUE6:%.*]]
+; SINK-GATHER-NEXT:    [[TMP19:%.*]] = phi <8 x i32> [ [[TMP14]], [[PRED_SDIV_CONTINUE2]] ], [ [[TMP18]], [[PRED_SDIV_IF3]] ]
+; SINK-GATHER-NEXT:    [[TMP20:%.*]] = extractelement <8 x i1> [[BROADCAST_SPLAT]], i32 3
+; SINK-GATHER-NEXT:    br i1 [[TMP20]], label [[PRED_SDIV_IF5:%.*]], label [[PRED_SDIV_CONTINUE6:%.*]]
 ; SINK-GATHER:       pred.sdiv.if5:
-; SINK-GATHER-NEXT:    [[TMP23:%.*]] = extractelement <8 x i32> [[WIDE_MASKED_LOAD]], i32 3
-; SINK-GATHER-NEXT:    [[TMP24:%.*]] = sdiv i32 [[TMP23]], [[X]]
-; SINK-GATHER-NEXT:    [[TMP25:%.*]] = insertelement <8 x i32> [[TMP21]], i32 [[TMP24]], i32 3
+; SINK-GATHER-NEXT:    [[TMP21:%.*]] = extractelement <8 x i32> [[WIDE_MASKED_LOAD]], i32 3
+; SINK-GATHER-NEXT:    [[TMP22:%.*]] = sdiv i32 [[TMP21]], [[X]]
+; SINK-GATHER-NEXT:    [[TMP23:%.*]] = insertelement <8 x i32> [[TMP19]], i32 [[TMP22]], i32 3
 ; SINK-GATHER-NEXT:    br label [[PRED_SDIV_CONTINUE6]]
 ; SINK-GATHER:       pred.sdiv.continue6:
-; SINK-GATHER-NEXT:    [[TMP26:%.*]] = phi <8 x i32> [ [[TMP21]], [[PRED_SDIV_CONTINUE4]] ], [ [[TMP25]], [[PRED_SDIV_IF5]] ]
-; SINK-GATHER-NEXT:    [[TMP27:%.*]] = extractelement <8 x i1> [[BROADCAST_SPLAT]], i32 4
-; SINK-GATHER-NEXT:    br i1 [[TMP27]], label [[PRED_SDIV_IF7:%.*]], label [[PRED_SDIV_CONTINUE8:%.*]]
+; SINK-GATHER-NEXT:    [[TMP24:%.*]] = phi <8 x i32> [ [[TMP19]], [[PRED_SDIV_CONTINUE4]] ], [ [[TMP23]], [[PRED_SDIV_IF5]] ]
+; SINK-GATHER-NEXT:    [[TMP25:%.*]] = extractelement <8 x i1> [[BROADCAST_SPLAT]], i32 4
+; SINK-GATHER-NEXT:    br i1 [[TMP25]], label [[PRED_SDIV_IF7:%.*]], label [[PRED_SDIV_CONTINUE8:%.*]]
 ; SINK-GATHER:       pred.sdiv.if7:
-; SINK-GATHER-NEXT:    [[TMP28:%.*]] = extractelement <8 x i32> [[WIDE_MASKED_LOAD]], i32 4
-; SINK-GATHER-NEXT:    [[TMP29:%.*]] = sdiv i32 [[TMP28]], [[X]]
-; SINK-GATHER-NEXT:    [[TMP30:%.*]] = insertelement <8 x i32> [[TMP26]], i32 [[TMP29]], i32 4
+; SINK-GATHER-NEXT:    [[TMP26:%.*]] = extractelement <8 x i32> [[WIDE_MASKED_LOAD]], i32 4
+; SINK-GATHER-NEXT:    [[TMP27:%.*]] = sdiv i32 [[TMP26]], [[X]]
+; SINK-GATHER-NEXT:    [[TMP28:%.*]] = insertelement <8 x i32> [[TMP24]], i32 [[TMP27]], i32 4
 ; SINK-GATHER-NEXT:    br label [[PRED_SDIV_CONTINUE8]]
 ; SINK-GATHER:       pred.sdiv.continue8:
-; SINK-GATHER-NEXT:    [[TMP31:%.*]] = phi <8 x i32> [ [[TMP26]], [[PRED_SDIV_CONTINUE6]] ], [ [[TMP30]], [[PRED_SDIV_IF7]] ]
-; SINK-GATHER-NEXT:    [[TMP32:%.*]] = extractelement <8 x i1> [[BROADCAST_SPLAT]], i32 5
-; SINK-GATHER-NEXT:    br i1 [[TMP32]], label [[PRED_SDIV_IF9:%.*]], label [[PRED_SDIV_CONTINUE10:%.*]]
+; SINK-GATHER-NEXT:    [[TMP29:%.*]] = phi <8 x i32> [ [[TMP24]], [[PRED_SDIV_CONTINUE6]] ], [ [[TMP28]], [[PRED_SDIV_IF7]] ]
+; SINK-GATHER-NEXT:    [[TMP30:%.*]] = extractelement <8 x i1> [[BROADCAST_SPLAT]], i32 5
+; SINK-GATHER-NEXT:    br i1 [[TMP30]], label [[PRED_SDIV_IF9:%.*]], label [[PRED_SDIV_CONTINUE10:%.*]]
 ; SINK-GATHER:       pred.sdiv.if9:
-; SINK-GATHER-NEXT:    [[TMP33:%.*]] = extractelement <8 x i32> [[WIDE_MASKED_LOAD]], i32 5
-; SINK-GATHER-NEXT:    [[TMP34:%.*]] = sdiv i32 [[TMP33]], [[X]]
-; SINK-GATHER-NEXT:    [[TMP35:%.*]] = insertelement <8 x i32> [[TMP31]], i32 [[TMP34]], i32 5
+; SINK-GATHER-NEXT:    [[TMP31:%.*]] = extractelement <8 x i32> [[WIDE_MASKED_LOAD]], i32 5
+; SINK-GATHER-NEXT:    [[TMP32:%.*]] = sdiv i32 [[TMP31]], [[X]]
+; SINK-GATHER-NEXT:    [[TMP33:%.*]] = insertelement <8 x i32> [[TMP29]], i32 [[TMP32]], i32 5
 ; SINK-GATHER-NEXT:    br label [[PRED_SDIV_CONTINUE10]]
 ; SINK-GATHER:       pred.sdiv.continue10:
-; SINK-GATHER-NEXT:    [[TMP36:%.*]] = phi <8 x i32> [ [[TMP31]], [[PRED_SDIV_CONTINUE8]] ], [ [[TMP35]], [[PRED_SDIV_IF9]] ]
-; SINK-GATHER-NEXT:    [[TMP37:%.*]] = extractelement <8 x i1> [[BROADCAST_SPLAT]], i32 6
-; SINK-GATHER-NEXT:    br i1 [[TMP37]], label [[PRED_SDIV_IF11:%.*]], label [[PRED_SDIV_CONTINUE12:%.*]]
+; SINK-GATHER-NEXT:    [[TMP34:%.*]] = phi <8 x i32> [ [[TMP29]], [[PRED_SDIV_CONTINUE8]] ], [ [[TMP33]], [[PRED_SDIV_IF9]] ]
+; SINK-GATHER-NEXT:    [[TMP35:%.*]] = extractelement <8 x i1> [[BROADCAST_SPLAT]], i32 6
+; SINK-GATHER-NEXT:    br i1 [[TMP35]], label [[PRED_SDIV_IF11:%.*]], label [[PRED_SDIV_CONTINUE12:%.*]]
 ; SINK-GATHER:       pred.sdiv.if11:
-; SINK-GATHER-NEXT:    [[TMP38:%.*]] = extractelement <8 x i32> [[WIDE_MASKED_LOAD]], i32 6
-; SINK-GATHER-NEXT:    [[TMP39:%.*]] = sdiv i32 [[TMP38]], [[X]]
-; SINK-GATHER-NEXT:    [[TMP40:%.*]] = insertelement <8 x i32> [[TMP36]], i32 [[TMP39]], i32 6
+; SINK-GATHER-NEXT:    [[TMP36:%.*]] = extractelement <8 x i32> [[WIDE_MASKED_LOAD]], i32 6
+; SINK-GATHER-NEXT:    [[TMP37:%.*]] = sdiv i32 [[TMP36]], [[X]]
+; SINK-GATHER-NEXT:    [[TMP38:%.*]] = insertelement <8 x i32> [[TMP34]], i32 [[TMP37]], i32 6
 ; SINK-GATHER-NEXT:    br label [[PRED_SDIV_CONTINUE12]]
 ; SINK-GATHER:       pred.sdiv.continue12:
-; SINK-GATHER-NEXT:    [[TMP41:%.*]] = phi <8 x i32> [ [[TMP36]], [[PRED_SDIV_CONTINUE10]] ], [ [[TMP40]], [[PRED_SDIV_IF11]] ]
-; SINK-GATHER-NEXT:    [[TMP42:%.*]] = extractelement <8 x i1> [[BROADCAST_SPLAT]], i32 7
-; SINK-GATHER-NEXT:    br i1 [[TMP42]], label [[PRED_SDIV_IF13:%.*]], label [[PRED_SDIV_CONTINUE14]]
+; SINK-GATHER-NEXT:    [[TMP39:%.*]] = phi <8 x i32> [ [[TMP34]], [[PRED_SDIV_CONTINUE10]] ], [ [[TMP38]], [[PRED_SDIV_IF11]] ]
+; SINK-GATHER-NEXT:    [[TMP40:%.*]] = extractelement <8 x i1> [[BROADCAST_SPLAT]], i32 7
+; SINK-GATHER-NEXT:    br i1 [[TMP40]], label [[PRED_SDIV_IF13:%.*]], label [[PRED_SDIV_CONTINUE14]]
 ; SINK-GATHER:       pred.sdiv.if13:
-; SINK-GATHER-NEXT:    [[TMP43:%.*]] = extractelement <8 x i32> [[WIDE_MASKED_LOAD]], i32 7
-; SINK-GATHER-NEXT:    [[TMP44:%.*]] = sdiv i32 [[TMP43]], [[X]]
-; SINK-GATHER-NEXT:    [[TMP45:%.*]] = insertelement <8 x i32> [[TMP41]], i32 [[TMP44]], i32 7
+; SINK-GATHER-NEXT:    [[TMP41:%.*]] = extractelement <8 x i32> [[WIDE_MASKED_LOAD]], i32 7
+; SINK-GATHER-NEXT:    [[TMP42:%.*]] = sdiv i32 [[TMP41]], [[X]]
+; SINK-GATHER-NEXT:    [[TMP43:%.*]] = insertelement <8 x i32> [[TMP39]], i32 [[TMP42]], i32 7
 ; SINK-GATHER-NEXT:    br label [[PRED_SDIV_CONTINUE14]]
 ; SINK-GATHER:       pred.sdiv.continue14:
-; SINK-GATHER-NEXT:    [[TMP46:%.*]] = phi <8 x i32> [ [[TMP41]], [[PRED_SDIV_CONTINUE12]] ], [ [[TMP45]], [[PRED_SDIV_IF13]] ]
-; SINK-GATHER-NEXT:    [[TMP47:%.*]] = add nsw <8 x i32> [[TMP46]], [[WIDE_LOAD]]
-; SINK-GATHER-NEXT:    [[TMP48:%.*]] = xor <8 x i1> [[BROADCAST_SPLAT]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
-; SINK-GATHER-NEXT:    [[PREDPHI:%.*]] = select <8 x i1> [[BROADCAST_SPLAT]], <8 x i32> [[TMP47]], <8 x i32> [[WIDE_LOAD]]
-; SINK-GATHER-NEXT:    [[TMP49]] = add <8 x i32> [[VEC_PHI]], [[PREDPHI]]
+; SINK-GATHER-NEXT:    [[TMP44:%.*]] = phi <8 x i32> [ [[TMP39]], [[PRED_SDIV_CONTINUE12]] ], [ [[TMP43]], [[PRED_SDIV_IF13]] ]
+; SINK-GATHER-NEXT:    [[TMP45:%.*]] = add nsw <8 x i32> [[TMP44]], [[WIDE_LOAD]]
+; SINK-GATHER-NEXT:    [[TMP46:%.*]] = xor <8 x i1> [[BROADCAST_SPLAT]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
+; SINK-GATHER-NEXT:    [[PREDPHI:%.*]] = select <8 x i1> [[BROADCAST_SPLAT]], <8 x i32> [[TMP45]], <8 x i32> [[WIDE_LOAD]]
+; SINK-GATHER-NEXT:    [[TMP47]] = add <8 x i32> [[VEC_PHI]], [[PREDPHI]]
 ; SINK-GATHER-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; SINK-GATHER-NEXT:    [[TMP50:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000
-; SINK-GATHER-NEXT:    br i1 [[TMP50]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; SINK-GATHER-NEXT:    [[TMP48:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000
+; SINK-GATHER-NEXT:    br i1 [[TMP48]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; SINK-GATHER:       middle.block:
-; SINK-GATHER-NEXT:    [[TMP51:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP49]])
+; SINK-GATHER-NEXT:    [[TMP49:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP47]])
 ; SINK-GATHER-NEXT:    [[CMP_N:%.*]] = icmp eq i64 10000, 10000
 ; SINK-GATHER-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; SINK-GATHER:       scalar.ph:
 ; SINK-GATHER-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 10000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; SINK-GATHER-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP51]], [[MIDDLE_BLOCK]] ]
+; SINK-GATHER-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP49]], [[MIDDLE_BLOCK]] ]
 ; SINK-GATHER-NEXT:    br label [[FOR_BODY:%.*]]
 ; SINK-GATHER:       for.body:
 ; SINK-GATHER-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[FOR_INC:%.*]] ]
 ; SINK-GATHER-NEXT:    [[R:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[T7:%.*]], [[FOR_INC]] ]
-; SINK-GATHER-NEXT:    [[T0:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[I]]
-; SINK-GATHER-NEXT:    [[T1:%.*]] = load i32, i32* [[T0]], align 4
+; SINK-GATHER-NEXT:    [[T0:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[I]]
+; SINK-GATHER-NEXT:    [[T1:%.*]] = load i32, ptr [[T0]], align 4
 ; SINK-GATHER-NEXT:    br i1 [[C]], label [[IF_THEN:%.*]], label [[FOR_INC]]
 ; SINK-GATHER:       if.then:
-; SINK-GATHER-NEXT:    [[T2:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[I]]
-; SINK-GATHER-NEXT:    [[T3:%.*]] = load i32, i32* [[T2]], align 4
+; SINK-GATHER-NEXT:    [[T2:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[I]]
+; SINK-GATHER-NEXT:    [[T3:%.*]] = load i32, ptr [[T2]], align 4
 ; SINK-GATHER-NEXT:    [[T4:%.*]] = sdiv i32 [[T3]], [[X]]
 ; SINK-GATHER-NEXT:    [[T5:%.*]] = add nsw i32 [[T4]], [[T1]]
 ; SINK-GATHER-NEXT:    br label [[FOR_INC]]
@@ -203,7 +199,7 @@ define i32 @predicated_sdiv_masked_load(i32* %a, i32* %b, i32 %x, i1 %c) {
 ; SINK-GATHER-NEXT:    [[COND:%.*]] = icmp eq i64 [[I_NEXT]], 10000
 ; SINK-GATHER-NEXT:    br i1 [[COND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
 ; SINK-GATHER:       for.end:
-; SINK-GATHER-NEXT:    [[T8:%.*]] = phi i32 [ [[T7]], [[FOR_INC]] ], [ [[TMP51]], [[MIDDLE_BLOCK]] ]
+; SINK-GATHER-NEXT:    [[T8:%.*]] = phi i32 [ [[T7]], [[FOR_INC]] ], [ [[TMP49]], [[MIDDLE_BLOCK]] ]
 ; SINK-GATHER-NEXT:    ret i32 [[T8]]
 ;
 entry:
@@ -212,13 +208,13 @@ entry:
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.inc ]
   %r = phi i32 [ 0, %entry ], [ %t7, %for.inc ]
-  %t0 = getelementptr inbounds i32, i32* %a, i64 %i
-  %t1 = load i32, i32* %t0, align 4
+  %t0 = getelementptr inbounds i32, ptr %a, i64 %i
+  %t1 = load i32, ptr %t0, align 4
   br i1 %c, label %if.then, label %for.inc
 
 if.then:
-  %t2 = getelementptr inbounds i32, i32* %b, i64 %i
-  %t3 = load i32, i32* %t2, align 4
+  %t2 = getelementptr inbounds i32, ptr %b, i64 %i
+  %t3 = load i32, ptr %t2, align 4
   %t4 = sdiv i32 %t3, %x
   %t5 = add nsw i32 %t4, %t1
   br label %for.inc
@@ -238,7 +234,7 @@ for.end:
 ; This test ensures that a load, which would otherwise have been widened, is
 ; instead scalarized if the cost model so decides as part of its
 ; sink-scalar-operands optimization for predicated instructions.
-define i32 @scalarize_and_sink_gather(i32* %a, i1 %c, i32 %x, i64 %n) {
+define i32 @scalarize_and_sink_gather(ptr %a, i1 %c, i32 %x, i64 %n) {
 ; CHECK-LABEL: @scalarize_and_sink_gather(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1)
@@ -261,8 +257,8 @@ define i32 @scalarize_and_sink_gather(i32* %a, i1 %c, i32 %x, i64 %n) {
 ; CHECK-NEXT:    br i1 [[TMP1]], label [[PRED_UDIV_IF:%.*]], label [[PRED_UDIV_CONTINUE:%.*]]
 ; CHECK:       pred.udiv.if:
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i64> [[TMP0]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP2]]
-; CHECK-NEXT:    [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4
 ; CHECK-NEXT:    [[TMP5:%.*]] = udiv i32 [[TMP4]], [[X]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[TMP5]], i32 0
 ; CHECK-NEXT:    br label [[PRED_UDIV_CONTINUE]]
@@ -273,8 +269,8 @@ define i32 @scalarize_and_sink_gather(i32* %a, i1 %c, i32 %x, i64 %n) {
 ; CHECK-NEXT:    br i1 [[TMP9]], label [[PRED_UDIV_IF1:%.*]], label [[PRED_UDIV_CONTINUE2]]
 ; CHECK:       pred.udiv.if1:
 ; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <2 x i64> [[TMP0]], i32 1
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP10]]
-; CHECK-NEXT:    [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 4
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = load i32, ptr [[TMP11]], align 4
 ; CHECK-NEXT:    [[TMP13:%.*]] = udiv i32 [[TMP12]], [[X]]
 ; CHECK-NEXT:    [[TMP14:%.*]] = insertelement <2 x i32> [[TMP8]], i32 [[TMP13]], i32 1
 ; CHECK-NEXT:    br label [[PRED_UDIV_CONTINUE2]]
@@ -302,8 +298,8 @@ define i32 @scalarize_and_sink_gather(i32* %a, i1 %c, i32 %x, i64 %n) {
 ; CHECK-NEXT:    [[I7:%.*]] = mul i64 [[I]], 777
 ; CHECK-NEXT:    br i1 [[C]], label [[IF_THEN:%.*]], label [[FOR_INC]]
 ; CHECK:       if.then:
-; CHECK-NEXT:    [[T0:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[I7]]
-; CHECK-NEXT:    [[T2:%.*]] = load i32, i32* [[T0]], align 4
+; CHECK-NEXT:    [[T0:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[I7]]
+; CHECK-NEXT:    [[T2:%.*]] = load i32, ptr [[T0]], align 4
 ; CHECK-NEXT:    [[T4:%.*]] = udiv i32 [[T2]], [[X]]
 ; CHECK-NEXT:    br label [[FOR_INC]]
 ; CHECK:       for.inc:
@@ -338,8 +334,8 @@ define i32 @scalarize_and_sink_gather(i32* %a, i1 %c, i32 %x, i64 %n) {
 ; SINK-GATHER-NEXT:    br i1 [[TMP1]], label [[PRED_UDIV_IF:%.*]], label [[PRED_UDIV_CONTINUE:%.*]]
 ; SINK-GATHER:       pred.udiv.if:
 ; SINK-GATHER-NEXT:    [[TMP2:%.*]] = extractelement <8 x i64> [[TMP0]], i32 0
-; SINK-GATHER-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP2]]
-; SINK-GATHER-NEXT:    [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4
+; SINK-GATHER-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP2]]
+; SINK-GATHER-NEXT:    [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4
 ; SINK-GATHER-NEXT:    [[TMP5:%.*]] = udiv i32 [[TMP4]], [[X]]
 ; SINK-GATHER-NEXT:    [[TMP6:%.*]] = insertelement <8 x i32> poison, i32 [[TMP5]], i32 0
 ; SINK-GATHER-NEXT:    br label [[PRED_UDIV_CONTINUE]]
@@ -350,8 +346,8 @@ define i32 @scalarize_and_sink_gather(i32* %a, i1 %c, i32 %x, i64 %n) {
 ; SINK-GATHER-NEXT:    br i1 [[TMP9]], label [[PRED_UDIV_IF1:%.*]], label [[PRED_UDIV_CONTINUE2:%.*]]
 ; SINK-GATHER:       pred.udiv.if1:
 ; SINK-GATHER-NEXT:    [[TMP10:%.*]] = extractelement <8 x i64> [[TMP0]], i32 1
-; SINK-GATHER-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP10]]
-; SINK-GATHER-NEXT:    [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 4
+; SINK-GATHER-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP10]]
+; SINK-GATHER-NEXT:    [[TMP12:%.*]] = load i32, ptr [[TMP11]], align 4
 ; SINK-GATHER-NEXT:    [[TMP13:%.*]] = udiv i32 [[TMP12]], [[X]]
 ; SINK-GATHER-NEXT:    [[TMP14:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[TMP13]], i32 1
 ; SINK-GATHER-NEXT:    br label [[PRED_UDIV_CONTINUE2]]
@@ -362,8 +358,8 @@ define i32 @scalarize_and_sink_gather(i32* %a, i1 %c, i32 %x, i64 %n) {
 ; SINK-GATHER-NEXT:    br i1 [[TMP17]], label [[PRED_UDIV_IF3:%.*]], label [[PRED_UDIV_CONTINUE4:%.*]]
 ; SINK-GATHER:       pred.udiv.if3:
 ; SINK-GATHER-NEXT:    [[TMP18:%.*]] = extractelement <8 x i64> [[TMP0]], i32 2
-; SINK-GATHER-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP18]]
-; SINK-GATHER-NEXT:    [[TMP20:%.*]] = load i32, i32* [[TMP19]], align 4
+; SINK-GATHER-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP18]]
+; SINK-GATHER-NEXT:    [[TMP20:%.*]] = load i32, ptr [[TMP19]], align 4
 ; SINK-GATHER-NEXT:    [[TMP21:%.*]] = udiv i32 [[TMP20]], [[X]]
 ; SINK-GATHER-NEXT:    [[TMP22:%.*]] = insertelement <8 x i32> [[TMP16]], i32 [[TMP21]], i32 2
 ; SINK-GATHER-NEXT:    br label [[PRED_UDIV_CONTINUE4]]
@@ -374,8 +370,8 @@ define i32 @scalarize_and_sink_gather(i32* %a, i1 %c, i32 %x, i64 %n) {
 ; SINK-GATHER-NEXT:    br i1 [[TMP25]], label [[PRED_UDIV_IF5:%.*]], label [[PRED_UDIV_CONTINUE6:%.*]]
 ; SINK-GATHER:       pred.udiv.if5:
 ; SINK-GATHER-NEXT:    [[TMP26:%.*]] = extractelement <8 x i64> [[TMP0]], i32 3
-; SINK-GATHER-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP26]]
-; SINK-GATHER-NEXT:    [[TMP28:%.*]] = load i32, i32* [[TMP27]], align 4
+; SINK-GATHER-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP26]]
+; SINK-GATHER-NEXT:    [[TMP28:%.*]] = load i32, ptr [[TMP27]], align 4
 ; SINK-GATHER-NEXT:    [[TMP29:%.*]] = udiv i32 [[TMP28]], [[X]]
 ; SINK-GATHER-NEXT:    [[TMP30:%.*]] = insertelement <8 x i32> [[TMP24]], i32 [[TMP29]], i32 3
 ; SINK-GATHER-NEXT:    br label [[PRED_UDIV_CONTINUE6]]
@@ -386,8 +382,8 @@ define i32 @scalarize_and_sink_gather(i32* %a, i1 %c, i32 %x, i64 %n) {
 ; SINK-GATHER-NEXT:    br i1 [[TMP33]], label [[PRED_UDIV_IF7:%.*]], label [[PRED_UDIV_CONTINUE8:%.*]]
 ; SINK-GATHER:       pred.udiv.if7:
 ; SINK-GATHER-NEXT:    [[TMP34:%.*]] = extractelement <8 x i64> [[TMP0]], i32 4
-; SINK-GATHER-NEXT:    [[TMP35:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP34]]
-; SINK-GATHER-NEXT:    [[TMP36:%.*]] = load i32, i32* [[TMP35]], align 4
+; SINK-GATHER-NEXT:    [[TMP35:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP34]]
+; SINK-GATHER-NEXT:    [[TMP36:%.*]] = load i32, ptr [[TMP35]], align 4
 ; SINK-GATHER-NEXT:    [[TMP37:%.*]] = udiv i32 [[TMP36]], [[X]]
 ; SINK-GATHER-NEXT:    [[TMP38:%.*]] = insertelement <8 x i32> [[TMP32]], i32 [[TMP37]], i32 4
 ; SINK-GATHER-NEXT:    br label [[PRED_UDIV_CONTINUE8]]
@@ -398,8 +394,8 @@ define i32 @scalarize_and_sink_gather(i32* %a, i1 %c, i32 %x, i64 %n) {
 ; SINK-GATHER-NEXT:    br i1 [[TMP41]], label [[PRED_UDIV_IF9:%.*]], label [[PRED_UDIV_CONTINUE10:%.*]]
 ; SINK-GATHER:       pred.udiv.if9:
 ; SINK-GATHER-NEXT:    [[TMP42:%.*]] = extractelement <8 x i64> [[TMP0]], i32 5
-; SINK-GATHER-NEXT:    [[TMP43:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP42]]
-; SINK-GATHER-NEXT:    [[TMP44:%.*]] = load i32, i32* [[TMP43]], align 4
+; SINK-GATHER-NEXT:    [[TMP43:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP42]]
+; SINK-GATHER-NEXT:    [[TMP44:%.*]] = load i32, ptr [[TMP43]], align 4
 ; SINK-GATHER-NEXT:    [[TMP45:%.*]] = udiv i32 [[TMP44]], [[X]]
 ; SINK-GATHER-NEXT:    [[TMP46:%.*]] = insertelement <8 x i32> [[TMP40]], i32 [[TMP45]], i32 5
 ; SINK-GATHER-NEXT:    br label [[PRED_UDIV_CONTINUE10]]
@@ -410,8 +406,8 @@ define i32 @scalarize_and_sink_gather(i32* %a, i1 %c, i32 %x, i64 %n) {
 ; SINK-GATHER-NEXT:    br i1 [[TMP49]], label [[PRED_UDIV_IF11:%.*]], label [[PRED_UDIV_CONTINUE12:%.*]]
 ; SINK-GATHER:       pred.udiv.if11:
 ; SINK-GATHER-NEXT:    [[TMP50:%.*]] = extractelement <8 x i64> [[TMP0]], i32 6
-; SINK-GATHER-NEXT:    [[TMP51:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP50]]
-; SINK-GATHER-NEXT:    [[TMP52:%.*]] = load i32, i32* [[TMP51]], align 4
+; SINK-GATHER-NEXT:    [[TMP51:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP50]]
+; SINK-GATHER-NEXT:    [[TMP52:%.*]] = load i32, ptr [[TMP51]], align 4
 ; SINK-GATHER-NEXT:    [[TMP53:%.*]] = udiv i32 [[TMP52]], [[X]]
 ; SINK-GATHER-NEXT:    [[TMP54:%.*]] = insertelement <8 x i32> [[TMP48]], i32 [[TMP53]], i32 6
 ; SINK-GATHER-NEXT:    br label [[PRED_UDIV_CONTINUE12]]
@@ -422,8 +418,8 @@ define i32 @scalarize_and_sink_gather(i32* %a, i1 %c, i32 %x, i64 %n) {
 ; SINK-GATHER-NEXT:    br i1 [[TMP57]], label [[PRED_UDIV_IF13:%.*]], label [[PRED_UDIV_CONTINUE14]]
 ; SINK-GATHER:       pred.udiv.if13:
 ; SINK-GATHER-NEXT:    [[TMP58:%.*]] = extractelement <8 x i64> [[TMP0]], i32 7
-; SINK-GATHER-NEXT:    [[TMP59:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP58]]
-; SINK-GATHER-NEXT:    [[TMP60:%.*]] = load i32, i32* [[TMP59]], align 4
+; SINK-GATHER-NEXT:    [[TMP59:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP58]]
+; SINK-GATHER-NEXT:    [[TMP60:%.*]] = load i32, ptr [[TMP59]], align 4
 ; SINK-GATHER-NEXT:    [[TMP61:%.*]] = udiv i32 [[TMP60]], [[X]]
 ; SINK-GATHER-NEXT:    [[TMP62:%.*]] = insertelement <8 x i32> [[TMP56]], i32 [[TMP61]], i32 7
 ; SINK-GATHER-NEXT:    br label [[PRED_UDIV_CONTINUE14]]
@@ -451,8 +447,8 @@ define i32 @scalarize_and_sink_gather(i32* %a, i1 %c, i32 %x, i64 %n) {
 ; SINK-GATHER-NEXT:    [[I7:%.*]] = mul i64 [[I]], 777
 ; SINK-GATHER-NEXT:    br i1 [[C]], label [[IF_THEN:%.*]], label [[FOR_INC]]
 ; SINK-GATHER:       if.then:
-; SINK-GATHER-NEXT:    [[T0:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[I7]]
-; SINK-GATHER-NEXT:    [[T2:%.*]] = load i32, i32* [[T0]], align 4
+; SINK-GATHER-NEXT:    [[T0:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[I7]]
+; SINK-GATHER-NEXT:    [[T2:%.*]] = load i32, ptr [[T0]], align 4
 ; SINK-GATHER-NEXT:    [[T4:%.*]] = udiv i32 [[T2]], [[X]]
 ; SINK-GATHER-NEXT:    br label [[FOR_INC]]
 ; SINK-GATHER:       for.inc:
@@ -475,8 +471,8 @@ for.body:
   br i1 %c, label %if.then, label %for.inc
 
 if.then:
-  %t0 = getelementptr inbounds i32, i32* %a, i64 %i7
-  %t2 = load i32, i32* %t0, align 4
+  %t0 = getelementptr inbounds i32, ptr %a, i64 %i7
+  %t2 = load i32, ptr %t0, align 4
   %t4 = udiv i32 %t2, %x
   br label %for.inc
 

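For context, the conversion pattern throughout these tests is mechanical:
every typed pointer becomes the opaque ptr type, and bitcasts that existed
only to retype pointers disappear. A minimal sketch, not excerpted from any
test above:

; Typed pointers: the pointee type is repeated on every operation, and
; casting between pointer types needs an explicit bitcast.
define void @typed(i32* %a) {
  %cast = bitcast i32* %a to <4 x i32>*
  store <4 x i32> zeroinitializer, <4 x i32>* %cast, align 4
  ret void
}

; Opaque pointers: ptr carries no pointee type, so the bitcast vanishes and
; only the value types on load/store/getelementptr remain.
define void @opaque(ptr %a) {
  store <4 x i32> zeroinitializer, ptr %a, align 4
  ret void
}

This is also why the updated CHECK lines have fewer TMP values: once the
bitcast instructions drop out, rerunning update_test_checks.py renumbers
everything that follows.
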
diff  --git a/llvm/test/Transforms/LoopVectorize/dont-fold-tail-for-const-TC.ll b/llvm/test/Transforms/LoopVectorize/dont-fold-tail-for-const-TC.ll
index 5b73e6222efc..cfdb2c81428d 100644
--- a/llvm/test/Transforms/LoopVectorize/dont-fold-tail-for-const-TC.ll
+++ b/llvm/test/Transforms/LoopVectorize/dont-fold-tail-for-const-TC.ll
@@ -6,7 +6,7 @@ target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16
 ; Make sure the loop is vectorized and unrolled under -Os without folding its
 ; tail, based on its trip-count being provably divisible by the chosen VFxIC.
 
-define dso_local void @constTC(i32* noalias nocapture %A) optsize {
+define dso_local void @constTC(ptr noalias nocapture %A) optsize {
 ; CHECK-LABEL: @constTC(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
@@ -17,21 +17,18 @@ define dso_local void @constTC(i32* noalias nocapture %A) optsize {
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[INDEX]], 2
 ; CHECK-NEXT:    [[TMP2:%.*]] = add i32 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[TMP0]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[TMP1]]
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[TMP2]]
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP3]], i32 0
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <2 x i32>*
-; CHECK-NEXT:    store <2 x i32> <i32 13, i32 13>, <2 x i32>* [[TMP7]], align 1
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP3]], i32 2
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <2 x i32>*
-; CHECK-NEXT:    store <2 x i32> <i32 13, i32 13>, <2 x i32>* [[TMP9]], align 1
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP3]], i32 4
-; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <2 x i32>*
-; CHECK-NEXT:    store <2 x i32> <i32 13, i32 13>, <2 x i32>* [[TMP11]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[TMP0]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[TMP1]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[TMP2]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 0
+; CHECK-NEXT:    store <2 x i32> <i32 13, i32 13>, ptr [[TMP6]], align 1
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 2
+; CHECK-NEXT:    store <2 x i32> <i32 13, i32 13>, ptr [[TMP7]], align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 4
+; CHECK-NEXT:    store <2 x i32> <i32 13, i32 13>, ptr [[TMP8]], align 1
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 6
-; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1800
-; CHECK-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]]
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1800
+; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 1800, 1800
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
@@ -40,11 +37,11 @@ define dso_local void @constTC(i32* noalias nocapture %A) optsize {
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[RIV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[RIVPLUS1:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[RIV]]
-; CHECK-NEXT:    store i32 13, i32* [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[RIV]]
+; CHECK-NEXT:    store i32 13, ptr [[ARRAYIDX]], align 1
 ; CHECK-NEXT:    [[RIVPLUS1]] = add nuw nsw i32 [[RIV]], 1
 ; CHECK-NEXT:    [[COND:%.*]] = icmp eq i32 [[RIVPLUS1]], 1800
-; CHECK-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], [[LOOP2:!llvm.loop !.*]]
+; CHECK-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP2:![0-9]+]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
 ;
@@ -53,8 +50,8 @@ entry:
 
 loop:
   %riv = phi i32 [ 0, %entry ], [ %rivPlus1, %loop ]
-  %arrayidx = getelementptr inbounds i32, i32* %A, i32 %riv
-  store i32 13, i32* %arrayidx, align 1
+  %arrayidx = getelementptr inbounds i32, ptr %A, i32 %riv
+  store i32 13, ptr %arrayidx, align 1
   %rivPlus1 = add nuw nsw i32 %riv, 1
   %cond = icmp eq i32 %rivPlus1, 1800
   br i1 %cond, label %exit, label %loop

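The @constTC checks above make the divisibility reasoning concrete: the
vectorizer picks VF=2 interleaved by IC=3 (three <2 x i32> stores per
iteration, with the induction stepping by 6), and the constant trip count
satisfies

  1800 mod (VF x IC) = 1800 mod 6 = 0

so no scalar tail iterations can remain, the middle-block compare folds to
icmp eq i32 1800, 1800, and tail folding is unnecessary.
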
diff  --git a/llvm/test/Transforms/LoopVectorize/dont-fold-tail-for-divisible-TC.ll b/llvm/test/Transforms/LoopVectorize/dont-fold-tail-for-divisible-TC.ll
index baeab28e5ad8..44066a9ca879 100644
--- a/llvm/test/Transforms/LoopVectorize/dont-fold-tail-for-divisible-TC.ll
+++ b/llvm/test/Transforms/LoopVectorize/dont-fold-tail-for-divisible-TC.ll
@@ -6,7 +6,7 @@ target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16
 ; Make sure the loop is vectorized under -Os without folding its tail, based
 ; on its trip-count's lower bits being known to be zero.
 
-define dso_local void @alignTC(i32* noalias nocapture %A, i32 %n) optsize {
+define dso_local void @alignTC(ptr noalias nocapture %A, i32 %n) optsize {
 ; CHECK-LABEL: @alignTC(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[ALIGNEDTC:%.*]] = and i32 [[N:%.*]], -8
@@ -19,13 +19,12 @@ define dso_local void @alignTC(i32* noalias nocapture %A, i32 %n) optsize {
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
-; CHECK-NEXT:    store <4 x i32> <i32 13, i32 13, i32 13, i32 13>, <4 x i32>* [[TMP3]], align 1
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
+; CHECK-NEXT:    store <4 x i32> <i32 13, i32 13, i32 13, i32 13>, ptr [[TMP2]], align 1
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]]
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[ALIGNEDTC]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
@@ -34,11 +33,11 @@ define dso_local void @alignTC(i32* noalias nocapture %A, i32 %n) optsize {
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[RIV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[RIVPLUS1:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[RIV]]
-; CHECK-NEXT:    store i32 13, i32* [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[RIV]]
+; CHECK-NEXT:    store i32 13, ptr [[ARRAYIDX]], align 1
 ; CHECK-NEXT:    [[RIVPLUS1]] = add nuw nsw i32 [[RIV]], 1
 ; CHECK-NEXT:    [[COND:%.*]] = icmp eq i32 [[RIVPLUS1]], [[ALIGNEDTC]]
-; CHECK-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], [[LOOP2:!llvm.loop !.*]]
+; CHECK-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP2:![0-9]+]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
 ;
@@ -48,8 +47,8 @@ entry:
 
 loop:
   %riv = phi i32 [ 0, %entry ], [ %rivPlus1, %loop ]
-  %arrayidx = getelementptr inbounds i32, i32* %A, i32 %riv
-  store i32 13, i32* %arrayidx, align 1
+  %arrayidx = getelementptr inbounds i32, ptr %A, i32 %riv
+  store i32 13, ptr %arrayidx, align 1
   %rivPlus1 = add nuw nsw i32 %riv, 1
   %cond = icmp eq i32 %rivPlus1, %alignedTC
   br i1 %cond, label %exit, label %loop
@@ -61,7 +60,7 @@ exit:
 ; Make sure the loop is vectorized under -Os without folding its tail, based
 ; on its trip-count's lower bits being assumed to be zero.
 
-define dso_local void @assumeAlignedTC(i32* noalias nocapture %A, i32 %p, i32 %q) optsize {
+define dso_local void @assumeAlignedTC(ptr noalias nocapture %A, i32 %p, i32 %q) optsize {
 ; CHECK-LABEL: @assumeAlignedTC(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[AND1:%.*]] = and i32 [[P:%.*]], 3
@@ -85,13 +84,12 @@ define dso_local void @assumeAlignedTC(i32* noalias nocapture %A, i32 %p, i32 %q
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
-; CHECK-NEXT:    store <4 x i32> <i32 13, i32 13, i32 13, i32 13>, <4 x i32>* [[TMP3]], align 1
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
+; CHECK-NEXT:    store <4 x i32> <i32 13, i32 13, i32 13, i32 13>, ptr [[TMP2]], align 1
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP4:!llvm.loop !.*]]
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
@@ -100,11 +98,11 @@ define dso_local void @assumeAlignedTC(i32* noalias nocapture %A, i32 %p, i32 %q
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[RIV:%.*]] = phi i32 [ [[RIVPLUS1:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[RIV]]
-; CHECK-NEXT:    store i32 13, i32* [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[RIV]]
+; CHECK-NEXT:    store i32 13, ptr [[ARRAYIDX]], align 1
 ; CHECK-NEXT:    [[RIVPLUS1]] = add nuw nsw i32 [[RIV]], 1
 ; CHECK-NEXT:    [[COND:%.*]] = icmp eq i32 [[RIVPLUS1]], [[N]]
-; CHECK-NEXT:    br i1 [[COND]], label [[EXIT_LOOPEXIT]], label [[LOOP]], [[LOOP5:!llvm.loop !.*]]
+; CHECK-NEXT:    br i1 [[COND]], label [[EXIT_LOOPEXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
 ; CHECK:       exit.loopexit:
 ; CHECK-NEXT:    br label [[EXIT]]
 ; CHECK:       exit:
@@ -126,8 +124,8 @@ guarded:
 
 loop:
   %riv = phi i32 [ 0, %guarded ], [ %rivPlus1, %loop ]
-  %arrayidx = getelementptr inbounds i32, i32* %A, i32 %riv
-  store i32 13, i32* %arrayidx, align 1
+  %arrayidx = getelementptr inbounds i32, ptr %A, i32 %riv
+  store i32 13, ptr %arrayidx, align 1
   %rivPlus1 = add nuw nsw i32 %riv, 1
   %cond = icmp eq i32 %rivPlus1, %n
   br i1 %cond, label %exit, label %loop
@@ -139,7 +137,7 @@ exit:
 ; Make sure the loop's tail is folded when vectorized under -Os, based on its
 ; trip-count's not being provably divisible by the chosen VF.
 
-define dso_local void @cannotProveAlignedTC(i32* noalias nocapture %A, i32 %p, i32 %q) optsize {
+define dso_local void @cannotProveAlignedTC(ptr noalias nocapture %A, i32 %p, i32 %q) optsize {
 ; CHECK-LABEL: @cannotProveAlignedTC(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[AND1:%.*]] = and i32 [[P:%.*]], 3
@@ -171,38 +169,38 @@ define dso_local void @cannotProveAlignedTC(i32* noalias nocapture %A, i32 %p, i
 ; CHECK-NEXT:    br i1 [[TMP1]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
 ; CHECK:       pred.store.if:
 ; CHECK-NEXT:    [[TMP2:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[TMP2]]
-; CHECK-NEXT:    store i32 13, i32* [[TMP3]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[TMP2]]
+; CHECK-NEXT:    store i32 13, ptr [[TMP3]], align 1
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE]]
 ; CHECK:       pred.store.continue:
 ; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x i1> [[TMP0]], i32 1
 ; CHECK-NEXT:    br i1 [[TMP4]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2:%.*]]
 ; CHECK:       pred.store.if1:
 ; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[INDEX]], 1
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[TMP5]]
-; CHECK-NEXT:    store i32 13, i32* [[TMP6]], align 1
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[TMP5]]
+; CHECK-NEXT:    store i32 13, ptr [[TMP6]], align 1
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE2]]
 ; CHECK:       pred.store.continue2:
 ; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x i1> [[TMP0]], i32 2
 ; CHECK-NEXT:    br i1 [[TMP7]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4:%.*]]
 ; CHECK:       pred.store.if3:
 ; CHECK-NEXT:    [[TMP8:%.*]] = add i32 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[TMP8]]
-; CHECK-NEXT:    store i32 13, i32* [[TMP9]], align 1
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[TMP8]]
+; CHECK-NEXT:    store i32 13, ptr [[TMP9]], align 1
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE4]]
 ; CHECK:       pred.store.continue4:
 ; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <4 x i1> [[TMP0]], i32 3
 ; CHECK-NEXT:    br i1 [[TMP10]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6]]
 ; CHECK:       pred.store.if5:
 ; CHECK-NEXT:    [[TMP11:%.*]] = add i32 [[INDEX]], 3
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[TMP11]]
-; CHECK-NEXT:    store i32 13, i32* [[TMP12]], align 1
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[TMP11]]
+; CHECK-NEXT:    store i32 13, ptr [[TMP12]], align 1
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE6]]
 ; CHECK:       pred.store.continue6:
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
 ; CHECK-NEXT:    [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP6:!llvm.loop !.*]]
+; CHECK-NEXT:    br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -210,11 +208,11 @@ define dso_local void @cannotProveAlignedTC(i32* noalias nocapture %A, i32 %p, i
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[RIV:%.*]] = phi i32 [ [[RIVPLUS1:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[RIV]]
-; CHECK-NEXT:    store i32 13, i32* [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[RIV]]
+; CHECK-NEXT:    store i32 13, ptr [[ARRAYIDX]], align 1
 ; CHECK-NEXT:    [[RIVPLUS1]] = add nuw nsw i32 [[RIV]], 1
 ; CHECK-NEXT:    [[COND:%.*]] = icmp eq i32 [[RIVPLUS1]], [[N]]
-; CHECK-NEXT:    br i1 [[COND]], label [[EXIT_LOOPEXIT]], label [[LOOP]], [[LOOP7:!llvm.loop !.*]]
+; CHECK-NEXT:    br i1 [[COND]], label [[EXIT_LOOPEXIT]], label [[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
 ; CHECK:       exit.loopexit:
 ; CHECK-NEXT:    br label [[EXIT]]
 ; CHECK:       exit:
@@ -236,8 +234,8 @@ guarded:
 
 loop:
   %riv = phi i32 [ 0, %guarded ], [ %rivPlus1, %loop ]
-  %arrayidx = getelementptr inbounds i32, i32* %A, i32 %riv
-  store i32 13, i32* %arrayidx, align 1
+  %arrayidx = getelementptr inbounds i32, ptr %A, i32 %riv
+  store i32 13, ptr %arrayidx, align 1
   %rivPlus1 = add nuw nsw i32 %riv, 1
   %cond = icmp eq i32 %rivPlus1, %n
   br i1 %cond, label %exit, label %loop

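The three tests in this file differ only in what is known about the trip
count. In @alignTC the mask makes divisibility provable:

  %alignedTC = and i32 %n, -8   ; low 3 bits cleared, so a multiple of 8 (and hence of VF=4)

@assumeAlignedTC reaches the same conclusion from the lower bits being
assumed zero (per its comment) rather than computed, while
@cannotProveAlignedTC has no such guarantee, so its vector body predicates
every store (the pred.store.if blocks above) and its middle block branches
unconditionally, via br i1 true, to the exit.
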
diff  --git a/llvm/test/Transforms/LoopVectorize/epilog-vectorization-reductions.ll b/llvm/test/Transforms/LoopVectorize/epilog-vectorization-reductions.ll
index 0ca133d9e25a..57e3fa87c3cf 100644
--- a/llvm/test/Transforms/LoopVectorize/epilog-vectorization-reductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/epilog-vectorization-reductions.ll
@@ -4,7 +4,7 @@
 ;
 ; Integer reduction with a start value of 5
 ;
-define i64 @int_reduction_add(i64* %a, i64 %N) {
+define i64 @int_reduction_add(ptr %a, i64 %N) {
 ; CHECK-LABEL: @int_reduction_add(
 ; CHECK-NEXT:  iter.check:
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4
@@ -18,18 +18,17 @@ define i64 @int_reduction_add(i64* %a, i64 %N) {
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i64> [ <i64 5, i64 0, i64 0, i64 0>, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i64> [ <i64 5, i64 0, i64 0, i64 0>, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i64, i64* [[A:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, i64* [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i64* [[TMP2]] to <4 x i64>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i64>, <4 x i64>* [[TMP3]], align 4
-; CHECK-NEXT:    [[TMP4]] = add <4 x i64> [[WIDE_LOAD]], [[VEC_PHI]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 4
+; CHECK-NEXT:    [[TMP3]] = add <4 x i64> [[WIDE_LOAD]], [[VEC_PHI]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP4]])
+; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP3]])
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
 ; CHECK:       vec.epilog.iter.check:
@@ -37,43 +36,42 @@ define i64 @int_reduction_add(i64* %a, i64 %N) {
 ; CHECK-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4
 ; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
 ; CHECK:       vec.epilog.ph:
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i64 [ 5, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ], [ [[TMP6]], [[VEC_EPILOG_ITER_CHECK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i64 [ 5, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ], [ [[TMP5]], [[VEC_EPILOG_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[N_MOD_VF2:%.*]] = urem i64 [[N]], 4
 ; CHECK-NEXT:    [[N_VEC3:%.*]] = sub i64 [[N]], [[N_MOD_VF2]]
-; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x i64> zeroinitializer, i64 [[BC_MERGE_RDX]], i32 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x i64> zeroinitializer, i64 [[BC_MERGE_RDX]], i32 0
 ; CHECK-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
 ; CHECK:       vec.epilog.vector.body:
-; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT8:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI6:%.*]] = phi <4 x i64> [ [[TMP7]], [[VEC_EPILOG_PH]] ], [ [[TMP12:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP8:%.*]] = add i64 [[OFFSET_IDX]], 0
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[TMP8]]
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i64, i64* [[TMP9]], i32 0
-; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i64* [[TMP10]] to <4 x i64>*
-; CHECK-NEXT:    [[WIDE_LOAD7:%.*]] = load <4 x i64>, <4 x i64>* [[TMP11]], align 4
-; CHECK-NEXT:    [[TMP12]] = add <4 x i64> [[WIDE_LOAD7]], [[VEC_PHI6]]
-; CHECK-NEXT:    [[INDEX_NEXT8]] = add nuw i64 [[OFFSET_IDX]], 4
-; CHECK-NEXT:    [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT8]], [[N_VEC3]]
-; CHECK-NEXT:    br i1 [[TMP13]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK-NEXT:    [[INDEX5:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT8:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI6:%.*]] = phi <4 x i64> [ [[TMP6]], [[VEC_EPILOG_PH]] ], [ [[TMP10:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP7:%.*]] = add i64 [[INDEX5]], 0
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD7:%.*]] = load <4 x i64>, ptr [[TMP9]], align 4
+; CHECK-NEXT:    [[TMP10]] = add <4 x i64> [[WIDE_LOAD7]], [[VEC_PHI6]]
+; CHECK-NEXT:    [[INDEX_NEXT8]] = add nuw i64 [[INDEX5]], 4
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT8]], [[N_VEC3]]
+; CHECK-NEXT:    br i1 [[TMP11]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
 ; CHECK:       vec.epilog.middle.block:
-; CHECK-NEXT:    [[TMP14:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP12]])
+; CHECK-NEXT:    [[TMP12:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP10]])
 ; CHECK-NEXT:    [[CMP_N4:%.*]] = icmp eq i64 [[N]], [[N_VEC3]]
 ; CHECK-NEXT:    br i1 [[CMP_N4]], label [[FOR_END]], label [[VEC_EPILOG_SCALAR_PH]]
 ; CHECK:       vec.epilog.scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC3]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX9:%.*]] = phi i64 [ 5, [[ITER_CHECK]] ], [ [[TMP6]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[TMP14]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX9:%.*]] = phi i64 [ 5, [[ITER_CHECK]] ], [ [[TMP5]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[TMP12]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
 ; CHECK-NEXT:    [[SUM:%.*]] = phi i64 [ [[BC_MERGE_RDX9]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[IV]]
-; CHECK-NEXT:    [[TMP15:%.*]] = load i64, i64* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[ADD]] = add i64 [[TMP15]], [[SUM]]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
+; CHECK-NEXT:    [[TMP13:%.*]] = load i64, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[ADD]] = add i64 [[TMP13]], [[SUM]]
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
 ; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK:       for.end:
-; CHECK-NEXT:    [[ADD_LCSSA:%.*]] = phi i64 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ], [ [[TMP14]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[ADD_LCSSA:%.*]] = phi i64 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP5]], [[MIDDLE_BLOCK]] ], [ [[TMP12]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    ret i64 [[ADD_LCSSA]]
 ;
 entry:
@@ -82,8 +80,8 @@ entry:
 for.body:
   %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
   %sum = phi i64 [ 5, %entry ], [ %add, %for.body ]
-  %arrayidx = getelementptr inbounds i64, i64* %a, i64 %iv
-  %0 = load i64, i64* %arrayidx
+  %arrayidx = getelementptr inbounds i64, ptr %a, i64 %iv
+  %0 = load i64, ptr %arrayidx
   %add = add i64 %0, %sum
   %iv.next = add nuw nsw i64 %iv, 1
   %exitcond.not = icmp eq i64 %iv.next, %N
@@ -96,7 +94,7 @@ for.end:
 ;
 ; Floating point max reduction
 ;
-define float @fp_reduction_max(float* noalias %a, i64 %N) {
+define float @fp_reduction_max(ptr noalias %a, i64 %N) {
 ; CHECK-LABEL: @fp_reduction_max(
 ; CHECK-NEXT:  iter.check:
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4
@@ -110,19 +108,18 @@ define float @fp_reduction_max(float* noalias %a, i64 %N) {
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x float> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x float> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, float* [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float* [[TMP2]] to <4 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP3]], align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = fcmp fast ogt <4 x float> [[VEC_PHI]], [[WIDE_LOAD]]
-; CHECK-NEXT:    [[TMP5]] = select <4 x i1> [[TMP4]], <4 x float> [[VEC_PHI]], <4 x float> [[WIDE_LOAD]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP2]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = fcmp fast ogt <4 x float> [[VEC_PHI]], [[WIDE_LOAD]]
+; CHECK-NEXT:    [[TMP4]] = select <4 x i1> [[TMP3]], <4 x float> [[VEC_PHI]], <4 x float> [[WIDE_LOAD]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.vector.reduce.fmax.v4f32(<4 x float> [[TMP5]])
+; CHECK-NEXT:    [[TMP6:%.*]] = call fast float @llvm.vector.reduce.fmax.v4f32(<4 x float> [[TMP4]])
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
 ; CHECK:       vec.epilog.iter.check:
@@ -130,7 +127,7 @@ define float @fp_reduction_max(float* noalias %a, i64 %N) {
 ; CHECK-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4
 ; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
 ; CHECK:       vec.epilog.ph:
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ], [ [[TMP7]], [[VEC_EPILOG_ITER_CHECK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ], [ [[TMP6]], [[VEC_EPILOG_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[N_MOD_VF2:%.*]] = urem i64 [[N]], 4
 ; CHECK-NEXT:    [[N_VEC3:%.*]] = sub i64 [[N]], [[N_MOD_VF2]]
@@ -138,38 +135,37 @@ define float @fp_reduction_max(float* noalias %a, i64 %N) {
 ; CHECK-NEXT:    [[MINMAX_IDENT_SPLAT:%.*]] = shufflevector <4 x float> [[MINMAX_IDENT_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
 ; CHECK:       vec.epilog.vector.body:
-; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT8:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI6:%.*]] = phi <4 x float> [ [[MINMAX_IDENT_SPLAT]], [[VEC_EPILOG_PH]] ], [ [[TMP13:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP8:%.*]] = add i64 [[OFFSET_IDX]], 0
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP8]]
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP9]], i32 0
-; CHECK-NEXT:    [[TMP11:%.*]] = bitcast float* [[TMP10]] to <4 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD7:%.*]] = load <4 x float>, <4 x float>* [[TMP11]], align 4
-; CHECK-NEXT:    [[TMP12:%.*]] = fcmp fast ogt <4 x float> [[VEC_PHI6]], [[WIDE_LOAD7]]
-; CHECK-NEXT:    [[TMP13]] = select <4 x i1> [[TMP12]], <4 x float> [[VEC_PHI6]], <4 x float> [[WIDE_LOAD7]]
-; CHECK-NEXT:    [[INDEX_NEXT8]] = add nuw i64 [[OFFSET_IDX]], 4
-; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT8]], [[N_VEC3]]
-; CHECK-NEXT:    br i1 [[TMP14]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT:    [[INDEX5:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT8:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI6:%.*]] = phi <4 x float> [ [[MINMAX_IDENT_SPLAT]], [[VEC_EPILOG_PH]] ], [ [[TMP11:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP7:%.*]] = add i64 [[INDEX5]], 0
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP8]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD7:%.*]] = load <4 x float>, ptr [[TMP9]], align 4
+; CHECK-NEXT:    [[TMP10:%.*]] = fcmp fast ogt <4 x float> [[VEC_PHI6]], [[WIDE_LOAD7]]
+; CHECK-NEXT:    [[TMP11]] = select <4 x i1> [[TMP10]], <4 x float> [[VEC_PHI6]], <4 x float> [[WIDE_LOAD7]]
+; CHECK-NEXT:    [[INDEX_NEXT8]] = add nuw i64 [[INDEX5]], 4
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT8]], [[N_VEC3]]
+; CHECK-NEXT:    br i1 [[TMP12]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK:       vec.epilog.middle.block:
-; CHECK-NEXT:    [[TMP15:%.*]] = call fast float @llvm.vector.reduce.fmax.v4f32(<4 x float> [[TMP13]])
+; CHECK-NEXT:    [[TMP13:%.*]] = call fast float @llvm.vector.reduce.fmax.v4f32(<4 x float> [[TMP11]])
 ; CHECK-NEXT:    [[CMP_N4:%.*]] = icmp eq i64 [[N]], [[N_VEC3]]
 ; CHECK-NEXT:    br i1 [[CMP_N4]], label [[FOR_END]], label [[VEC_EPILOG_SCALAR_PH]]
 ; CHECK:       vec.epilog.scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC3]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX9:%.*]] = phi float [ 0.000000e+00, [[ITER_CHECK]] ], [ [[TMP7]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[TMP15]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX9:%.*]] = phi float [ 0.000000e+00, [[ITER_CHECK]] ], [ [[TMP6]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[TMP13]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ]
 ; CHECK-NEXT:    [[RESULT_08:%.*]] = phi float [ [[V0:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX9]], [[VEC_EPILOG_SCALAR_PH]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[L0:%.*]] = load float, float* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[L0:%.*]] = load float, ptr [[ARRAYIDX]], align 4
 ; CHECK-NEXT:    [[C0:%.*]] = fcmp fast ogt float [[RESULT_08]], [[L0]]
 ; CHECK-NEXT:    [[V0]] = select fast i1 [[C0]], float [[RESULT_08]], float [[L0]]
 ; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
 ; CHECK:       for.end:
-; CHECK-NEXT:    [[V0_LCSSA:%.*]] = phi float [ [[V0]], [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ], [ [[TMP15]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[V0_LCSSA:%.*]] = phi float [ [[V0]], [[FOR_BODY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ], [ [[TMP13]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    ret float [[V0_LCSSA]]
 ;
 entry:
@@ -178,8 +174,8 @@ entry:
 for.body:
   %indvars.iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ]
   %result.08 = phi float [ %v0, %for.body ], [ 0.000000e+00, %entry ]
-  %arrayidx = getelementptr inbounds float, float* %a, i64 %indvars.iv
-  %l0 = load float, float* %arrayidx
+  %arrayidx = getelementptr inbounds float, ptr %a, i64 %indvars.iv
+  %l0 = load float, ptr %arrayidx
   %c0 = fcmp fast ogt float %result.08, %l0
   %v0 = select fast i1 %c0, float %result.08, float %l0
   %iv.next = add i64 %indvars.iv, 1
@@ -193,7 +189,7 @@ for.end:
 ;
 ; Extension is required before the reduction operation & result is truncated
 ;
-define i16 @reduction_or_trunc(i16* noalias nocapture %ptr) {
+define i16 @reduction_or_trunc(ptr noalias nocapture %ptr) {
 ; CHECK-LABEL: @reduction_or_trunc(
 ; CHECK-NEXT:  iter.check:
 ; CHECK-NEXT:    br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
@@ -203,72 +199,70 @@ define i16 @reduction_or_trunc(i16* noalias nocapture %ptr) {
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = and <4 x i32> [[VEC_PHI]], <i32 65535, i32 65535, i32 65535, i32 65535>
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i16, i16* [[PTR:%.*]], i32 [[TMP0]]
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i16, i16* [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i16* [[TMP3]] to <4 x i16>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i16>, <4 x i16>* [[TMP4]], align 2
-; CHECK-NEXT:    [[TMP5:%.*]] = zext <4 x i16> [[WIDE_LOAD]] to <4 x i32>
-; CHECK-NEXT:    [[TMP6:%.*]] = or <4 x i32> [[TMP1]], [[TMP5]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i16, ptr [[PTR:%.*]], i32 [[TMP0]]
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i16, ptr [[TMP2]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i16>, ptr [[TMP3]], align 2
+; CHECK-NEXT:    [[TMP4:%.*]] = zext <4 x i16> [[WIDE_LOAD]] to <4 x i32>
+; CHECK-NEXT:    [[TMP5:%.*]] = or <4 x i32> [[TMP1]], [[TMP4]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256
-; CHECK-NEXT:    [[TMP8:%.*]] = trunc <4 x i32> [[TMP6]] to <4 x i16>
-; CHECK-NEXT:    [[TMP9]] = zext <4 x i16> [[TMP8]] to <4 x i32>
-; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256
+; CHECK-NEXT:    [[TMP7:%.*]] = trunc <4 x i32> [[TMP5]] to <4 x i16>
+; CHECK-NEXT:    [[TMP8]] = zext <4 x i16> [[TMP7]] to <4 x i32>
+; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP10:%.*]] = trunc <4 x i32> [[TMP9]] to <4 x i16>
-; CHECK-NEXT:    [[TMP11:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP10]])
-; CHECK-NEXT:    [[TMP12:%.*]] = zext i16 [[TMP11]] to i32
+; CHECK-NEXT:    [[TMP9:%.*]] = trunc <4 x i32> [[TMP8]] to <4 x i16>
+; CHECK-NEXT:    [[TMP10:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP9]])
+; CHECK-NEXT:    [[TMP11:%.*]] = zext i16 [[TMP10]] to i32
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 256, 256
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
 ; CHECK:       vec.epilog.iter.check:
 ; CHECK-NEXT:    br i1 true, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
 ; CHECK:       vec.epilog.ph:
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ], [ [[TMP12]], [[VEC_EPILOG_ITER_CHECK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ], [ [[TMP11]], [[VEC_EPILOG_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i32 [ 256, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
-; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX]], i32 0
+; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX]], i32 0
 ; CHECK-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
 ; CHECK:       vec.epilog.vector.body:
-; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = phi i32 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT5:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI3:%.*]] = phi <4 x i32> [ [[TMP13]], [[VEC_EPILOG_PH]] ], [ [[TMP23:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP14:%.*]] = add i32 [[OFFSET_IDX]], 0
-; CHECK-NEXT:    [[TMP15:%.*]] = and <4 x i32> [[VEC_PHI3]], <i32 65535, i32 65535, i32 65535, i32 65535>
-; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i16, i16* [[PTR]], i32 [[TMP14]]
-; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i16, i16* [[TMP16]], i32 0
-; CHECK-NEXT:    [[TMP18:%.*]] = bitcast i16* [[TMP17]] to <4 x i16>*
-; CHECK-NEXT:    [[WIDE_LOAD4:%.*]] = load <4 x i16>, <4 x i16>* [[TMP18]], align 2
-; CHECK-NEXT:    [[TMP19:%.*]] = zext <4 x i16> [[WIDE_LOAD4]] to <4 x i32>
-; CHECK-NEXT:    [[TMP20:%.*]] = or <4 x i32> [[TMP15]], [[TMP19]]
-; CHECK-NEXT:    [[INDEX_NEXT5]] = add nuw i32 [[OFFSET_IDX]], 4
-; CHECK-NEXT:    [[TMP21:%.*]] = icmp eq i32 [[INDEX_NEXT5]], 256
-; CHECK-NEXT:    [[TMP22:%.*]] = trunc <4 x i32> [[TMP20]] to <4 x i16>
-; CHECK-NEXT:    [[TMP23]] = zext <4 x i16> [[TMP22]] to <4 x i32>
-; CHECK-NEXT:    br i1 [[TMP21]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK-NEXT:    [[INDEX2:%.*]] = phi i32 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT5:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI3:%.*]] = phi <4 x i32> [ [[TMP12]], [[VEC_EPILOG_PH]] ], [ [[TMP21:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP13:%.*]] = add i32 [[INDEX2]], 0
+; CHECK-NEXT:    [[TMP14:%.*]] = and <4 x i32> [[VEC_PHI3]], <i32 65535, i32 65535, i32 65535, i32 65535>
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i16, ptr [[PTR]], i32 [[TMP13]]
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i16, ptr [[TMP15]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD4:%.*]] = load <4 x i16>, ptr [[TMP16]], align 2
+; CHECK-NEXT:    [[TMP17:%.*]] = zext <4 x i16> [[WIDE_LOAD4]] to <4 x i32>
+; CHECK-NEXT:    [[TMP18:%.*]] = or <4 x i32> [[TMP14]], [[TMP17]]
+; CHECK-NEXT:    [[INDEX_NEXT5]] = add nuw i32 [[INDEX2]], 4
+; CHECK-NEXT:    [[TMP19:%.*]] = icmp eq i32 [[INDEX_NEXT5]], 256
+; CHECK-NEXT:    [[TMP20:%.*]] = trunc <4 x i32> [[TMP18]] to <4 x i16>
+; CHECK-NEXT:    [[TMP21]] = zext <4 x i16> [[TMP20]] to <4 x i32>
+; CHECK-NEXT:    br i1 [[TMP19]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
 ; CHECK:       vec.epilog.middle.block:
-; CHECK-NEXT:    [[TMP24:%.*]] = trunc <4 x i32> [[TMP23]] to <4 x i16>
-; CHECK-NEXT:    [[TMP25:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP24]])
-; CHECK-NEXT:    [[TMP26:%.*]] = zext i16 [[TMP25]] to i32
+; CHECK-NEXT:    [[TMP22:%.*]] = trunc <4 x i32> [[TMP21]] to <4 x i16>
+; CHECK-NEXT:    [[TMP23:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP22]])
+; CHECK-NEXT:    [[TMP24:%.*]] = zext i16 [[TMP23]] to i32
 ; CHECK-NEXT:    [[CMP_N1:%.*]] = icmp eq i32 256, 256
 ; CHECK-NEXT:    br i1 [[CMP_N1]], label [[FOR_END]], label [[VEC_EPILOG_SCALAR_PH]]
 ; CHECK:       vec.epilog.scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 256, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 256, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX6:%.*]] = phi i32 [ 0, [[ITER_CHECK]] ], [ [[TMP12]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[TMP26]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX6:%.*]] = phi i32 [ 0, [[ITER_CHECK]] ], [ [[TMP11]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[TMP24]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ]
 ; CHECK-NEXT:    [[SUM_02P:%.*]] = phi i32 [ [[XOR:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX6]], [[VEC_EPILOG_SCALAR_PH]] ]
 ; CHECK-NEXT:    [[SUM_02:%.*]] = and i32 [[SUM_02P]], 65535
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i16, i16* [[PTR]], i32 [[IV]]
-; CHECK-NEXT:    [[LOAD:%.*]] = load i16, i16* [[GEP]], align 2
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i16, ptr [[PTR]], i32 [[IV]]
+; CHECK-NEXT:    [[LOAD:%.*]] = load i16, ptr [[GEP]], align 2
 ; CHECK-NEXT:    [[EXT:%.*]] = zext i16 [[LOAD]] to i32
 ; CHECK-NEXT:    [[XOR]] = or i32 [[SUM_02]], [[EXT]]
 ; CHECK-NEXT:    [[IV_NEXT]] = add i32 [[IV]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[IV_NEXT]], 256
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
 ; CHECK:       for.end:
-; CHECK-NEXT:    [[XOR_LCSSA:%.*]] = phi i32 [ [[XOR]], [[FOR_BODY]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ], [ [[TMP26]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[XOR_LCSSA:%.*]] = phi i32 [ [[XOR]], [[FOR_BODY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ], [ [[TMP24]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    [[RET:%.*]] = trunc i32 [[XOR_LCSSA]] to i16
 ; CHECK-NEXT:    ret i16 [[RET]]
 ;
@@ -279,8 +273,8 @@ for.body:
   %iv = phi i32 [ %iv.next, %for.body ], [ 0, %entry ]
   %sum.02p = phi i32 [ %xor, %for.body ], [ 0, %entry ]
   %sum.02 = and i32 %sum.02p, 65535
-  %gep = getelementptr inbounds i16, i16* %ptr, i32 %iv
-  %load = load i16, i16* %gep
+  %gep = getelementptr inbounds i16, ptr %ptr, i32 %iv
+  %load = load i16, ptr %gep
   %ext = zext i16 %load to i32
   %xor = or i32 %sum.02, %ext
   %iv.next = add i32 %iv, 1
@@ -295,7 +289,7 @@ for.end:
 ;
 ; More than one reduction in the loop
 ;
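 ; Sketch (illustrative): each reduction keeps its own vector accumulator and
 ; is reduced separately after the loop, roughly:
 ;   %sum.next  = fadd fast <4 x float> %sum.phi, %wide.load
 ;   %prod.next = fmul fast <4 x float> %prod.phi, %wide.load
 ;   ...
 ;   %sum  = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> %sum.next)
 ;   %prod = call fast float @llvm.vector.reduce.fmul.v4f32(float 1.000000e+00, <4 x float> %prod.next)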
-define float @multiple_fp_rdx(float* %A, i64 %N) {
+define float @multiple_fp_rdx(ptr %A, i64 %N) {
 ; CHECK-LABEL: @multiple_fp_rdx(
 ; CHECK-NEXT:  iter.check:
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4
@@ -309,21 +303,20 @@ define float @multiple_fp_rdx(float* %A, i64 %N) {
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x float> [ <float 1.500000e+01, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI2:%.*]] = phi <4 x float> [ <float 1.000000e+01, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x float> [ <float 1.500000e+01, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI2:%.*]] = phi <4 x float> [ <float 1.000000e+01, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, float* [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float* [[TMP2]] to <4 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP3]], align 4
-; CHECK-NEXT:    [[TMP4]] = fadd fast <4 x float> [[VEC_PHI2]], [[WIDE_LOAD]]
-; CHECK-NEXT:    [[TMP5]] = fmul fast <4 x float> [[VEC_PHI]], [[WIDE_LOAD]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP2]], align 4
+; CHECK-NEXT:    [[TMP3]] = fadd fast <4 x float> [[VEC_PHI2]], [[WIDE_LOAD]]
+; CHECK-NEXT:    [[TMP4]] = fmul fast <4 x float> [[VEC_PHI]], [[WIDE_LOAD]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP4]])
-; CHECK-NEXT:    [[TMP8:%.*]] = call fast float @llvm.vector.reduce.fmul.v4f32(float 1.000000e+00, <4 x float> [[TMP5]])
+; CHECK-NEXT:    [[TMP6:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP3]])
+; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.vector.reduce.fmul.v4f32(float 1.000000e+00, <4 x float> [[TMP4]])
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
 ; CHECK:       vec.epilog.iter.check:
@@ -331,52 +324,51 @@ define float @multiple_fp_rdx(float* %A, i64 %N) {
 ; CHECK-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4
 ; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
 ; CHECK:       vec.epilog.ph:
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ 1.500000e+01, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ], [ [[TMP8]], [[VEC_EPILOG_ITER_CHECK]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX3:%.*]] = phi float [ 1.000000e+01, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ], [ [[TMP7]], [[VEC_EPILOG_ITER_CHECK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ 1.500000e+01, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ], [ [[TMP7]], [[VEC_EPILOG_ITER_CHECK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX3:%.*]] = phi float [ 1.000000e+01, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ], [ [[TMP6]], [[VEC_EPILOG_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[N_MOD_VF4:%.*]] = urem i64 [[N]], 4
 ; CHECK-NEXT:    [[N_VEC5:%.*]] = sub i64 [[N]], [[N_MOD_VF4]]
-; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, float [[BC_MERGE_RDX]], i32 0
-; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <4 x float> zeroinitializer, float [[BC_MERGE_RDX3]], i32 0
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, float [[BC_MERGE_RDX]], i32 0
+; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <4 x float> zeroinitializer, float [[BC_MERGE_RDX3]], i32 0
 ; CHECK-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
 ; CHECK:       vec.epilog.vector.body:
-; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT11:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI8:%.*]] = phi <4 x float> [ [[TMP9]], [[VEC_EPILOG_PH]] ], [ [[TMP16:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI9:%.*]] = phi <4 x float> [ [[TMP10]], [[VEC_EPILOG_PH]] ], [ [[TMP15:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP11:%.*]] = add i64 [[OFFSET_IDX]], 0
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP11]]
-; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds float, float* [[TMP12]], i32 0
-; CHECK-NEXT:    [[TMP14:%.*]] = bitcast float* [[TMP13]] to <4 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD10:%.*]] = load <4 x float>, <4 x float>* [[TMP14]], align 4
-; CHECK-NEXT:    [[TMP15]] = fadd fast <4 x float> [[VEC_PHI9]], [[WIDE_LOAD10]]
-; CHECK-NEXT:    [[TMP16]] = fmul fast <4 x float> [[VEC_PHI8]], [[WIDE_LOAD10]]
-; CHECK-NEXT:    [[INDEX_NEXT11]] = add nuw i64 [[OFFSET_IDX]], 4
-; CHECK-NEXT:    [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT11]], [[N_VEC5]]
-; CHECK-NEXT:    br i1 [[TMP17]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK-NEXT:    [[INDEX7:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT11:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI8:%.*]] = phi <4 x float> [ [[TMP8]], [[VEC_EPILOG_PH]] ], [ [[TMP14:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI9:%.*]] = phi <4 x float> [ [[TMP9]], [[VEC_EPILOG_PH]] ], [ [[TMP13:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP10:%.*]] = add i64 [[INDEX7]], 0
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds float, ptr [[TMP11]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD10:%.*]] = load <4 x float>, ptr [[TMP12]], align 4
+; CHECK-NEXT:    [[TMP13]] = fadd fast <4 x float> [[VEC_PHI9]], [[WIDE_LOAD10]]
+; CHECK-NEXT:    [[TMP14]] = fmul fast <4 x float> [[VEC_PHI8]], [[WIDE_LOAD10]]
+; CHECK-NEXT:    [[INDEX_NEXT11]] = add nuw i64 [[INDEX7]], 4
+; CHECK-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT11]], [[N_VEC5]]
+; CHECK-NEXT:    br i1 [[TMP15]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
 ; CHECK:       vec.epilog.middle.block:
-; CHECK-NEXT:    [[TMP18:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP15]])
-; CHECK-NEXT:    [[TMP19:%.*]] = call fast float @llvm.vector.reduce.fmul.v4f32(float 1.000000e+00, <4 x float> [[TMP16]])
+; CHECK-NEXT:    [[TMP16:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP13]])
+; CHECK-NEXT:    [[TMP17:%.*]] = call fast float @llvm.vector.reduce.fmul.v4f32(float 1.000000e+00, <4 x float> [[TMP14]])
 ; CHECK-NEXT:    [[CMP_N6:%.*]] = icmp eq i64 [[N]], [[N_VEC5]]
 ; CHECK-NEXT:    br i1 [[CMP_N6]], label [[FOR_END]], label [[VEC_EPILOG_SCALAR_PH]]
 ; CHECK:       vec.epilog.scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC5]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX12:%.*]] = phi float [ 1.500000e+01, [[ITER_CHECK]] ], [ [[TMP8]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[TMP19]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX13:%.*]] = phi float [ 1.000000e+01, [[ITER_CHECK]] ], [ [[TMP7]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[TMP18]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX12:%.*]] = phi float [ 1.500000e+01, [[ITER_CHECK]] ], [ [[TMP7]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[TMP17]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX13:%.*]] = phi float [ 1.000000e+01, [[ITER_CHECK]] ], [ [[TMP6]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[TMP16]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
 ; CHECK-NEXT:    [[PROD:%.*]] = phi float [ [[BC_MERGE_RDX12]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[MUL:%.*]], [[FOR_BODY]] ]
 ; CHECK-NEXT:    [[SUM:%.*]] = phi float [ [[BC_MERGE_RDX13]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]]
-; CHECK-NEXT:    [[TMP20:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[ADD]] = fadd fast float [[SUM]], [[TMP20]]
-; CHECK-NEXT:    [[MUL]] = fmul fast float [[PROD]], [[TMP20]]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]]
+; CHECK-NEXT:    [[TMP18:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[ADD]] = fadd fast float [[SUM]], [[TMP18]]
+; CHECK-NEXT:    [[MUL]] = fmul fast float [[PROD]], [[TMP18]]
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
 ; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
 ; CHECK:       for.end:
-; CHECK-NEXT:    [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ], [ [[TMP18]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    [[MUL_LCSSA:%.*]] = phi float [ [[MUL]], [[FOR_BODY]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ], [ [[TMP19]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ], [ [[TMP16]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[MUL_LCSSA:%.*]] = phi float [ [[MUL]], [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ], [ [[TMP17]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    [[DIV:%.*]] = fdiv float [[MUL_LCSSA]], [[ADD_LCSSA]]
 ; CHECK-NEXT:    ret float [[DIV]]
 ;
@@ -387,8 +379,8 @@ for.body:
   %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
   %prod = phi float [ 15.000000e+00, %entry ], [ %mul, %for.body ]
   %sum = phi float [ 10.000000e+00, %entry ], [ %add, %for.body ]
-  %arrayidx = getelementptr inbounds float, float* %A, i64 %iv
-  %0 = load float, float* %arrayidx
+  %arrayidx = getelementptr inbounds float, ptr %A, i64 %iv
+  %0 = load float, ptr %arrayidx
   %add = fadd fast float %sum, %0
   %mul = fmul fast float %prod, %0
   %iv.next = add nuw nsw i64 %iv, 1
@@ -403,7 +395,7 @@ for.end:
 ;
 ; Start value of the reduction is a Phi node from the outer loop
 ;
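 ; Sketch: because the start value is an outer-loop phi rather than a constant,
 ; the vectorizer seeds lane 0 of the initial accumulator with it, e.g.
 ;   %init = insertelement <4 x i32> zeroinitializer, i32 %start.sum, i32 0
 ; (the remaining lanes hold 0, the identity of the add/sub reduction).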
-define i32 @reduction_phi_start_val(i32* %A, i64 %N) {
+define i32 @reduction_phi_start_val(ptr %A, i64 %N) {
 ; CHECK-LABEL: @reduction_phi_start_val(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br label [[ITER_CHECK:%.*]]
@@ -422,18 +414,17 @@ define i32 @reduction_phi_start_val(i32* %A, i64 %N) {
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ [[TMP0]], [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ [[TMP0]], [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4
-; CHECK-NEXT:    [[TMP5]] = sub <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP3]], align 4
+; CHECK-NEXT:    [[TMP4]] = sub <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP5]])
+; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP4]])
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
 ; CHECK:       vec.epilog.iter.check:
@@ -441,43 +432,42 @@ define i32 @reduction_phi_start_val(i32* %A, i64 %N) {
 ; CHECK-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4
 ; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
 ; CHECK:       vec.epilog.ph:
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[START_SUM]], [[VECTOR_MAIN_LOOP_ITER_CHECK]] ], [ [[TMP7]], [[VEC_EPILOG_ITER_CHECK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[START_SUM]], [[VECTOR_MAIN_LOOP_ITER_CHECK]] ], [ [[TMP6]], [[VEC_EPILOG_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[N_MOD_VF2:%.*]] = urem i64 [[N]], 4
 ; CHECK-NEXT:    [[N_VEC3:%.*]] = sub i64 [[N]], [[N_MOD_VF2]]
-; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX]], i32 0
 ; CHECK-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
 ; CHECK:       vec.epilog.vector.body:
-; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT8:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI6:%.*]] = phi <4 x i32> [ [[TMP8]], [[VEC_EPILOG_PH]] ], [ [[TMP13:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP9:%.*]] = add i64 [[OFFSET_IDX]], 0
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP9]]
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP10]], i32 0
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP12]], align 4
-; CHECK-NEXT:    [[TMP13]] = sub <4 x i32> [[VEC_PHI6]], [[WIDE_LOAD7]]
-; CHECK-NEXT:    [[INDEX_NEXT8]] = add nuw i64 [[OFFSET_IDX]], 4
-; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT8]], [[N_VEC3]]
-; CHECK-NEXT:    br i1 [[TMP14]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
+; CHECK-NEXT:    [[INDEX5:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT8:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI6:%.*]] = phi <4 x i32> [ [[TMP7]], [[VEC_EPILOG_PH]] ], [ [[TMP11:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP8:%.*]] = add i64 [[INDEX5]], 0
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD7:%.*]] = load <4 x i32>, ptr [[TMP10]], align 4
+; CHECK-NEXT:    [[TMP11]] = sub <4 x i32> [[VEC_PHI6]], [[WIDE_LOAD7]]
+; CHECK-NEXT:    [[INDEX_NEXT8]] = add nuw i64 [[INDEX5]], 4
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT8]], [[N_VEC3]]
+; CHECK-NEXT:    br i1 [[TMP12]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
 ; CHECK:       vec.epilog.middle.block:
-; CHECK-NEXT:    [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP13]])
+; CHECK-NEXT:    [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP11]])
 ; CHECK-NEXT:    [[CMP_N4:%.*]] = icmp eq i64 [[N]], [[N_VEC3]]
 ; CHECK-NEXT:    br i1 [[CMP_N4]], label [[FOR_COND]], label [[VEC_EPILOG_SCALAR_PH]]
 ; CHECK:       vec.epilog.scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC3]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX9:%.*]] = phi i32 [ [[START_SUM]], [[ITER_CHECK]] ], [ [[TMP7]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[TMP15]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX9:%.*]] = phi i32 [ [[START_SUM]], [[ITER_CHECK]] ], [ [[TMP6]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[TMP13]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
 ; CHECK-NEXT:    [[SUM:%.*]] = phi i32 [ [[BC_MERGE_RDX9]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[SUB:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[IV]]
-; CHECK-NEXT:    [[LOAD:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; CHECK-NEXT:    [[LOAD:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
 ; CHECK-NEXT:    [[SUB]] = sub nsw i32 [[SUM]], [[LOAD]]
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
 ; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND]], label [[FOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
 ; CHECK:       for.cond:
-; CHECK-NEXT:    [[SUB_LCSSA]] = phi i32 [ [[SUB]], [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ], [ [[TMP15]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[SUB_LCSSA]] = phi i32 [ [[SUB]], [[FOR_BODY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ], [ [[TMP13]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    [[OUTER_IV_NEXT]] = add nuw nsw i64 [[OUTER_IV]], 1
 ; CHECK-NEXT:    [[OUTER_EXITCOND_NOT:%.*]] = icmp eq i64 [[OUTER_IV_NEXT]], [[N]]
 ; CHECK-NEXT:    br i1 [[OUTER_EXITCOND_NOT]], label [[FOR_END:%.*]], label [[ITER_CHECK]]
@@ -496,8 +486,8 @@ for.cond.preheader:
 for.body:
   %iv = phi i64 [ 0, %for.cond.preheader ], [ %iv.next, %for.body ]
   %sum = phi i32 [ %start.sum, %for.cond.preheader ], [ %sub, %for.body ]
-  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %iv
-  %load = load i32, i32* %arrayidx, align 4
+  %arrayidx = getelementptr inbounds i32, ptr %A, i64 %iv
+  %load = load i32, ptr %arrayidx, align 4
   %sub = sub nsw i32 %sum, %load
   %iv.next = add nuw nsw i64 %iv, 1
   %exitcond.not = icmp eq i64 %iv.next, %N

diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-multiply-recurrences.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-multiply-recurrences.ll
index 51afa12c0e0e..850178538cbf 100644
--- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-multiply-recurrences.ll
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-multiply-recurrences.ll
@@ -47,7 +47,7 @@ for.cond1.for.end_crit_edge:                      ; preds = %for.body
 
 ; The 'previous' instruction of %for.2 is in a separate block.
 ; PR54195.
-define void @multiple_recurrences_with_previous_in_different_block(i32 %a, i8 %b, i64* %ptr) {
+define void @multiple_recurrences_with_previous_in_different_block(i32 %a, i8 %b, ptr %ptr) {
 ; CHECK-LABEL: @multiple_recurrences_with_previous_in_different_block(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br label [[LOOP_HEADER:%.*]]
@@ -62,8 +62,8 @@ define void @multiple_recurrences_with_previous_in_different_block(i32 %a, i8 %b
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV]], 1000
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[EXIT:%.*]], label [[LOOP_LATCH]]
 ; CHECK:       loop.latch:
-; CHECK-NEXT:    [[PTR_GEP:%.*]] = getelementptr inbounds i64, i64* [[PTR:%.*]], i64 [[IV]]
-; CHECK-NEXT:    store i64 [[SUB]], i64* [[PTR_GEP]], align 4
+; CHECK-NEXT:    [[PTR_GEP:%.*]] = getelementptr inbounds i64, ptr [[PTR:%.*]], i64 [[IV]]
+; CHECK-NEXT:    store i64 [[SUB]], ptr [[PTR_GEP]], align 4
 ; CHECK-NEXT:    [[FOR_2_NEXT]] = zext i32 [[A:%.*]] to i64
 ; CHECK-NEXT:    br label [[LOOP_HEADER]]
 ; CHECK:       exit:
@@ -84,8 +84,8 @@ loop.header:
   br i1 %exitcond, label %exit, label %loop.latch
 
 loop.latch:
-  %ptr.gep = getelementptr inbounds i64, i64* %ptr, i64 %iv
-  store i64 %sub, i64* %ptr.gep
+  %ptr.gep = getelementptr inbounds i64, ptr %ptr, i64 %iv
+  store i64 %sub, ptr %ptr.gep
   %for.2.next = zext i32 %a to i64
   br label %loop.header
 
@@ -93,7 +93,7 @@ exit:
   ret void
 }
 
-define void @test_pr54223_sink_after_insertion_order(float* noalias %a, float* noalias %b, float* noalias %dst) {
+define void @test_pr54223_sink_after_insertion_order(ptr noalias %a, ptr noalias %b, ptr noalias %dst) {
 ; CHECK-LABEL: @test_pr54223_sink_after_insertion_order(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
@@ -104,23 +104,22 @@ define void @test_pr54223_sink_after_insertion_order(float* noalias %a, float* n
 ; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x float> [ <float poison, float poison, float poison, float 0.000000e+00>, [[VECTOR_PH]] ], [ [[BROADCAST_SPLAT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VECTOR_RECUR1:%.*]] = phi <4 x float> [ <float poison, float poison, float poison, float 0.000000e+00>, [[VECTOR_PH]] ], [ [[BROADCAST_SPLAT3:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = load float, float* [[A:%.*]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr [[A:%.*]], align 4
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i32 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[VECTOR_RECUR]], <4 x float> [[BROADCAST_SPLAT]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-; CHECK-NEXT:    [[TMP4:%.*]] = load float, float* [[B:%.*]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = load float, ptr [[B:%.*]], align 4
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <4 x float> poison, float [[TMP4]], i32 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT3]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT2]], <4 x float> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x float> [[VECTOR_RECUR1]], <4 x float> [[BROADCAST_SPLAT3]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
 ; CHECK-NEXT:    [[TMP6:%.*]] = fneg <4 x float> [[TMP5]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[TMP3]], <4 x float> [[TMP6]], <4 x float> zeroinitializer)
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast float* [[TMP8]] to <4 x float>*
-; CHECK-NEXT:    store <4 x float> [[TMP7]], <4 x float>* [[TMP9]], align 4
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0
+; CHECK-NEXT:    store <4 x float> [[TMP7]], ptr [[TMP8]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000
-; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000
+; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 10000, 10000
 ; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x float> [[BROADCAST_SPLAT]], i32 3
@@ -139,11 +138,11 @@ define void @test_pr54223_sink_after_insertion_order(float* noalias %a, float* n
 ; CHECK-NEXT:    [[SCALAR_RECUR7:%.*]] = phi float [ [[SCALAR_RECUR_INIT6]], [[SCALAR_PH]] ], [ [[FOR_2_NEXT:%.*]], [[LOOP]] ]
 ; CHECK-NEXT:    [[NEG:%.*]] = fneg float [[SCALAR_RECUR7]]
 ; CHECK-NEXT:    [[MULADD:%.*]] = call float @llvm.fmuladd.f32(float [[SCALAR_RECUR]], float [[NEG]], float 0.000000e+00)
-; CHECK-NEXT:    [[DST_GEP:%.*]] = getelementptr inbounds float, float* [[DST]], i64 [[IV]]
+; CHECK-NEXT:    [[DST_GEP:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 [[IV]]
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[FOR_1_NEXT]] = load float, float* [[A]], align 4
-; CHECK-NEXT:    [[FOR_2_NEXT]] = load float, float* [[B]], align 4
-; CHECK-NEXT:    store float [[MULADD]], float* [[DST_GEP]], align 4
+; CHECK-NEXT:    [[FOR_1_NEXT]] = load float, ptr [[A]], align 4
+; CHECK-NEXT:    [[FOR_2_NEXT]] = load float, ptr [[B]], align 4
+; CHECK-NEXT:    store float [[MULADD]], ptr [[DST_GEP]], align 4
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 10000
 ; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP2:![0-9]+]]
 ; CHECK:       exit:
@@ -158,11 +157,11 @@ loop:
   %for.2 = phi float [ 0.0, %entry ], [ %for.2.next, %loop ]
   %neg = fneg float %for.2
   %muladd = call float @llvm.fmuladd.f32(float %for.1, float %neg, float 0.000000e+00)
-  %dst.gep = getelementptr inbounds float, float* %dst, i64 %iv
+  %dst.gep = getelementptr inbounds float, ptr %dst, i64 %iv
   %iv.next = add nuw nsw i64 %iv, 1
-  %for.1.next = load float, float* %a, align 4
-  %for.2.next = load float, float* %b, align 4
-  store float %muladd, float* %dst.gep
+  %for.1.next = load float, ptr %a, align 4
+  %for.2.next = load float, ptr %b, align 4
+  store float %muladd, ptr %dst.gep
   %exitcond.not = icmp eq i64 %iv.next, 10000
   br i1 %exitcond.not, label %exit, label %loop
 
@@ -172,7 +171,7 @@ exit:
 
 declare float @llvm.fmuladd.f32(float, float, float) #1
 
-define void @test_pr54227(i32* noalias %a, i32* noalias %b) {
+define void @test_pr54227(ptr noalias %a, ptr noalias %b) {
 ; CHECK-LABEL: @test_pr54227(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
@@ -186,10 +185,10 @@ define void @test_pr54227(i32* noalias %a, i32* noalias %b) {
 ; CHECK-NEXT:    [[AND1]] = and i32 [[E_0]], [[F_0]]
 ; CHECK-NEXT:    [[MUL]] = shl nsw i32 [[OR]], 1
 ; CHECK-NEXT:    [[ADD]] = or i32 [[AND1]], 1
-; CHECK-NEXT:    [[A_GEP:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[IV]]
-; CHECK-NEXT:    store i32 [[ADD]], i32* [[A_GEP]], align 4
-; CHECK-NEXT:    [[B_GEP:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[IV]]
-; CHECK-NEXT:    store i32 [[MUL]], i32* [[B_GEP]], align 4
+; CHECK-NEXT:    [[A_GEP:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IV]]
+; CHECK-NEXT:    store i32 [[ADD]], ptr [[A_GEP]], align 4
+; CHECK-NEXT:    [[B_GEP:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; CHECK-NEXT:    store i32 [[MUL]], ptr [[B_GEP]], align 4
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw i64 [[IV]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV]], 1000
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[EXIT:%.*]], label [[LOOP]]
@@ -209,10 +208,10 @@ loop:
   %and1 = and i32 %e.0, %f.0
   %mul = shl nsw i32 %or, 1
   %add = or i32 %and1, 1
-  %a.gep = getelementptr inbounds i32, i32* %a, i64 %iv
-  store i32 %add, i32* %a.gep, align 4
-  %b.gep = getelementptr inbounds i32, i32* %a, i64 %iv
-  store i32 %mul, i32* %b.gep, align 4
+  %a.gep = getelementptr inbounds i32, ptr %a, i64 %iv
+  store i32 %add, ptr %a.gep, align 4
+  %b.gep = getelementptr inbounds i32, ptr %a, i64 %iv
+  store i32 %mul, ptr %b.gep, align 4
   %iv.next = add nuw i64 %iv, 1
   %exitcond = icmp eq i64 %iv, 1000
   br i1 %exitcond, label %exit, label %loop
@@ -221,7 +220,7 @@ exit:
   ret void
 }
 
-define void @test_pr54233_for_depend_on_each_other(i32* noalias %a, i32* noalias %b) {
+define void @test_pr54233_for_depend_on_each_other(ptr noalias %a, ptr noalias %b) {
 ; CHECK-LABEL: @test_pr54233_for_depend_on_each_other(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
@@ -234,9 +233,9 @@ define void @test_pr54233_for_depend_on_each_other(i32* noalias %a, i32* noalias
 ; CHECK-NEXT:    [[XOR:%.*]] = xor i32 [[SHL]], 255
 ; CHECK-NEXT:    [[AND:%.*]] = and i32 [[XOR]], [[OR]]
 ; CHECK-NEXT:    [[FOR_1_NEXT]] = xor i32 12, [[FOR_2]]
-; CHECK-NEXT:    [[FOR_2_NEXT]] = load i32, i32* [[B:%.*]], align 4
-; CHECK-NEXT:    [[A_GEP:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[IV]]
-; CHECK-NEXT:    store i32 [[AND]], i32* [[A_GEP]], align 4
+; CHECK-NEXT:    [[FOR_2_NEXT]] = load i32, ptr [[B:%.*]], align 4
+; CHECK-NEXT:    [[A_GEP:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IV]]
+; CHECK-NEXT:    store i32 [[AND]], ptr [[A_GEP]], align 4
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw i64 [[IV]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV]], 1000
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[EXIT:%.*]], label [[LOOP]]
@@ -255,9 +254,9 @@ loop:
   %xor = xor i32 %shl, 255
   %and = and i32 %xor, %or
   %for.1.next = xor i32 12, %for.2
-  %for.2.next = load i32, i32* %b
-  %a.gep = getelementptr inbounds i32, i32* %a, i64 %iv
-  store i32 %and, i32* %a.gep, align 4
+  %for.2.next = load i32, ptr %b
+  %a.gep = getelementptr inbounds i32, ptr %a, i64 %iv
+  store i32 %and, ptr %a.gep, align 4
   %iv.next = add nuw i64 %iv, 1
   %exitcond = icmp eq i64 %iv, 1000
   br i1 %exitcond, label %exit, label %loop
@@ -266,7 +265,7 @@ exit:
   ret void
 }
 
-define void @pr54218(i8* %dst, i32* noalias %d) {
+define void @pr54218(ptr %dst, ptr noalias %d) {
 ; CHECK-LABEL: @pr54218(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br label [[LOOP_HEADER:%.*]]
@@ -279,10 +278,10 @@ define void @pr54218(i8* %dst, i32* noalias %d) {
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV]], 1000
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[EXIT:%.*]], label [[LOOP_LATCH]]
 ; CHECK:       loop.latch:
-; CHECK-NEXT:    [[FOR_1_NEXT]] = load i32, i32* [[D:%.*]], align 4
+; CHECK-NEXT:    [[FOR_1_NEXT]] = load i32, ptr [[D:%.*]], align 4
 ; CHECK-NEXT:    [[P_NEXT]] = add i8 [[SEL]], -1
-; CHECK-NEXT:    [[DST_GEP:%.*]] = getelementptr inbounds i8, i8* [[DST:%.*]], i64 [[IV]]
-; CHECK-NEXT:    store i8 [[SEL]], i8* [[DST_GEP]], align 1
+; CHECK-NEXT:    [[DST_GEP:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[IV]]
+; CHECK-NEXT:    store i8 [[SEL]], ptr [[DST_GEP]], align 1
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw i64 [[IV]], 1
 ; CHECK-NEXT:    br label [[LOOP_HEADER]]
 ; CHECK:       exit:
@@ -301,10 +300,10 @@ loop.header:
   br i1 %exitcond, label %exit, label %loop.latch
 
 loop.latch:
-  %for.1.next = load i32, i32* %d, align 4
+  %for.1.next = load i32, ptr %d, align 4
   %p.next = add i8 %sel, -1
-  %dst.gep = getelementptr inbounds i8, i8* %dst, i64 %iv
-  store i8 %sel, i8* %dst.gep
+  %dst.gep = getelementptr inbounds i8, ptr %dst, i64 %iv
+  store i8 %sel, ptr %dst.gep
   %iv.next = add nuw i64 %iv, 1
   br label %loop.header
 
@@ -312,7 +311,7 @@ exit:
   ret void
 }
 
-define void @pr54254_fors_depend_on_each_other(i32* %dst) {
+define void @pr54254_fors_depend_on_each_other(ptr %dst) {
 ; CHECK-LABEL: @pr54254_fors_depend_on_each_other(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
@@ -325,9 +324,9 @@ define void @pr54254_fors_depend_on_each_other(i32* %dst) {
 ; CHECK-NEXT:    [[REM:%.*]] = srem i32 [[F_0]], [[NEG]]
 ; CHECK-NEXT:    [[XOR]] = xor i32 [[C_0]], 1
 ; CHECK-NEXT:    [[XOR1]] = xor i32 [[REM]], 1
-; CHECK-NEXT:    [[DST_GEP:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 [[IV]]
+; CHECK-NEXT:    [[DST_GEP:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 [[IV]]
 ; CHECK-NEXT:    [[REM_NEXT]] = add i32 [[REM]], 10
-; CHECK-NEXT:    store i32 [[REM]], i32* [[DST_GEP]], align 4
+; CHECK-NEXT:    store i32 [[REM]], ptr [[DST_GEP]], align 4
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw i64 [[IV]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV]], 1000
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[EXIT:%.*]], label [[LOOP]]
@@ -346,9 +345,9 @@ loop:
   %rem = srem i32 %f.0, %neg
   %xor = xor i32 %c.0, 1
   %xor1 = xor i32 %rem, 1
-  %dst.gep = getelementptr inbounds i32, i32* %dst, i64 %iv
+  %dst.gep = getelementptr inbounds i32, ptr %dst, i64 %iv
   %rem.next = add i32 %rem, 10
-  store i32 %rem, i32* %dst.gep
+  store i32 %rem, ptr %dst.gep
   %iv.next = add nuw i64 %iv, 1
   %exitcond = icmp eq i64 %iv, 1000
   br i1 %exitcond, label %exit, label %loop

diff --git a/llvm/test/Transforms/LoopVectorize/float-minmax-instruction-flag.ll b/llvm/test/Transforms/LoopVectorize/float-minmax-instruction-flag.ll
index 1e58679acca9..43229dc83d52 100644
--- a/llvm/test/Transforms/LoopVectorize/float-minmax-instruction-flag.ll
+++ b/llvm/test/Transforms/LoopVectorize/float-minmax-instruction-flag.ll
@@ -4,16 +4,16 @@
 ; The function finds the smallest value from a float vector.
 ; Check if vectorization is enabled by instruction flag `fcmp nnan`.
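 ; Sketch of the min recurrence the vectorizer must recognize (illustrative):
 ;   %c        = fcmp nnan olt float %min, %val   ; nnan makes the select a true fmin
 ;   %min.next = select i1 %c, float %min, float %val
 ; Without nnan here (or equivalent function attributes) no fmin reduction is
 ; formed; see the novec variants below.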
 
-define float @minloop(float* nocapture readonly %arg) {
+define float @minloop(ptr nocapture readonly %arg) {
 ; CHECK-LABEL: @minloop(
 ; CHECK-NEXT:  top:
-; CHECK-NEXT:    [[T:%.*]] = load float, float* [[ARG:%.*]], align 4
+; CHECK-NEXT:    [[T:%.*]] = load float, ptr [[ARG:%.*]], align 4
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[T1:%.*]] = phi i64 [ [[T7:%.*]], [[LOOP]] ], [ 1, [[TOP:%.*]] ]
 ; CHECK-NEXT:    [[T2:%.*]] = phi float [ [[T6:%.*]], [[LOOP]] ], [ [[T]], [[TOP]] ]
-; CHECK-NEXT:    [[T3:%.*]] = getelementptr float, float* [[ARG]], i64 [[T1]]
-; CHECK-NEXT:    [[T4:%.*]] = load float, float* [[T3]], align 4
+; CHECK-NEXT:    [[T3:%.*]] = getelementptr float, ptr [[ARG]], i64 [[T1]]
+; CHECK-NEXT:    [[T4:%.*]] = load float, ptr [[T3]], align 4
 ; CHECK-NEXT:    [[T5:%.*]] = fcmp nnan olt float [[T2]], [[T4]]
 ; CHECK-NEXT:    [[T6]] = select i1 [[T5]], float [[T2]], float [[T4]]
 ; CHECK-NEXT:    [[T7]] = add i64 [[T1]], 1
@@ -24,14 +24,14 @@ define float @minloop(float* nocapture readonly %arg) {
 ; CHECK-NEXT:    ret float [[T6_LCSSA]]
 ;
 top:
-  %t = load float, float* %arg
+  %t = load float, ptr %arg
   br label %loop
 
 loop:                                             ; preds = %loop, %top
   %t1 = phi i64 [ %t7, %loop ], [ 1, %top ]
   %t2 = phi float [ %t6, %loop ], [ %t, %top ]
-  %t3 = getelementptr float, float* %arg, i64 %t1
-  %t4 = load float, float* %t3, align 4
+  %t3 = getelementptr float, ptr %arg, i64 %t1
+  %t4 = load float, ptr %t3, align 4
   %t5 = fcmp nnan olt float %t2, %t4
   %t6 = select i1 %t5, float %t2, float %t4
   %t7 = add i64 %t1, 1
@@ -44,10 +44,10 @@ out:                                              ; preds = %loop
 
 ; Check if vectorization is still enabled by function attribute.
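 ; (Attribute group #0 is defined elsewhere in this file; presumably it carries
 ; the "no-nans-fp-math"/"no-signed-zeros-fp-math" function attributes, the
 ; attribute-level equivalent of the nnan flag above.)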
 
-define float @minloopattr(float* nocapture readonly %arg) #0 {
+define float @minloopattr(ptr nocapture readonly %arg) #0 {
 ; CHECK-LABEL: @minloopattr(
 ; CHECK-NEXT:  top:
-; CHECK-NEXT:    [[T:%.*]] = load float, float* [[ARG:%.*]], align 4
+; CHECK-NEXT:    [[T:%.*]] = load float, ptr [[ARG:%.*]], align 4
 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[MINMAX_IDENT_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[T]], i32 0
@@ -55,49 +55,48 @@ define float @minloopattr(float* nocapture readonly %arg) #0 {
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x float> [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x float> [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = add i64 1, [[INDEX]]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr float, float* [[ARG]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr float, float* [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float* [[TMP2]] to <4 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP3]], align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = fcmp olt <4 x float> [[VEC_PHI]], [[WIDE_LOAD]]
-; CHECK-NEXT:    [[TMP5]] = select <4 x i1> [[TMP4]], <4 x float> [[VEC_PHI]], <4 x float> [[WIDE_LOAD]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr float, ptr [[ARG]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr float, ptr [[TMP1]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP2]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = fcmp olt <4 x float> [[VEC_PHI]], [[WIDE_LOAD]]
+; CHECK-NEXT:    [[TMP4]] = select <4 x i1> [[TMP3]], <4 x float> [[VEC_PHI]], <4 x float> [[WIDE_LOAD]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 65536
-; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]]
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 65536
+; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP7:%.*]] = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> [[TMP5]])
+; CHECK-NEXT:    [[TMP6:%.*]] = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> [[TMP4]])
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 65536, 65536
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[OUT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 65537, [[MIDDLE_BLOCK]] ], [ 1, [[TOP:%.*]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ [[T]], [[TOP]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ [[T]], [[TOP]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[T1:%.*]] = phi i64 [ [[T7:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; CHECK-NEXT:    [[T2:%.*]] = phi float [ [[T6:%.*]], [[LOOP]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[T3:%.*]] = getelementptr float, float* [[ARG]], i64 [[T1]]
-; CHECK-NEXT:    [[T4:%.*]] = load float, float* [[T3]], align 4
+; CHECK-NEXT:    [[T3:%.*]] = getelementptr float, ptr [[ARG]], i64 [[T1]]
+; CHECK-NEXT:    [[T4:%.*]] = load float, ptr [[T3]], align 4
 ; CHECK-NEXT:    [[T5:%.*]] = fcmp olt float [[T2]], [[T4]]
 ; CHECK-NEXT:    [[T6]] = select i1 [[T5]], float [[T2]], float [[T4]]
 ; CHECK-NEXT:    [[T7]] = add i64 [[T1]], 1
 ; CHECK-NEXT:    [[T8:%.*]] = icmp eq i64 [[T7]], 65537
-; CHECK-NEXT:    br i1 [[T8]], label [[OUT]], label [[LOOP]], [[LOOP2:!llvm.loop !.*]]
+; CHECK-NEXT:    br i1 [[T8]], label [[OUT]], label [[LOOP]], !llvm.loop [[LOOP2:![0-9]+]]
 ; CHECK:       out:
-; CHECK-NEXT:    [[T6_LCSSA:%.*]] = phi float [ [[T6]], [[LOOP]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[T6_LCSSA:%.*]] = phi float [ [[T6]], [[LOOP]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    ret float [[T6_LCSSA]]
 ;
 top:
-  %t = load float, float* %arg
+  %t = load float, ptr %arg
   br label %loop
 
 loop:                                             ; preds = %loop, %top
   %t1 = phi i64 [ %t7, %loop ], [ 1, %top ]
   %t2 = phi float [ %t6, %loop ], [ %t, %top ]
-  %t3 = getelementptr float, float* %arg, i64 %t1
-  %t4 = load float, float* %t3, align 4
+  %t3 = getelementptr float, ptr %arg, i64 %t1
+  %t4 = load float, ptr %t3, align 4
   %t5 = fcmp olt float %t2, %t4
   %t6 = select i1 %t5, float %t2, float %t4
   %t7 = add i64 %t1, 1
@@ -110,16 +109,16 @@ out:                                              ; preds = %loop
 
 ; Check if vectorization is prevented without the flag or attribute.
 
-define float @minloopnovec(float* nocapture readonly %arg) {
+define float @minloopnovec(ptr nocapture readonly %arg) {
 ; CHECK-LABEL: @minloopnovec(
 ; CHECK-NEXT:  top:
-; CHECK-NEXT:    [[T:%.*]] = load float, float* [[ARG:%.*]], align 4
+; CHECK-NEXT:    [[T:%.*]] = load float, ptr [[ARG:%.*]], align 4
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[T1:%.*]] = phi i64 [ [[T7:%.*]], [[LOOP]] ], [ 1, [[TOP:%.*]] ]
 ; CHECK-NEXT:    [[T2:%.*]] = phi float [ [[T6:%.*]], [[LOOP]] ], [ [[T]], [[TOP]] ]
-; CHECK-NEXT:    [[T3:%.*]] = getelementptr float, float* [[ARG]], i64 [[T1]]
-; CHECK-NEXT:    [[T4:%.*]] = load float, float* [[T3]], align 4
+; CHECK-NEXT:    [[T3:%.*]] = getelementptr float, ptr [[ARG]], i64 [[T1]]
+; CHECK-NEXT:    [[T4:%.*]] = load float, ptr [[T3]], align 4
 ; CHECK-NEXT:    [[T5:%.*]] = fcmp olt float [[T2]], [[T4]]
 ; CHECK-NEXT:    [[T6]] = select i1 [[T5]], float [[T2]], float [[T4]]
 ; CHECK-NEXT:    [[T7]] = add i64 [[T1]], 1
@@ -130,14 +129,14 @@ define float @minloopnovec(float* nocapture readonly %arg) {
 ; CHECK-NEXT:    ret float [[T6_LCSSA]]
 ;
 top:
-  %t = load float, float* %arg
+  %t = load float, ptr %arg
   br label %loop
 
 loop:                                             ; preds = %loop, %top
   %t1 = phi i64 [ %t7, %loop ], [ 1, %top ]
   %t2 = phi float [ %t6, %loop ], [ %t, %top ]
-  %t3 = getelementptr float, float* %arg, i64 %t1
-  %t4 = load float, float* %t3, align 4
+  %t3 = getelementptr float, ptr %arg, i64 %t1
+  %t4 = load float, ptr %t3, align 4
   %t5 = fcmp olt float %t2, %t4
   %t6 = select i1 %t5, float %t2, float %t4
   %t7 = add i64 %t1, 1
@@ -150,16 +149,16 @@ out:                                              ; preds = %loop
 
 ; This test is checking that we don't vectorize when only one of the required attributes is set.
 ; Note that this test should not vectorize even after switching to IR-level FMF.
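 ; Sketch: with only one of the two required attributes (presumably
 ; "no-nans-fp-math" without "no-signed-zeros-fp-math" in group #1), the
 ; compare-plus-select pair
 ;   %c        = fcmp olt float %min, %val
 ;   %min.next = select i1 %c, float %min, float %val
 ; cannot safely be treated as an fmin reduction, so the loop stays scalar.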
-define float @minloopmissingnsz(float* nocapture readonly %arg) #1 {
+define float @minloopmissingnsz(ptr nocapture readonly %arg) #1 {
 ; CHECK-LABEL: @minloopmissingnsz(
 ; CHECK-NEXT:  top:
-; CHECK-NEXT:    [[T:%.*]] = load float, float* [[ARG:%.*]], align 4
+; CHECK-NEXT:    [[T:%.*]] = load float, ptr [[ARG:%.*]], align 4
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[T1:%.*]] = phi i64 [ [[T7:%.*]], [[LOOP]] ], [ 1, [[TOP:%.*]] ]
 ; CHECK-NEXT:    [[T2:%.*]] = phi float [ [[T6:%.*]], [[LOOP]] ], [ [[T]], [[TOP]] ]
-; CHECK-NEXT:    [[T3:%.*]] = getelementptr float, float* [[ARG]], i64 [[T1]]
-; CHECK-NEXT:    [[T4:%.*]] = load float, float* [[T3]], align 4
+; CHECK-NEXT:    [[T3:%.*]] = getelementptr float, ptr [[ARG]], i64 [[T1]]
+; CHECK-NEXT:    [[T4:%.*]] = load float, ptr [[T3]], align 4
 ; CHECK-NEXT:    [[T5:%.*]] = fcmp olt float [[T2]], [[T4]]
 ; CHECK-NEXT:    [[T6]] = select i1 [[T5]], float [[T2]], float [[T4]]
 ; CHECK-NEXT:    [[T7]] = add i64 [[T1]], 1
@@ -170,14 +169,14 @@ define float @minloopmissingnsz(float* nocapture readonly %arg) #1 {
 ; CHECK-NEXT:    ret float [[T6_LCSSA]]
 ;
 top:
-  %t = load float, float* %arg
+  %t = load float, ptr %arg
   br label %loop
 
 loop:                                             ; preds = %loop, %top
   %t1 = phi i64 [ %t7, %loop ], [ 1, %top ]
   %t2 = phi float [ %t6, %loop ], [ %t, %top ]
-  %t3 = getelementptr float, float* %arg, i64 %t1
-  %t4 = load float, float* %t3, align 4
+  %t3 = getelementptr float, ptr %arg, i64 %t1
+  %t4 = load float, ptr %t3, align 4
   %t5 = fcmp olt float %t2, %t4
   %t6 = select i1 %t5, float %t2, float %t4
   %t7 = add i64 %t1, 1

diff --git a/llvm/test/Transforms/LoopVectorize/if-conversion-nest.ll b/llvm/test/Transforms/LoopVectorize/if-conversion-nest.ll
index 265f49c40a18..a72ab450a909 100644
--- a/llvm/test/Transforms/LoopVectorize/if-conversion-nest.ll
+++ b/llvm/test/Transforms/LoopVectorize/if-conversion-nest.ll
@@ -3,7 +3,7 @@
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 
-define i32 @foo(i32* nocapture %A, i32* nocapture %B, i32 %n) {
+define i32 @foo(ptr nocapture %A, ptr nocapture %B, i32 %n) {
 ; CHECK-LABEL: @foo(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[CMP26:%.*]] = icmp sgt i32 [[N:%.*]], 0
@@ -17,11 +17,12 @@ define i32 @foo(i32* nocapture %A, i32* nocapture %B, i32 %n) {
 ; CHECK:       vector.memcheck:
 ; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[N]], -1
 ; CHECK-NEXT:    [[TMP4:%.*]] = zext i32 [[TMP3]] to i64
-; CHECK-NEXT:    [[TMP5:%.*]] = add nuw nsw i64 [[TMP4]], 1
-; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 [[TMP5]]
-; CHECK-NEXT:    [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[B:%.*]], i64 [[TMP5]]
-; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ugt i32* [[SCEVGEP4]], [[A]]
-; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ugt i32* [[SCEVGEP]], [[B]]
+; CHECK-NEXT:    [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 2
+; CHECK-NEXT:    [[TMP6:%.*]] = add nuw nsw i64 [[TMP5]], 4
+; CHECK-NEXT:    [[UGLYGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 [[TMP6]]
+; CHECK-NEXT:    [[UGLYGEP1:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 [[TMP6]]
+; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ugt ptr [[UGLYGEP1]], [[A]]
+; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ugt ptr [[UGLYGEP]], [[B]]
 ; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
 ; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
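 ; Note: under opaque pointers the runtime overlap bound above is computed in
 ; bytes, (n-1)*4 + 4 via an i8 GEP, which is equivalent to the old
 ; element-typed bound `getelementptr i32, i32* %A, i64 n`.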
@@ -29,26 +30,23 @@ define i32 @foo(i32* nocapture %A, i32* nocapture %B, i32 %n) {
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP7]], align 4, !alias.scope !0, !noalias !3
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4, !alias.scope !3
-; CHECK-NEXT:    [[TMP10:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], [[WIDE_LOAD6]]
-; CHECK-NEXT:    [[TMP11:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], <i32 19, i32 19, i32 19, i32 19>
-; CHECK-NEXT:    [[TMP12:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD6]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT:    [[TMP13:%.*]] = select <4 x i1> [[TMP12]], <4 x i32> <i32 4, i32 4, i32 4, i32 4>, <4 x i32> <i32 5, i32 5, i32 5, i32 5>
-; CHECK-NEXT:    [[TMP14:%.*]] = and <4 x i1> [[TMP10]], [[TMP11]]
-; CHECK-NEXT:    [[TMP15:%.*]] = xor <4 x i1> [[TMP11]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT:    [[TMP16:%.*]] = and <4 x i1> [[TMP10]], [[TMP15]]
-; CHECK-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[TMP14]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>, <4 x i32> <i32 9, i32 9, i32 9, i32 9>
-; CHECK-NEXT:    [[PREDPHI7:%.*]] = select <4 x i1> [[TMP16]], <4 x i32> [[TMP13]], <4 x i32> [[PREDPHI]]
-; CHECK-NEXT:    [[TMP17:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
-; CHECK-NEXT:    store <4 x i32> [[PREDPHI7]], <4 x i32>* [[TMP17]], align 4, !alias.scope !0, !noalias !3
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP7]], align 4, !alias.scope !0, !noalias !3
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x i32>, ptr [[TMP8]], align 4, !alias.scope !3
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], [[WIDE_LOAD2]]
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], <i32 19, i32 19, i32 19, i32 19>
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD2]], <i32 4, i32 4, i32 4, i32 4>
+; CHECK-NEXT:    [[TMP12:%.*]] = select <4 x i1> [[TMP11]], <4 x i32> <i32 4, i32 4, i32 4, i32 4>, <4 x i32> <i32 5, i32 5, i32 5, i32 5>
+; CHECK-NEXT:    [[TMP13:%.*]] = and <4 x i1> [[TMP9]], [[TMP10]]
+; CHECK-NEXT:    [[TMP14:%.*]] = xor <4 x i1> [[TMP10]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT:    [[TMP15:%.*]] = and <4 x i1> [[TMP9]], [[TMP14]]
+; CHECK-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[TMP13]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>, <4 x i32> <i32 9, i32 9, i32 9, i32 9>
+; CHECK-NEXT:    [[PREDPHI3:%.*]] = select <4 x i1> [[TMP15]], <4 x i32> [[TMP12]], <4 x i32> [[PREDPHI]]
+; CHECK-NEXT:    store <4 x i32> [[PREDPHI3]], ptr [[TMP7]], align 4, !alias.scope !0, !noalias !3
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
@@ -57,22 +55,22 @@ define i32 @foo(i32* nocapture %A, i32* nocapture %B, i32 %n) {
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[IF_END14:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP19:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP20:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4
-; CHECK-NEXT:    [[CMP3:%.*]] = icmp sgt i32 [[TMP19]], [[TMP20]]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP17:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP18:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; CHECK-NEXT:    [[CMP3:%.*]] = icmp sgt i32 [[TMP17]], [[TMP18]]
 ; CHECK-NEXT:    br i1 [[CMP3]], label [[IF_THEN:%.*]], label [[IF_END14]]
 ; CHECK:       if.then:
-; CHECK-NEXT:    [[CMP6:%.*]] = icmp sgt i32 [[TMP19]], 19
+; CHECK-NEXT:    [[CMP6:%.*]] = icmp sgt i32 [[TMP17]], 19
 ; CHECK-NEXT:    br i1 [[CMP6]], label [[IF_END14]], label [[IF_ELSE:%.*]]
 ; CHECK:       if.else:
-; CHECK-NEXT:    [[CMP10:%.*]] = icmp slt i32 [[TMP20]], 4
+; CHECK-NEXT:    [[CMP10:%.*]] = icmp slt i32 [[TMP18]], 4
 ; CHECK-NEXT:    [[DOT:%.*]] = select i1 [[CMP10]], i32 4, i32 5
 ; CHECK-NEXT:    br label [[IF_END14]]
 ; CHECK:       if.end14:
 ; CHECK-NEXT:    [[X_0:%.*]] = phi i32 [ 9, [[FOR_BODY]] ], [ 3, [[IF_THEN]] ], [ [[DOT]], [[IF_ELSE]] ]
-; CHECK-NEXT:    store i32 [[X_0]], i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    store i32 [[X_0]], ptr [[ARRAYIDX]], align 4
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT:    [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]]
@@ -88,10 +86,10 @@ entry:
 
 for.body:
   %indvars.iv = phi i64 [ %indvars.iv.next, %if.end14 ], [ 0, %entry ]
-  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
-  %0 = load i32, i32* %arrayidx, align 4
-  %arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
-  %1 = load i32, i32* %arrayidx2, align 4
+  %arrayidx = getelementptr inbounds i32, ptr %A, i64 %indvars.iv
+  %0 = load i32, ptr %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, ptr %B, i64 %indvars.iv
+  %1 = load i32, ptr %arrayidx2, align 4
   %cmp3 = icmp sgt i32 %0, %1
   br i1 %cmp3, label %if.then, label %if.end14
 
@@ -106,7 +104,7 @@ if.else:
 
 if.end14:
   %x.0 = phi i32 [ 9, %for.body ], [ 3, %if.then ], [ %., %if.else ]  ; <------------- A PHI with 3 entries that we can still vectorize.
-  store i32 %x.0, i32* %arrayidx, align 4
+  store i32 %x.0, ptr %arrayidx, align 4
   %indvars.iv.next = add i64 %indvars.iv, 1
   %lftr.wideiv = trunc i64 %indvars.iv.next to i32
   %exitcond = icmp eq i32 %lftr.wideiv, %n

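The conversion in the next file follows the same mechanical pattern as the diff above: with opaque pointers, bitcasts that existed only to reinterpret a pointee type disappear. Runtime alias checks compute the end of each accessed range directly as an i8 GEP with a byte offset (128 i32 elements become an offset of 512 bytes), and vector loads and stores use the incoming ptr without a cast to a vector pointer type. A minimal before/after sketch, with illustrative value names (%a, %b.end and friends are placeholders, not taken from any one test):

    ; Typed pointers (old form): compare and access via bitcasts.
    %a.i8    = bitcast i32* %a to i8*
    %scevgep = getelementptr i32, i32* %a, i64 128
    %a.end   = bitcast i32* %scevgep to i8*
    %bound0  = icmp ult i8* %a.i8, %b.end
    %vecptr  = bitcast i32* %a to <4 x i32>*
    %wide    = load <4 x i32>, <4 x i32>* %vecptr, align 4

    ; Opaque pointers (new form): byte-offset GEP, no casts.
    %a.end   = getelementptr i8, ptr %a, i64 512
    %bound0  = icmp ult ptr %a, %b.end
    %wide    = load <4 x i32>, ptr %a, align 4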
diff --git a/llvm/test/Transforms/LoopVectorize/if-pred-non-void.ll b/llvm/test/Transforms/LoopVectorize/if-pred-non-void.ll
index 8aabbb2ba09e..0310f7077392 100644
--- a/llvm/test/Transforms/LoopVectorize/if-pred-non-void.ll
+++ b/llvm/test/Transforms/LoopVectorize/if-pred-non-void.ll
@@ -7,139 +7,123 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 ; Test predication of non-void instructions, specifically (i) that these
 ; instructions permit vectorization and (ii) that an insertelement and a phi
 ; node are created. We check the full 2-element sequence for all predicate instructions.
-define void @test(i32* nocapture %asd, i32* nocapture %aud,
+define void @test(ptr nocapture %asd, ptr nocapture %aud,
 ; CHECK-LABEL: @test(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[ASD1:%.*]] = bitcast i32* [[ASD:%.*]] to i8*
-; CHECK-NEXT:    [[AUD3:%.*]] = bitcast i32* [[AUD:%.*]] to i8*
-; CHECK-NEXT:    [[ASR6:%.*]] = bitcast i32* [[ASR:%.*]] to i8*
-; CHECK-NEXT:    [[AUR9:%.*]] = bitcast i32* [[AUR:%.*]] to i8*
-; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i32, i32* [[ASD]], i64 128
-; CHECK-NEXT:    [[SCEVGEP2:%.*]] = bitcast i32* [[SCEVGEP]] to i8*
-; CHECK-NEXT:    [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[AUD]], i64 128
-; CHECK-NEXT:    [[SCEVGEP45:%.*]] = bitcast i32* [[SCEVGEP4]] to i8*
-; CHECK-NEXT:    [[SCEVGEP7:%.*]] = getelementptr i32, i32* [[ASR]], i64 128
-; CHECK-NEXT:    [[SCEVGEP78:%.*]] = bitcast i32* [[SCEVGEP7]] to i8*
-; CHECK-NEXT:    [[SCEVGEP10:%.*]] = getelementptr i32, i32* [[AUR]], i64 128
-; CHECK-NEXT:    [[SCEVGEP1011:%.*]] = bitcast i32* [[SCEVGEP10]] to i8*
-; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ult i8* [[ASD1]], [[SCEVGEP45]]
-; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ult i8* [[AUD3]], [[SCEVGEP2]]
+; CHECK-NEXT:    [[UGLYGEP:%.*]] = getelementptr i8, ptr [[ASD:%.*]], i64 512
+; CHECK-NEXT:    [[UGLYGEP1:%.*]] = getelementptr i8, ptr [[AUD:%.*]], i64 512
+; CHECK-NEXT:    [[UGLYGEP2:%.*]] = getelementptr i8, ptr [[ASR:%.*]], i64 512
+; CHECK-NEXT:    [[UGLYGEP3:%.*]] = getelementptr i8, ptr [[AUR:%.*]], i64 512
+; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ult ptr [[ASD]], [[UGLYGEP1]]
+; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ult ptr [[AUD]], [[UGLYGEP]]
 ; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
-; CHECK-NEXT:    [[BOUND012:%.*]] = icmp ult i8* [[ASD1]], [[SCEVGEP78]]
-; CHECK-NEXT:    [[BOUND113:%.*]] = icmp ult i8* [[ASR6]], [[SCEVGEP2]]
-; CHECK-NEXT:    [[FOUND_CONFLICT14:%.*]] = and i1 [[BOUND012]], [[BOUND113]]
-; CHECK-NEXT:    [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT14]]
-; CHECK-NEXT:    [[BOUND015:%.*]] = icmp ult i8* [[ASD1]], [[SCEVGEP1011]]
-; CHECK-NEXT:    [[BOUND116:%.*]] = icmp ult i8* [[AUR9]], [[SCEVGEP2]]
+; CHECK-NEXT:    [[BOUND04:%.*]] = icmp ult ptr [[ASD]], [[UGLYGEP2]]
+; CHECK-NEXT:    [[BOUND15:%.*]] = icmp ult ptr [[ASR]], [[UGLYGEP]]
+; CHECK-NEXT:    [[FOUND_CONFLICT6:%.*]] = and i1 [[BOUND04]], [[BOUND15]]
+; CHECK-NEXT:    [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT6]]
+; CHECK-NEXT:    [[BOUND07:%.*]] = icmp ult ptr [[ASD]], [[UGLYGEP3]]
+; CHECK-NEXT:    [[BOUND18:%.*]] = icmp ult ptr [[AUR]], [[UGLYGEP]]
+; CHECK-NEXT:    [[FOUND_CONFLICT9:%.*]] = and i1 [[BOUND07]], [[BOUND18]]
+; CHECK-NEXT:    [[CONFLICT_RDX10:%.*]] = or i1 [[CONFLICT_RDX]], [[FOUND_CONFLICT9]]
+; CHECK-NEXT:    [[BOUND011:%.*]] = icmp ult ptr [[AUD]], [[UGLYGEP2]]
+; CHECK-NEXT:    [[BOUND112:%.*]] = icmp ult ptr [[ASR]], [[UGLYGEP1]]
+; CHECK-NEXT:    [[FOUND_CONFLICT13:%.*]] = and i1 [[BOUND011]], [[BOUND112]]
+; CHECK-NEXT:    [[CONFLICT_RDX14:%.*]] = or i1 [[CONFLICT_RDX10]], [[FOUND_CONFLICT13]]
+; CHECK-NEXT:    [[BOUND015:%.*]] = icmp ult ptr [[AUD]], [[UGLYGEP3]]
+; CHECK-NEXT:    [[BOUND116:%.*]] = icmp ult ptr [[AUR]], [[UGLYGEP1]]
 ; CHECK-NEXT:    [[FOUND_CONFLICT17:%.*]] = and i1 [[BOUND015]], [[BOUND116]]
-; CHECK-NEXT:    [[CONFLICT_RDX18:%.*]] = or i1 [[CONFLICT_RDX]], [[FOUND_CONFLICT17]]
-; CHECK-NEXT:    [[BOUND019:%.*]] = icmp ult i8* [[AUD3]], [[SCEVGEP78]]
-; CHECK-NEXT:    [[BOUND120:%.*]] = icmp ult i8* [[ASR6]], [[SCEVGEP45]]
+; CHECK-NEXT:    [[CONFLICT_RDX18:%.*]] = or i1 [[CONFLICT_RDX14]], [[FOUND_CONFLICT17]]
+; CHECK-NEXT:    [[BOUND019:%.*]] = icmp ult ptr [[ASR]], [[UGLYGEP3]]
+; CHECK-NEXT:    [[BOUND120:%.*]] = icmp ult ptr [[AUR]], [[UGLYGEP2]]
 ; CHECK-NEXT:    [[FOUND_CONFLICT21:%.*]] = and i1 [[BOUND019]], [[BOUND120]]
 ; CHECK-NEXT:    [[CONFLICT_RDX22:%.*]] = or i1 [[CONFLICT_RDX18]], [[FOUND_CONFLICT21]]
-; CHECK-NEXT:    [[BOUND023:%.*]] = icmp ult i8* [[AUD3]], [[SCEVGEP1011]]
-; CHECK-NEXT:    [[BOUND124:%.*]] = icmp ult i8* [[AUR9]], [[SCEVGEP45]]
-; CHECK-NEXT:    [[FOUND_CONFLICT25:%.*]] = and i1 [[BOUND023]], [[BOUND124]]
-; CHECK-NEXT:    [[CONFLICT_RDX26:%.*]] = or i1 [[CONFLICT_RDX22]], [[FOUND_CONFLICT25]]
-; CHECK-NEXT:    [[BOUND027:%.*]] = icmp ult i8* [[ASR6]], [[SCEVGEP1011]]
-; CHECK-NEXT:    [[BOUND128:%.*]] = icmp ult i8* [[AUR9]], [[SCEVGEP78]]
-; CHECK-NEXT:    [[FOUND_CONFLICT29:%.*]] = and i1 [[BOUND027]], [[BOUND128]]
-; CHECK-NEXT:    [[CONFLICT_RDX30:%.*]] = or i1 [[CONFLICT_RDX26]], [[FOUND_CONFLICT29]]
-; CHECK-NEXT:    br i1 [[CONFLICT_RDX30]], label [[SCALAR_PH:%.*]], label [[VECTOR_BODY:%.*]]
+; CHECK-NEXT:    br i1 [[CONFLICT_RDX22]], label [[SCALAR_PH:%.*]], label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[PRED_UREM_CONTINUE35:%.*]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[PRED_UREM_CONTINUE27:%.*]] ], [ 0, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[ASD]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[AUD]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[ASR]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[AUR]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <2 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i32>, <2 x i32>* [[TMP6]], align 4, !alias.scope !5, !noalias !8
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i32* [[TMP7]] to <2 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD31:%.*]] = load <2 x i32>, <2 x i32>* [[TMP8]], align 4, !alias.scope !12, !noalias !13
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP3]], i32 0
-; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <2 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD32:%.*]] = load <2 x i32>, <2 x i32>* [[TMP10]], align 4, !alias.scope !14, !noalias !15
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <2 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD33:%.*]] = load <2 x i32>, <2 x i32>* [[TMP12]], align 4, !alias.scope !15
-; CHECK-NEXT:    [[TMP13:%.*]] = add nsw <2 x i32> [[WIDE_LOAD]], <i32 23, i32 23>
-; CHECK-NEXT:    [[TMP14:%.*]] = add nsw <2 x i32> [[WIDE_LOAD31]], <i32 24, i32 24>
-; CHECK-NEXT:    [[TMP15:%.*]] = add nsw <2 x i32> [[WIDE_LOAD32]], <i32 25, i32 25>
-; CHECK-NEXT:    [[TMP16:%.*]] = add nsw <2 x i32> [[WIDE_LOAD33]], <i32 26, i32 26>
-; CHECK-NEXT:    [[TMP17:%.*]] = icmp slt <2 x i32> [[WIDE_LOAD]], <i32 100, i32 100>
-; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <2 x i1> [[TMP17]], i32 0
-; CHECK-NEXT:    br i1 [[TMP18]], label [[PRED_UREM_IF:%.*]], label [[PRED_UREM_CONTINUE:%.*]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[ASD]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[AUD]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[ASR]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[AUR]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP5]], align 4, !alias.scope !5, !noalias !8
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD23:%.*]] = load <2 x i32>, ptr [[TMP6]], align 4, !alias.scope !12, !noalias !13
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD24:%.*]] = load <2 x i32>, ptr [[TMP7]], align 4, !alias.scope !14, !noalias !15
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD25:%.*]] = load <2 x i32>, ptr [[TMP8]], align 4, !alias.scope !15
+; CHECK-NEXT:    [[TMP9:%.*]] = add nsw <2 x i32> [[WIDE_LOAD]], <i32 23, i32 23>
+; CHECK-NEXT:    [[TMP10:%.*]] = add nsw <2 x i32> [[WIDE_LOAD23]], <i32 24, i32 24>
+; CHECK-NEXT:    [[TMP11:%.*]] = add nsw <2 x i32> [[WIDE_LOAD24]], <i32 25, i32 25>
+; CHECK-NEXT:    [[TMP12:%.*]] = add nsw <2 x i32> [[WIDE_LOAD25]], <i32 26, i32 26>
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp slt <2 x i32> [[WIDE_LOAD]], <i32 100, i32 100>
+; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <2 x i1> [[TMP13]], i32 0
+; CHECK-NEXT:    br i1 [[TMP14]], label [[PRED_UREM_IF:%.*]], label [[PRED_UREM_CONTINUE:%.*]]
 ; CHECK:       pred.urem.if:
-; CHECK-NEXT:    [[TMP19:%.*]] = extractelement <2 x i32> [[TMP13]], i32 0
-; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 0
-; CHECK-NEXT:    [[TMP21:%.*]] = sdiv i32 [[TMP19]], [[TMP20]]
+; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <2 x i32> [[TMP9]], i32 0
+; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 0
+; CHECK-NEXT:    [[TMP17:%.*]] = sdiv i32 [[TMP15]], [[TMP16]]
+; CHECK-NEXT:    [[TMP18:%.*]] = insertelement <2 x i32> poison, i32 [[TMP17]], i32 0
+; CHECK-NEXT:    [[TMP19:%.*]] = extractelement <2 x i32> [[TMP10]], i32 0
+; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <2 x i32> [[WIDE_LOAD23]], i32 0
+; CHECK-NEXT:    [[TMP21:%.*]] = udiv i32 [[TMP19]], [[TMP20]]
 ; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <2 x i32> poison, i32 [[TMP21]], i32 0
-; CHECK-NEXT:    [[TMP23:%.*]] = extractelement <2 x i32> [[TMP14]], i32 0
-; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <2 x i32> [[WIDE_LOAD31]], i32 0
-; CHECK-NEXT:    [[TMP25:%.*]] = udiv i32 [[TMP23]], [[TMP24]]
+; CHECK-NEXT:    [[TMP23:%.*]] = extractelement <2 x i32> [[TMP11]], i32 0
+; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <2 x i32> [[WIDE_LOAD24]], i32 0
+; CHECK-NEXT:    [[TMP25:%.*]] = srem i32 [[TMP23]], [[TMP24]]
 ; CHECK-NEXT:    [[TMP26:%.*]] = insertelement <2 x i32> poison, i32 [[TMP25]], i32 0
-; CHECK-NEXT:    [[TMP27:%.*]] = extractelement <2 x i32> [[TMP15]], i32 0
-; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <2 x i32> [[WIDE_LOAD32]], i32 0
-; CHECK-NEXT:    [[TMP29:%.*]] = srem i32 [[TMP27]], [[TMP28]]
+; CHECK-NEXT:    [[TMP27:%.*]] = extractelement <2 x i32> [[TMP12]], i32 0
+; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <2 x i32> [[WIDE_LOAD25]], i32 0
+; CHECK-NEXT:    [[TMP29:%.*]] = urem i32 [[TMP27]], [[TMP28]]
 ; CHECK-NEXT:    [[TMP30:%.*]] = insertelement <2 x i32> poison, i32 [[TMP29]], i32 0
-; CHECK-NEXT:    [[TMP31:%.*]] = extractelement <2 x i32> [[TMP16]], i32 0
-; CHECK-NEXT:    [[TMP32:%.*]] = extractelement <2 x i32> [[WIDE_LOAD33]], i32 0
-; CHECK-NEXT:    [[TMP33:%.*]] = urem i32 [[TMP31]], [[TMP32]]
-; CHECK-NEXT:    [[TMP34:%.*]] = insertelement <2 x i32> poison, i32 [[TMP33]], i32 0
 ; CHECK-NEXT:    br label [[PRED_UREM_CONTINUE]]
 ; CHECK:       pred.urem.continue:
-; CHECK-NEXT:    [[TMP35:%.*]] = phi <2 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP22]], [[PRED_UREM_IF]] ]
-; CHECK-NEXT:    [[TMP36:%.*]] = phi <2 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP26]], [[PRED_UREM_IF]] ]
-; CHECK-NEXT:    [[TMP37:%.*]] = phi <2 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP30]], [[PRED_UREM_IF]] ]
-; CHECK-NEXT:    [[TMP38:%.*]] = phi <2 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP34]], [[PRED_UREM_IF]] ]
-; CHECK-NEXT:    [[TMP39:%.*]] = extractelement <2 x i1> [[TMP17]], i32 1
-; CHECK-NEXT:    br i1 [[TMP39]], label [[PRED_UREM_IF34:%.*]], label [[PRED_UREM_CONTINUE35]]
-; CHECK:       pred.urem.if34:
-; CHECK-NEXT:    [[TMP40:%.*]] = extractelement <2 x i32> [[TMP13]], i32 1
-; CHECK-NEXT:    [[TMP41:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 1
-; CHECK-NEXT:    [[TMP42:%.*]] = sdiv i32 [[TMP40]], [[TMP41]]
-; CHECK-NEXT:    [[TMP43:%.*]] = insertelement <2 x i32> [[TMP35]], i32 [[TMP42]], i32 1
-; CHECK-NEXT:    [[TMP44:%.*]] = extractelement <2 x i32> [[TMP14]], i32 1
-; CHECK-NEXT:    [[TMP45:%.*]] = extractelement <2 x i32> [[WIDE_LOAD31]], i32 1
-; CHECK-NEXT:    [[TMP46:%.*]] = udiv i32 [[TMP44]], [[TMP45]]
-; CHECK-NEXT:    [[TMP47:%.*]] = insertelement <2 x i32> [[TMP36]], i32 [[TMP46]], i32 1
-; CHECK-NEXT:    [[TMP48:%.*]] = extractelement <2 x i32> [[TMP15]], i32 1
-; CHECK-NEXT:    [[TMP49:%.*]] = extractelement <2 x i32> [[WIDE_LOAD32]], i32 1
-; CHECK-NEXT:    [[TMP50:%.*]] = srem i32 [[TMP48]], [[TMP49]]
-; CHECK-NEXT:    [[TMP51:%.*]] = insertelement <2 x i32> [[TMP37]], i32 [[TMP50]], i32 1
-; CHECK-NEXT:    [[TMP52:%.*]] = extractelement <2 x i32> [[TMP16]], i32 1
-; CHECK-NEXT:    [[TMP53:%.*]] = extractelement <2 x i32> [[WIDE_LOAD33]], i32 1
-; CHECK-NEXT:    [[TMP54:%.*]] = urem i32 [[TMP52]], [[TMP53]]
-; CHECK-NEXT:    [[TMP55:%.*]] = insertelement <2 x i32> [[TMP38]], i32 [[TMP54]], i32 1
-; CHECK-NEXT:    br label [[PRED_UREM_CONTINUE35]]
-; CHECK:       pred.urem.continue35:
-; CHECK-NEXT:    [[TMP56:%.*]] = phi <2 x i32> [ [[TMP35]], [[PRED_UREM_CONTINUE]] ], [ [[TMP43]], [[PRED_UREM_IF34]] ]
-; CHECK-NEXT:    [[TMP57:%.*]] = phi <2 x i32> [ [[TMP36]], [[PRED_UREM_CONTINUE]] ], [ [[TMP47]], [[PRED_UREM_IF34]] ]
-; CHECK-NEXT:    [[TMP58:%.*]] = phi <2 x i32> [ [[TMP37]], [[PRED_UREM_CONTINUE]] ], [ [[TMP51]], [[PRED_UREM_IF34]] ]
-; CHECK-NEXT:    [[TMP59:%.*]] = phi <2 x i32> [ [[TMP38]], [[PRED_UREM_CONTINUE]] ], [ [[TMP55]], [[PRED_UREM_IF34]] ]
-; CHECK-NEXT:    [[TMP60:%.*]] = xor <2 x i1> [[TMP17]], <i1 true, i1 true>
-; CHECK-NEXT:    [[PREDPHI:%.*]] = select <2 x i1> [[TMP60]], <2 x i32> [[TMP13]], <2 x i32> [[TMP56]]
-; CHECK-NEXT:    [[PREDPHI36:%.*]] = select <2 x i1> [[TMP60]], <2 x i32> [[TMP14]], <2 x i32> [[TMP57]]
-; CHECK-NEXT:    [[PREDPHI37:%.*]] = select <2 x i1> [[TMP60]], <2 x i32> [[TMP15]], <2 x i32> [[TMP58]]
-; CHECK-NEXT:    [[PREDPHI38:%.*]] = select <2 x i1> [[TMP60]], <2 x i32> [[TMP16]], <2 x i32> [[TMP59]]
-; CHECK-NEXT:    [[TMP61:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP62:%.*]] = bitcast i32* [[TMP61]] to <2 x i32>*
-; CHECK-NEXT:    store <2 x i32> [[PREDPHI]], <2 x i32>* [[TMP62]], align 4, !alias.scope !5, !noalias !8
-; CHECK-NEXT:    [[TMP63:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP64:%.*]] = bitcast i32* [[TMP63]] to <2 x i32>*
-; CHECK-NEXT:    store <2 x i32> [[PREDPHI36]], <2 x i32>* [[TMP64]], align 4, !alias.scope !12, !noalias !13
-; CHECK-NEXT:    [[TMP65:%.*]] = getelementptr inbounds i32, i32* [[TMP3]], i32 0
-; CHECK-NEXT:    [[TMP66:%.*]] = bitcast i32* [[TMP65]] to <2 x i32>*
-; CHECK-NEXT:    store <2 x i32> [[PREDPHI37]], <2 x i32>* [[TMP66]], align 4, !alias.scope !14, !noalias !15
-; CHECK-NEXT:    [[TMP67:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0
-; CHECK-NEXT:    [[TMP68:%.*]] = bitcast i32* [[TMP67]] to <2 x i32>*
-; CHECK-NEXT:    store <2 x i32> [[PREDPHI38]], <2 x i32>* [[TMP68]], align 4, !alias.scope !15
+; CHECK-NEXT:    [[TMP31:%.*]] = phi <2 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP18]], [[PRED_UREM_IF]] ]
+; CHECK-NEXT:    [[TMP32:%.*]] = phi <2 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP22]], [[PRED_UREM_IF]] ]
+; CHECK-NEXT:    [[TMP33:%.*]] = phi <2 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP26]], [[PRED_UREM_IF]] ]
+; CHECK-NEXT:    [[TMP34:%.*]] = phi <2 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP30]], [[PRED_UREM_IF]] ]
+; CHECK-NEXT:    [[TMP35:%.*]] = extractelement <2 x i1> [[TMP13]], i32 1
+; CHECK-NEXT:    br i1 [[TMP35]], label [[PRED_UREM_IF26:%.*]], label [[PRED_UREM_CONTINUE27]]
+; CHECK:       pred.urem.if26:
+; CHECK-NEXT:    [[TMP36:%.*]] = extractelement <2 x i32> [[TMP9]], i32 1
+; CHECK-NEXT:    [[TMP37:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 1
+; CHECK-NEXT:    [[TMP38:%.*]] = sdiv i32 [[TMP36]], [[TMP37]]
+; CHECK-NEXT:    [[TMP39:%.*]] = insertelement <2 x i32> [[TMP31]], i32 [[TMP38]], i32 1
+; CHECK-NEXT:    [[TMP40:%.*]] = extractelement <2 x i32> [[TMP10]], i32 1
+; CHECK-NEXT:    [[TMP41:%.*]] = extractelement <2 x i32> [[WIDE_LOAD23]], i32 1
+; CHECK-NEXT:    [[TMP42:%.*]] = udiv i32 [[TMP40]], [[TMP41]]
+; CHECK-NEXT:    [[TMP43:%.*]] = insertelement <2 x i32> [[TMP32]], i32 [[TMP42]], i32 1
+; CHECK-NEXT:    [[TMP44:%.*]] = extractelement <2 x i32> [[TMP11]], i32 1
+; CHECK-NEXT:    [[TMP45:%.*]] = extractelement <2 x i32> [[WIDE_LOAD24]], i32 1
+; CHECK-NEXT:    [[TMP46:%.*]] = srem i32 [[TMP44]], [[TMP45]]
+; CHECK-NEXT:    [[TMP47:%.*]] = insertelement <2 x i32> [[TMP33]], i32 [[TMP46]], i32 1
+; CHECK-NEXT:    [[TMP48:%.*]] = extractelement <2 x i32> [[TMP12]], i32 1
+; CHECK-NEXT:    [[TMP49:%.*]] = extractelement <2 x i32> [[WIDE_LOAD25]], i32 1
+; CHECK-NEXT:    [[TMP50:%.*]] = urem i32 [[TMP48]], [[TMP49]]
+; CHECK-NEXT:    [[TMP51:%.*]] = insertelement <2 x i32> [[TMP34]], i32 [[TMP50]], i32 1
+; CHECK-NEXT:    br label [[PRED_UREM_CONTINUE27]]
+; CHECK:       pred.urem.continue27:
+; CHECK-NEXT:    [[TMP52:%.*]] = phi <2 x i32> [ [[TMP31]], [[PRED_UREM_CONTINUE]] ], [ [[TMP39]], [[PRED_UREM_IF26]] ]
+; CHECK-NEXT:    [[TMP53:%.*]] = phi <2 x i32> [ [[TMP32]], [[PRED_UREM_CONTINUE]] ], [ [[TMP43]], [[PRED_UREM_IF26]] ]
+; CHECK-NEXT:    [[TMP54:%.*]] = phi <2 x i32> [ [[TMP33]], [[PRED_UREM_CONTINUE]] ], [ [[TMP47]], [[PRED_UREM_IF26]] ]
+; CHECK-NEXT:    [[TMP55:%.*]] = phi <2 x i32> [ [[TMP34]], [[PRED_UREM_CONTINUE]] ], [ [[TMP51]], [[PRED_UREM_IF26]] ]
+; CHECK-NEXT:    [[TMP56:%.*]] = xor <2 x i1> [[TMP13]], <i1 true, i1 true>
+; CHECK-NEXT:    [[PREDPHI:%.*]] = select <2 x i1> [[TMP56]], <2 x i32> [[TMP9]], <2 x i32> [[TMP52]]
+; CHECK-NEXT:    [[PREDPHI28:%.*]] = select <2 x i1> [[TMP56]], <2 x i32> [[TMP10]], <2 x i32> [[TMP53]]
+; CHECK-NEXT:    [[PREDPHI29:%.*]] = select <2 x i1> [[TMP56]], <2 x i32> [[TMP11]], <2 x i32> [[TMP54]]
+; CHECK-NEXT:    [[PREDPHI30:%.*]] = select <2 x i1> [[TMP56]], <2 x i32> [[TMP12]], <2 x i32> [[TMP55]]
+; CHECK-NEXT:    [[TMP57:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
+; CHECK-NEXT:    store <2 x i32> [[PREDPHI]], ptr [[TMP57]], align 4, !alias.scope !5, !noalias !8
+; CHECK-NEXT:    [[TMP58:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 0
+; CHECK-NEXT:    store <2 x i32> [[PREDPHI28]], ptr [[TMP58]], align 4, !alias.scope !12, !noalias !13
+; CHECK-NEXT:    [[TMP59:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 0
+; CHECK-NEXT:    store <2 x i32> [[PREDPHI29]], ptr [[TMP59]], align 4, !alias.scope !14, !noalias !15
+; CHECK-NEXT:    [[TMP60:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0
+; CHECK-NEXT:    store <2 x i32> [[PREDPHI30]], ptr [[TMP60]], align 4, !alias.scope !15
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP69:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128
-; CHECK-NEXT:    br i1 [[TMP69]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; CHECK-NEXT:    [[TMP61:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128
+; CHECK-NEXT:    br i1 [[TMP61]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 128, 128
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
@@ -150,14 +134,14 @@ define void @test(i32* nocapture %asd, i32* nocapture %aud,
 ; CHECK-NEXT:    ret void
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[IF_END:%.*]] ]
-; CHECK-NEXT:    [[ISD:%.*]] = getelementptr inbounds i32, i32* [[ASD]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[IUD:%.*]] = getelementptr inbounds i32, i32* [[AUD]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[ISR:%.*]] = getelementptr inbounds i32, i32* [[ASR]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[IUR:%.*]] = getelementptr inbounds i32, i32* [[AUR]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[LSD:%.*]] = load i32, i32* [[ISD]], align 4
-; CHECK-NEXT:    [[LUD:%.*]] = load i32, i32* [[IUD]], align 4
-; CHECK-NEXT:    [[LSR:%.*]] = load i32, i32* [[ISR]], align 4
-; CHECK-NEXT:    [[LUR:%.*]] = load i32, i32* [[IUR]], align 4
+; CHECK-NEXT:    [[ISD:%.*]] = getelementptr inbounds i32, ptr [[ASD]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[IUD:%.*]] = getelementptr inbounds i32, ptr [[AUD]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[ISR:%.*]] = getelementptr inbounds i32, ptr [[ASR]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[IUR:%.*]] = getelementptr inbounds i32, ptr [[AUR]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[LSD:%.*]] = load i32, ptr [[ISD]], align 4
+; CHECK-NEXT:    [[LUD:%.*]] = load i32, ptr [[IUD]], align 4
+; CHECK-NEXT:    [[LSR:%.*]] = load i32, ptr [[ISR]], align 4
+; CHECK-NEXT:    [[LUR:%.*]] = load i32, ptr [[IUR]], align 4
 ; CHECK-NEXT:    [[PSD:%.*]] = add nsw i32 [[LSD]], 23
 ; CHECK-NEXT:    [[PUD:%.*]] = add nsw i32 [[LUD]], 24
 ; CHECK-NEXT:    [[PSR:%.*]] = add nsw i32 [[LSR]], 25
@@ -175,131 +159,123 @@ define void @test(i32* nocapture %asd, i32* nocapture %aud,
 ; CHECK-NEXT:    [[YUD_0:%.*]] = phi i32 [ [[RUD]], [[IF_THEN]] ], [ [[PUD]], [[FOR_BODY]] ]
 ; CHECK-NEXT:    [[YSR_0:%.*]] = phi i32 [ [[RSR]], [[IF_THEN]] ], [ [[PSR]], [[FOR_BODY]] ]
 ; CHECK-NEXT:    [[YUR_0:%.*]] = phi i32 [ [[RUR]], [[IF_THEN]] ], [ [[PUR]], [[FOR_BODY]] ]
-; CHECK-NEXT:    store i32 [[YSD_0]], i32* [[ISD]], align 4
-; CHECK-NEXT:    store i32 [[YUD_0]], i32* [[IUD]], align 4
-; CHECK-NEXT:    store i32 [[YSR_0]], i32* [[ISR]], align 4
-; CHECK-NEXT:    store i32 [[YUR_0]], i32* [[IUR]], align 4
+; CHECK-NEXT:    store i32 [[YSD_0]], ptr [[ISD]], align 4
+; CHECK-NEXT:    store i32 [[YUD_0]], ptr [[IUD]], align 4
+; CHECK-NEXT:    store i32 [[YSR_0]], ptr [[ISR]], align 4
+; CHECK-NEXT:    store i32 [[YUR_0]], ptr [[IUR]], align 4
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 128
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
 ;
 ; UNROLL-NO-VF-LABEL: @test(
 ; UNROLL-NO-VF-NEXT:  entry:
-; UNROLL-NO-VF-NEXT:    [[ASD1:%.*]] = bitcast i32* [[ASD:%.*]] to i8*
-; UNROLL-NO-VF-NEXT:    [[AUD3:%.*]] = bitcast i32* [[AUD:%.*]] to i8*
-; UNROLL-NO-VF-NEXT:    [[ASR6:%.*]] = bitcast i32* [[ASR:%.*]] to i8*
-; UNROLL-NO-VF-NEXT:    [[AUR9:%.*]] = bitcast i32* [[AUR:%.*]] to i8*
 ; UNROLL-NO-VF-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
 ; UNROLL-NO-VF:       vector.memcheck:
-; UNROLL-NO-VF-NEXT:    [[SCEVGEP:%.*]] = getelementptr i32, i32* [[ASD]], i64 128
-; UNROLL-NO-VF-NEXT:    [[SCEVGEP2:%.*]] = bitcast i32* [[SCEVGEP]] to i8*
-; UNROLL-NO-VF-NEXT:    [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[AUD]], i64 128
-; UNROLL-NO-VF-NEXT:    [[SCEVGEP45:%.*]] = bitcast i32* [[SCEVGEP4]] to i8*
-; UNROLL-NO-VF-NEXT:    [[SCEVGEP7:%.*]] = getelementptr i32, i32* [[ASR]], i64 128
-; UNROLL-NO-VF-NEXT:    [[SCEVGEP78:%.*]] = bitcast i32* [[SCEVGEP7]] to i8*
-; UNROLL-NO-VF-NEXT:    [[SCEVGEP10:%.*]] = getelementptr i32, i32* [[AUR]], i64 128
-; UNROLL-NO-VF-NEXT:    [[SCEVGEP1011:%.*]] = bitcast i32* [[SCEVGEP10]] to i8*
-; UNROLL-NO-VF-NEXT:    [[BOUND0:%.*]] = icmp ult i8* [[ASD1]], [[SCEVGEP45]]
-; UNROLL-NO-VF-NEXT:    [[BOUND1:%.*]] = icmp ult i8* [[AUD3]], [[SCEVGEP2]]
+; UNROLL-NO-VF-NEXT:    [[UGLYGEP:%.*]] = getelementptr i8, ptr [[ASD:%.*]], i64 512
+; UNROLL-NO-VF-NEXT:    [[UGLYGEP1:%.*]] = getelementptr i8, ptr [[AUD:%.*]], i64 512
+; UNROLL-NO-VF-NEXT:    [[UGLYGEP2:%.*]] = getelementptr i8, ptr [[ASR:%.*]], i64 512
+; UNROLL-NO-VF-NEXT:    [[UGLYGEP3:%.*]] = getelementptr i8, ptr [[AUR:%.*]], i64 512
+; UNROLL-NO-VF-NEXT:    [[BOUND0:%.*]] = icmp ult ptr [[ASD]], [[UGLYGEP1]]
+; UNROLL-NO-VF-NEXT:    [[BOUND1:%.*]] = icmp ult ptr [[AUD]], [[UGLYGEP]]
 ; UNROLL-NO-VF-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
-; UNROLL-NO-VF-NEXT:    [[BOUND012:%.*]] = icmp ult i8* [[ASD1]], [[SCEVGEP78]]
-; UNROLL-NO-VF-NEXT:    [[BOUND113:%.*]] = icmp ult i8* [[ASR6]], [[SCEVGEP2]]
-; UNROLL-NO-VF-NEXT:    [[FOUND_CONFLICT14:%.*]] = and i1 [[BOUND012]], [[BOUND113]]
-; UNROLL-NO-VF-NEXT:    [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT14]]
-; UNROLL-NO-VF-NEXT:    [[BOUND015:%.*]] = icmp ult i8* [[ASD1]], [[SCEVGEP1011]]
-; UNROLL-NO-VF-NEXT:    [[BOUND116:%.*]] = icmp ult i8* [[AUR9]], [[SCEVGEP2]]
+; UNROLL-NO-VF-NEXT:    [[BOUND04:%.*]] = icmp ult ptr [[ASD]], [[UGLYGEP2]]
+; UNROLL-NO-VF-NEXT:    [[BOUND15:%.*]] = icmp ult ptr [[ASR]], [[UGLYGEP]]
+; UNROLL-NO-VF-NEXT:    [[FOUND_CONFLICT6:%.*]] = and i1 [[BOUND04]], [[BOUND15]]
+; UNROLL-NO-VF-NEXT:    [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT6]]
+; UNROLL-NO-VF-NEXT:    [[BOUND07:%.*]] = icmp ult ptr [[ASD]], [[UGLYGEP3]]
+; UNROLL-NO-VF-NEXT:    [[BOUND18:%.*]] = icmp ult ptr [[AUR]], [[UGLYGEP]]
+; UNROLL-NO-VF-NEXT:    [[FOUND_CONFLICT9:%.*]] = and i1 [[BOUND07]], [[BOUND18]]
+; UNROLL-NO-VF-NEXT:    [[CONFLICT_RDX10:%.*]] = or i1 [[CONFLICT_RDX]], [[FOUND_CONFLICT9]]
+; UNROLL-NO-VF-NEXT:    [[BOUND011:%.*]] = icmp ult ptr [[AUD]], [[UGLYGEP2]]
+; UNROLL-NO-VF-NEXT:    [[BOUND112:%.*]] = icmp ult ptr [[ASR]], [[UGLYGEP1]]
+; UNROLL-NO-VF-NEXT:    [[FOUND_CONFLICT13:%.*]] = and i1 [[BOUND011]], [[BOUND112]]
+; UNROLL-NO-VF-NEXT:    [[CONFLICT_RDX14:%.*]] = or i1 [[CONFLICT_RDX10]], [[FOUND_CONFLICT13]]
+; UNROLL-NO-VF-NEXT:    [[BOUND015:%.*]] = icmp ult ptr [[AUD]], [[UGLYGEP3]]
+; UNROLL-NO-VF-NEXT:    [[BOUND116:%.*]] = icmp ult ptr [[AUR]], [[UGLYGEP1]]
 ; UNROLL-NO-VF-NEXT:    [[FOUND_CONFLICT17:%.*]] = and i1 [[BOUND015]], [[BOUND116]]
-; UNROLL-NO-VF-NEXT:    [[CONFLICT_RDX18:%.*]] = or i1 [[CONFLICT_RDX]], [[FOUND_CONFLICT17]]
-; UNROLL-NO-VF-NEXT:    [[BOUND019:%.*]] = icmp ult i8* [[AUD3]], [[SCEVGEP78]]
-; UNROLL-NO-VF-NEXT:    [[BOUND120:%.*]] = icmp ult i8* [[ASR6]], [[SCEVGEP45]]
+; UNROLL-NO-VF-NEXT:    [[CONFLICT_RDX18:%.*]] = or i1 [[CONFLICT_RDX14]], [[FOUND_CONFLICT17]]
+; UNROLL-NO-VF-NEXT:    [[BOUND019:%.*]] = icmp ult ptr [[ASR]], [[UGLYGEP3]]
+; UNROLL-NO-VF-NEXT:    [[BOUND120:%.*]] = icmp ult ptr [[AUR]], [[UGLYGEP2]]
 ; UNROLL-NO-VF-NEXT:    [[FOUND_CONFLICT21:%.*]] = and i1 [[BOUND019]], [[BOUND120]]
 ; UNROLL-NO-VF-NEXT:    [[CONFLICT_RDX22:%.*]] = or i1 [[CONFLICT_RDX18]], [[FOUND_CONFLICT21]]
-; UNROLL-NO-VF-NEXT:    [[BOUND023:%.*]] = icmp ult i8* [[AUD3]], [[SCEVGEP1011]]
-; UNROLL-NO-VF-NEXT:    [[BOUND124:%.*]] = icmp ult i8* [[AUR9]], [[SCEVGEP45]]
-; UNROLL-NO-VF-NEXT:    [[FOUND_CONFLICT25:%.*]] = and i1 [[BOUND023]], [[BOUND124]]
-; UNROLL-NO-VF-NEXT:    [[CONFLICT_RDX26:%.*]] = or i1 [[CONFLICT_RDX22]], [[FOUND_CONFLICT25]]
-; UNROLL-NO-VF-NEXT:    [[BOUND027:%.*]] = icmp ult i8* [[ASR6]], [[SCEVGEP1011]]
-; UNROLL-NO-VF-NEXT:    [[BOUND128:%.*]] = icmp ult i8* [[AUR9]], [[SCEVGEP78]]
-; UNROLL-NO-VF-NEXT:    [[FOUND_CONFLICT29:%.*]] = and i1 [[BOUND027]], [[BOUND128]]
-; UNROLL-NO-VF-NEXT:    [[CONFLICT_RDX30:%.*]] = or i1 [[CONFLICT_RDX26]], [[FOUND_CONFLICT29]]
-; UNROLL-NO-VF-NEXT:    br i1 [[CONFLICT_RDX30]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; UNROLL-NO-VF-NEXT:    br i1 [[CONFLICT_RDX22]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
 ; UNROLL-NO-VF:       vector.ph:
 ; UNROLL-NO-VF-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; UNROLL-NO-VF:       vector.body:
-; UNROLL-NO-VF-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_UREM_CONTINUE33:%.*]] ]
-; UNROLL-NO-VF-NEXT:    [[INDUCTION:%.*]] = add i64 [[INDEX]], 0
-; UNROLL-NO-VF-NEXT:    [[INDUCTION31:%.*]] = add i64 [[INDEX]], 1
-; UNROLL-NO-VF-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[ASD]], i64 [[INDUCTION]]
-; UNROLL-NO-VF-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[ASD]], i64 [[INDUCTION31]]
-; UNROLL-NO-VF-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[AUD]], i64 [[INDUCTION]]
-; UNROLL-NO-VF-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[AUD]], i64 [[INDUCTION31]]
-; UNROLL-NO-VF-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[ASR]], i64 [[INDUCTION]]
-; UNROLL-NO-VF-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[ASR]], i64 [[INDUCTION31]]
-; UNROLL-NO-VF-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[AUR]], i64 [[INDUCTION]]
-; UNROLL-NO-VF-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[AUR]], i64 [[INDUCTION31]]
-; UNROLL-NO-VF-NEXT:    [[TMP8:%.*]] = load i32, i32* [[TMP0]], align 4, !alias.scope !5, !noalias !8
-; UNROLL-NO-VF-NEXT:    [[TMP9:%.*]] = load i32, i32* [[TMP1]], align 4, !alias.scope !5, !noalias !8
-; UNROLL-NO-VF-NEXT:    [[TMP10:%.*]] = load i32, i32* [[TMP2]], align 4, !alias.scope !12, !noalias !13
-; UNROLL-NO-VF-NEXT:    [[TMP11:%.*]] = load i32, i32* [[TMP3]], align 4, !alias.scope !12, !noalias !13
-; UNROLL-NO-VF-NEXT:    [[TMP12:%.*]] = load i32, i32* [[TMP4]], align 4, !alias.scope !14, !noalias !15
-; UNROLL-NO-VF-NEXT:    [[TMP13:%.*]] = load i32, i32* [[TMP5]], align 4, !alias.scope !14, !noalias !15
-; UNROLL-NO-VF-NEXT:    [[TMP14:%.*]] = load i32, i32* [[TMP6]], align 4, !alias.scope !15
-; UNROLL-NO-VF-NEXT:    [[TMP15:%.*]] = load i32, i32* [[TMP7]], align 4, !alias.scope !15
-; UNROLL-NO-VF-NEXT:    [[TMP16:%.*]] = add nsw i32 [[TMP8]], 23
-; UNROLL-NO-VF-NEXT:    [[TMP17:%.*]] = add nsw i32 [[TMP9]], 23
-; UNROLL-NO-VF-NEXT:    [[TMP18:%.*]] = add nsw i32 [[TMP10]], 24
-; UNROLL-NO-VF-NEXT:    [[TMP19:%.*]] = add nsw i32 [[TMP11]], 24
-; UNROLL-NO-VF-NEXT:    [[TMP20:%.*]] = add nsw i32 [[TMP12]], 25
-; UNROLL-NO-VF-NEXT:    [[TMP21:%.*]] = add nsw i32 [[TMP13]], 25
-; UNROLL-NO-VF-NEXT:    [[TMP22:%.*]] = add nsw i32 [[TMP14]], 26
-; UNROLL-NO-VF-NEXT:    [[TMP23:%.*]] = add nsw i32 [[TMP15]], 26
-; UNROLL-NO-VF-NEXT:    [[TMP24:%.*]] = icmp slt i32 [[TMP8]], 100
-; UNROLL-NO-VF-NEXT:    [[TMP25:%.*]] = icmp slt i32 [[TMP9]], 100
-; UNROLL-NO-VF-NEXT:    br i1 [[TMP24]], label [[PRED_UREM_IF:%.*]], label [[PRED_UREM_CONTINUE:%.*]]
+; UNROLL-NO-VF-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_UREM_CONTINUE24:%.*]] ]
+; UNROLL-NO-VF-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; UNROLL-NO-VF-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 1
+; UNROLL-NO-VF-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[ASD]], i64 [[TMP0]]
+; UNROLL-NO-VF-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[ASD]], i64 [[TMP1]]
+; UNROLL-NO-VF-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[AUD]], i64 [[TMP0]]
+; UNROLL-NO-VF-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[AUD]], i64 [[TMP1]]
+; UNROLL-NO-VF-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[ASR]], i64 [[TMP0]]
+; UNROLL-NO-VF-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[ASR]], i64 [[TMP1]]
+; UNROLL-NO-VF-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[AUR]], i64 [[TMP0]]
+; UNROLL-NO-VF-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[AUR]], i64 [[TMP1]]
+; UNROLL-NO-VF-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP2]], align 4, !alias.scope !5, !noalias !8
+; UNROLL-NO-VF-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP3]], align 4, !alias.scope !5, !noalias !8
+; UNROLL-NO-VF-NEXT:    [[TMP12:%.*]] = load i32, ptr [[TMP4]], align 4, !alias.scope !12, !noalias !13
+; UNROLL-NO-VF-NEXT:    [[TMP13:%.*]] = load i32, ptr [[TMP5]], align 4, !alias.scope !12, !noalias !13
+; UNROLL-NO-VF-NEXT:    [[TMP14:%.*]] = load i32, ptr [[TMP6]], align 4, !alias.scope !14, !noalias !15
+; UNROLL-NO-VF-NEXT:    [[TMP15:%.*]] = load i32, ptr [[TMP7]], align 4, !alias.scope !14, !noalias !15
+; UNROLL-NO-VF-NEXT:    [[TMP16:%.*]] = load i32, ptr [[TMP8]], align 4, !alias.scope !15
+; UNROLL-NO-VF-NEXT:    [[TMP17:%.*]] = load i32, ptr [[TMP9]], align 4, !alias.scope !15
+; UNROLL-NO-VF-NEXT:    [[TMP18:%.*]] = add nsw i32 [[TMP10]], 23
+; UNROLL-NO-VF-NEXT:    [[TMP19:%.*]] = add nsw i32 [[TMP11]], 23
+; UNROLL-NO-VF-NEXT:    [[TMP20:%.*]] = add nsw i32 [[TMP12]], 24
+; UNROLL-NO-VF-NEXT:    [[TMP21:%.*]] = add nsw i32 [[TMP13]], 24
+; UNROLL-NO-VF-NEXT:    [[TMP22:%.*]] = add nsw i32 [[TMP14]], 25
+; UNROLL-NO-VF-NEXT:    [[TMP23:%.*]] = add nsw i32 [[TMP15]], 25
+; UNROLL-NO-VF-NEXT:    [[TMP24:%.*]] = add nsw i32 [[TMP16]], 26
+; UNROLL-NO-VF-NEXT:    [[TMP25:%.*]] = add nsw i32 [[TMP17]], 26
+; UNROLL-NO-VF-NEXT:    [[TMP26:%.*]] = icmp slt i32 [[TMP10]], 100
+; UNROLL-NO-VF-NEXT:    [[TMP27:%.*]] = icmp slt i32 [[TMP11]], 100
+; UNROLL-NO-VF-NEXT:    br i1 [[TMP26]], label [[PRED_UREM_IF:%.*]], label [[PRED_UREM_CONTINUE:%.*]]
 ; UNROLL-NO-VF:       pred.urem.if:
-; UNROLL-NO-VF-NEXT:    [[TMP26:%.*]] = sdiv i32 [[TMP16]], [[TMP8]]
-; UNROLL-NO-VF-NEXT:    [[TMP27:%.*]] = udiv i32 [[TMP18]], [[TMP10]]
-; UNROLL-NO-VF-NEXT:    [[TMP28:%.*]] = srem i32 [[TMP20]], [[TMP12]]
-; UNROLL-NO-VF-NEXT:    [[TMP29:%.*]] = urem i32 [[TMP22]], [[TMP14]]
+; UNROLL-NO-VF-NEXT:    [[TMP28:%.*]] = sdiv i32 [[TMP18]], [[TMP10]]
+; UNROLL-NO-VF-NEXT:    [[TMP29:%.*]] = udiv i32 [[TMP20]], [[TMP12]]
+; UNROLL-NO-VF-NEXT:    [[TMP30:%.*]] = srem i32 [[TMP22]], [[TMP14]]
+; UNROLL-NO-VF-NEXT:    [[TMP31:%.*]] = urem i32 [[TMP24]], [[TMP16]]
 ; UNROLL-NO-VF-NEXT:    br label [[PRED_UREM_CONTINUE]]
 ; UNROLL-NO-VF:       pred.urem.continue:
-; UNROLL-NO-VF-NEXT:    [[TMP30:%.*]] = phi i32 [ poison, [[VECTOR_BODY]] ], [ [[TMP26]], [[PRED_UREM_IF]] ]
-; UNROLL-NO-VF-NEXT:    [[TMP31:%.*]] = phi i32 [ poison, [[VECTOR_BODY]] ], [ [[TMP27]], [[PRED_UREM_IF]] ]
 ; UNROLL-NO-VF-NEXT:    [[TMP32:%.*]] = phi i32 [ poison, [[VECTOR_BODY]] ], [ [[TMP28]], [[PRED_UREM_IF]] ]
 ; UNROLL-NO-VF-NEXT:    [[TMP33:%.*]] = phi i32 [ poison, [[VECTOR_BODY]] ], [ [[TMP29]], [[PRED_UREM_IF]] ]
-; UNROLL-NO-VF-NEXT:    br i1 [[TMP25]], label [[PRED_UREM_IF32:%.*]], label [[PRED_UREM_CONTINUE33]]
-; UNROLL-NO-VF:       pred.urem.if31:
-; UNROLL-NO-VF-NEXT:    [[TMP34:%.*]] = sdiv i32 [[TMP17]], [[TMP9]]
-; UNROLL-NO-VF-NEXT:    [[TMP35:%.*]] = udiv i32 [[TMP19]], [[TMP11]]
-; UNROLL-NO-VF-NEXT:    [[TMP36:%.*]] = srem i32 [[TMP21]], [[TMP13]]
-; UNROLL-NO-VF-NEXT:    [[TMP37:%.*]] = urem i32 [[TMP23]], [[TMP15]]
-; UNROLL-NO-VF-NEXT:    br label [[PRED_UREM_CONTINUE33]]
-; UNROLL-NO-VF:       pred.urem.continue32:
-; UNROLL-NO-VF-NEXT:    [[TMP38:%.*]] = phi i32 [ poison, [[PRED_UREM_CONTINUE]] ], [ [[TMP34]], [[PRED_UREM_IF32]] ]
-; UNROLL-NO-VF-NEXT:    [[TMP39:%.*]] = phi i32 [ poison, [[PRED_UREM_CONTINUE]] ], [ [[TMP35]], [[PRED_UREM_IF32]] ]
-; UNROLL-NO-VF-NEXT:    [[TMP40:%.*]] = phi i32 [ poison, [[PRED_UREM_CONTINUE]] ], [ [[TMP36]], [[PRED_UREM_IF32]] ]
-; UNROLL-NO-VF-NEXT:    [[TMP41:%.*]] = phi i32 [ poison, [[PRED_UREM_CONTINUE]] ], [ [[TMP37]], [[PRED_UREM_IF32]] ]
-; UNROLL-NO-VF-NEXT:    [[TMP42:%.*]] = xor i1 [[TMP24]], true
-; UNROLL-NO-VF-NEXT:    [[TMP43:%.*]] = xor i1 [[TMP25]], true
-; UNROLL-NO-VF-NEXT:    [[PREDPHI:%.*]] = select i1 [[TMP42]], i32 [[TMP16]], i32 [[TMP30]]
-; UNROLL-NO-VF-NEXT:    [[PREDPHI34:%.*]] = select i1 [[TMP43]], i32 [[TMP17]], i32 [[TMP38]]
-; UNROLL-NO-VF-NEXT:    [[PREDPHI35:%.*]] = select i1 [[TMP42]], i32 [[TMP18]], i32 [[TMP31]]
-; UNROLL-NO-VF-NEXT:    [[PREDPHI36:%.*]] = select i1 [[TMP43]], i32 [[TMP19]], i32 [[TMP39]]
-; UNROLL-NO-VF-NEXT:    [[PREDPHI37:%.*]] = select i1 [[TMP42]], i32 [[TMP20]], i32 [[TMP32]]
-; UNROLL-NO-VF-NEXT:    [[PREDPHI38:%.*]] = select i1 [[TMP43]], i32 [[TMP21]], i32 [[TMP40]]
-; UNROLL-NO-VF-NEXT:    [[PREDPHI39:%.*]] = select i1 [[TMP42]], i32 [[TMP22]], i32 [[TMP33]]
-; UNROLL-NO-VF-NEXT:    [[PREDPHI40:%.*]] = select i1 [[TMP43]], i32 [[TMP23]], i32 [[TMP41]]
-; UNROLL-NO-VF-NEXT:    store i32 [[PREDPHI]], i32* [[TMP0]], align 4, !alias.scope !5, !noalias !8
-; UNROLL-NO-VF-NEXT:    store i32 [[PREDPHI34]], i32* [[TMP1]], align 4, !alias.scope !5, !noalias !8
-; UNROLL-NO-VF-NEXT:    store i32 [[PREDPHI35]], i32* [[TMP2]], align 4, !alias.scope !12, !noalias !13
-; UNROLL-NO-VF-NEXT:    store i32 [[PREDPHI36]], i32* [[TMP3]], align 4, !alias.scope !12, !noalias !13
-; UNROLL-NO-VF-NEXT:    store i32 [[PREDPHI37]], i32* [[TMP4]], align 4, !alias.scope !14, !noalias !15
-; UNROLL-NO-VF-NEXT:    store i32 [[PREDPHI38]], i32* [[TMP5]], align 4, !alias.scope !14, !noalias !15
-; UNROLL-NO-VF-NEXT:    store i32 [[PREDPHI39]], i32* [[TMP6]], align 4, !alias.scope !15
-; UNROLL-NO-VF-NEXT:    store i32 [[PREDPHI40]], i32* [[TMP7]], align 4, !alias.scope !15
+; UNROLL-NO-VF-NEXT:    [[TMP34:%.*]] = phi i32 [ poison, [[VECTOR_BODY]] ], [ [[TMP30]], [[PRED_UREM_IF]] ]
+; UNROLL-NO-VF-NEXT:    [[TMP35:%.*]] = phi i32 [ poison, [[VECTOR_BODY]] ], [ [[TMP31]], [[PRED_UREM_IF]] ]
+; UNROLL-NO-VF-NEXT:    br i1 [[TMP27]], label [[PRED_UREM_IF23:%.*]], label [[PRED_UREM_CONTINUE24]]
+; UNROLL-NO-VF:       pred.urem.if23:
+; UNROLL-NO-VF-NEXT:    [[TMP36:%.*]] = sdiv i32 [[TMP19]], [[TMP11]]
+; UNROLL-NO-VF-NEXT:    [[TMP37:%.*]] = udiv i32 [[TMP21]], [[TMP13]]
+; UNROLL-NO-VF-NEXT:    [[TMP38:%.*]] = srem i32 [[TMP23]], [[TMP15]]
+; UNROLL-NO-VF-NEXT:    [[TMP39:%.*]] = urem i32 [[TMP25]], [[TMP17]]
+; UNROLL-NO-VF-NEXT:    br label [[PRED_UREM_CONTINUE24]]
+; UNROLL-NO-VF:       pred.urem.continue24:
+; UNROLL-NO-VF-NEXT:    [[TMP40:%.*]] = phi i32 [ poison, [[PRED_UREM_CONTINUE]] ], [ [[TMP36]], [[PRED_UREM_IF23]] ]
+; UNROLL-NO-VF-NEXT:    [[TMP41:%.*]] = phi i32 [ poison, [[PRED_UREM_CONTINUE]] ], [ [[TMP37]], [[PRED_UREM_IF23]] ]
+; UNROLL-NO-VF-NEXT:    [[TMP42:%.*]] = phi i32 [ poison, [[PRED_UREM_CONTINUE]] ], [ [[TMP38]], [[PRED_UREM_IF23]] ]
+; UNROLL-NO-VF-NEXT:    [[TMP43:%.*]] = phi i32 [ poison, [[PRED_UREM_CONTINUE]] ], [ [[TMP39]], [[PRED_UREM_IF23]] ]
+; UNROLL-NO-VF-NEXT:    [[TMP44:%.*]] = xor i1 [[TMP26]], true
+; UNROLL-NO-VF-NEXT:    [[TMP45:%.*]] = xor i1 [[TMP27]], true
+; UNROLL-NO-VF-NEXT:    [[PREDPHI:%.*]] = select i1 [[TMP44]], i32 [[TMP18]], i32 [[TMP32]]
+; UNROLL-NO-VF-NEXT:    [[PREDPHI25:%.*]] = select i1 [[TMP45]], i32 [[TMP19]], i32 [[TMP40]]
+; UNROLL-NO-VF-NEXT:    [[PREDPHI26:%.*]] = select i1 [[TMP44]], i32 [[TMP20]], i32 [[TMP33]]
+; UNROLL-NO-VF-NEXT:    [[PREDPHI27:%.*]] = select i1 [[TMP45]], i32 [[TMP21]], i32 [[TMP41]]
+; UNROLL-NO-VF-NEXT:    [[PREDPHI28:%.*]] = select i1 [[TMP44]], i32 [[TMP22]], i32 [[TMP34]]
+; UNROLL-NO-VF-NEXT:    [[PREDPHI29:%.*]] = select i1 [[TMP45]], i32 [[TMP23]], i32 [[TMP42]]
+; UNROLL-NO-VF-NEXT:    [[PREDPHI30:%.*]] = select i1 [[TMP44]], i32 [[TMP24]], i32 [[TMP35]]
+; UNROLL-NO-VF-NEXT:    [[PREDPHI31:%.*]] = select i1 [[TMP45]], i32 [[TMP25]], i32 [[TMP43]]
+; UNROLL-NO-VF-NEXT:    store i32 [[PREDPHI]], ptr [[TMP2]], align 4, !alias.scope !5, !noalias !8
+; UNROLL-NO-VF-NEXT:    store i32 [[PREDPHI25]], ptr [[TMP3]], align 4, !alias.scope !5, !noalias !8
+; UNROLL-NO-VF-NEXT:    store i32 [[PREDPHI26]], ptr [[TMP4]], align 4, !alias.scope !12, !noalias !13
+; UNROLL-NO-VF-NEXT:    store i32 [[PREDPHI27]], ptr [[TMP5]], align 4, !alias.scope !12, !noalias !13
+; UNROLL-NO-VF-NEXT:    store i32 [[PREDPHI28]], ptr [[TMP6]], align 4, !alias.scope !14, !noalias !15
+; UNROLL-NO-VF-NEXT:    store i32 [[PREDPHI29]], ptr [[TMP7]], align 4, !alias.scope !14, !noalias !15
+; UNROLL-NO-VF-NEXT:    store i32 [[PREDPHI30]], ptr [[TMP8]], align 4, !alias.scope !15
+; UNROLL-NO-VF-NEXT:    store i32 [[PREDPHI31]], ptr [[TMP9]], align 4, !alias.scope !15
 ; UNROLL-NO-VF-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; UNROLL-NO-VF-NEXT:    [[TMP44:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128
-; UNROLL-NO-VF-NEXT:    br i1 [[TMP44]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; UNROLL-NO-VF-NEXT:    [[TMP46:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128
+; UNROLL-NO-VF-NEXT:    br i1 [[TMP46]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
 ; UNROLL-NO-VF:       middle.block:
 ; UNROLL-NO-VF-NEXT:    [[CMP_N:%.*]] = icmp eq i64 128, 128
 ; UNROLL-NO-VF-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
@@ -310,14 +286,14 @@ define void @test(i32* nocapture %asd, i32* nocapture %aud,
 ; UNROLL-NO-VF-NEXT:    ret void
 ; UNROLL-NO-VF:       for.body:
 ; UNROLL-NO-VF-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[IF_END:%.*]] ]
-; UNROLL-NO-VF-NEXT:    [[ISD:%.*]] = getelementptr inbounds i32, i32* [[ASD]], i64 [[INDVARS_IV]]
-; UNROLL-NO-VF-NEXT:    [[IUD:%.*]] = getelementptr inbounds i32, i32* [[AUD]], i64 [[INDVARS_IV]]
-; UNROLL-NO-VF-NEXT:    [[ISR:%.*]] = getelementptr inbounds i32, i32* [[ASR]], i64 [[INDVARS_IV]]
-; UNROLL-NO-VF-NEXT:    [[IUR:%.*]] = getelementptr inbounds i32, i32* [[AUR]], i64 [[INDVARS_IV]]
-; UNROLL-NO-VF-NEXT:    [[LSD:%.*]] = load i32, i32* [[ISD]], align 4
-; UNROLL-NO-VF-NEXT:    [[LUD:%.*]] = load i32, i32* [[IUD]], align 4
-; UNROLL-NO-VF-NEXT:    [[LSR:%.*]] = load i32, i32* [[ISR]], align 4
-; UNROLL-NO-VF-NEXT:    [[LUR:%.*]] = load i32, i32* [[IUR]], align 4
+; UNROLL-NO-VF-NEXT:    [[ISD:%.*]] = getelementptr inbounds i32, ptr [[ASD]], i64 [[INDVARS_IV]]
+; UNROLL-NO-VF-NEXT:    [[IUD:%.*]] = getelementptr inbounds i32, ptr [[AUD]], i64 [[INDVARS_IV]]
+; UNROLL-NO-VF-NEXT:    [[ISR:%.*]] = getelementptr inbounds i32, ptr [[ASR]], i64 [[INDVARS_IV]]
+; UNROLL-NO-VF-NEXT:    [[IUR:%.*]] = getelementptr inbounds i32, ptr [[AUR]], i64 [[INDVARS_IV]]
+; UNROLL-NO-VF-NEXT:    [[LSD:%.*]] = load i32, ptr [[ISD]], align 4
+; UNROLL-NO-VF-NEXT:    [[LUD:%.*]] = load i32, ptr [[IUD]], align 4
+; UNROLL-NO-VF-NEXT:    [[LSR:%.*]] = load i32, ptr [[ISR]], align 4
+; UNROLL-NO-VF-NEXT:    [[LUR:%.*]] = load i32, ptr [[IUR]], align 4
 ; UNROLL-NO-VF-NEXT:    [[PSD:%.*]] = add nsw i32 [[LSD]], 23
 ; UNROLL-NO-VF-NEXT:    [[PUD:%.*]] = add nsw i32 [[LUD]], 24
 ; UNROLL-NO-VF-NEXT:    [[PSR:%.*]] = add nsw i32 [[LSR]], 25
@@ -335,15 +311,15 @@ define void @test(i32* nocapture %asd, i32* nocapture %aud,
 ; UNROLL-NO-VF-NEXT:    [[YUD_0:%.*]] = phi i32 [ [[RUD]], [[IF_THEN]] ], [ [[PUD]], [[FOR_BODY]] ]
 ; UNROLL-NO-VF-NEXT:    [[YSR_0:%.*]] = phi i32 [ [[RSR]], [[IF_THEN]] ], [ [[PSR]], [[FOR_BODY]] ]
 ; UNROLL-NO-VF-NEXT:    [[YUR_0:%.*]] = phi i32 [ [[RUR]], [[IF_THEN]] ], [ [[PUR]], [[FOR_BODY]] ]
-; UNROLL-NO-VF-NEXT:    store i32 [[YSD_0]], i32* [[ISD]], align 4
-; UNROLL-NO-VF-NEXT:    store i32 [[YUD_0]], i32* [[IUD]], align 4
-; UNROLL-NO-VF-NEXT:    store i32 [[YSR_0]], i32* [[ISR]], align 4
-; UNROLL-NO-VF-NEXT:    store i32 [[YUR_0]], i32* [[IUR]], align 4
+; UNROLL-NO-VF-NEXT:    store i32 [[YSD_0]], ptr [[ISD]], align 4
+; UNROLL-NO-VF-NEXT:    store i32 [[YUD_0]], ptr [[IUD]], align 4
+; UNROLL-NO-VF-NEXT:    store i32 [[YSR_0]], ptr [[ISR]], align 4
+; UNROLL-NO-VF-NEXT:    store i32 [[YUR_0]], ptr [[IUR]], align 4
 ; UNROLL-NO-VF-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; UNROLL-NO-VF-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 128
 ; UNROLL-NO-VF-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
 ;
-  i32* nocapture %asr, i32* nocapture %aur) {
+  ptr nocapture %asr, ptr nocapture %aur) {
 entry:
   br label %for.body
 
@@ -352,14 +328,14 @@ for.cond.cleanup:                                 ; preds = %if.end
 
 for.body:                                         ; preds = %if.end, %entry
   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %if.end ]
-  %isd = getelementptr inbounds i32, i32* %asd, i64 %indvars.iv
-  %iud = getelementptr inbounds i32, i32* %aud, i64 %indvars.iv
-  %isr = getelementptr inbounds i32, i32* %asr, i64 %indvars.iv
-  %iur = getelementptr inbounds i32, i32* %aur, i64 %indvars.iv
-  %lsd = load i32, i32* %isd, align 4
-  %lud = load i32, i32* %iud, align 4
-  %lsr = load i32, i32* %isr, align 4
-  %lur = load i32, i32* %iur, align 4
+  %isd = getelementptr inbounds i32, ptr %asd, i64 %indvars.iv
+  %iud = getelementptr inbounds i32, ptr %aud, i64 %indvars.iv
+  %isr = getelementptr inbounds i32, ptr %asr, i64 %indvars.iv
+  %iur = getelementptr inbounds i32, ptr %aur, i64 %indvars.iv
+  %lsd = load i32, ptr %isd, align 4
+  %lud = load i32, ptr %iud, align 4
+  %lsr = load i32, ptr %isr, align 4
+  %lur = load i32, ptr %iur, align 4
   %psd = add nsw i32 %lsd, 23
   %pud = add nsw i32 %lud, 24
   %psr = add nsw i32 %lsr, 25
@@ -379,75 +355,68 @@ if.end:                                           ; preds = %if.then, %for.body
   %yud.0 = phi i32 [ %rud, %if.then ], [ %pud, %for.body ]
   %ysr.0 = phi i32 [ %rsr, %if.then ], [ %psr, %for.body ]
   %yur.0 = phi i32 [ %rur, %if.then ], [ %pur, %for.body ]
-  store i32 %ysd.0, i32* %isd, align 4
-  store i32 %yud.0, i32* %iud, align 4
-  store i32 %ysr.0, i32* %isr, align 4
-  store i32 %yur.0, i32* %iur, align 4
+  store i32 %ysd.0, ptr %isd, align 4
+  store i32 %yud.0, ptr %iud, align 4
+  store i32 %ysr.0, ptr %isr, align 4
+  store i32 %yur.0, ptr %iur, align 4
   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
   %exitcond = icmp eq i64 %indvars.iv.next, 128
   br i1 %exitcond, label %for.cond.cleanup, label %for.body
 }
 
-define void @test_scalar2scalar(i32* nocapture %asd, i32* nocapture %bsd) {
+define void @test_scalar2scalar(ptr nocapture %asd, ptr nocapture %bsd) {
 ; CHECK-LABEL: @test_scalar2scalar(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[ASD1:%.*]] = bitcast i32* [[ASD:%.*]] to i8*
-; CHECK-NEXT:    [[BSD3:%.*]] = bitcast i32* [[BSD:%.*]] to i8*
-; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i32, i32* [[ASD]], i64 128
-; CHECK-NEXT:    [[SCEVGEP2:%.*]] = bitcast i32* [[SCEVGEP]] to i8*
-; CHECK-NEXT:    [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[BSD]], i64 128
-; CHECK-NEXT:    [[SCEVGEP45:%.*]] = bitcast i32* [[SCEVGEP4]] to i8*
-; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ult i8* [[ASD1]], [[SCEVGEP45]]
-; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ult i8* [[BSD3]], [[SCEVGEP2]]
+; CHECK-NEXT:    [[UGLYGEP:%.*]] = getelementptr i8, ptr [[ASD:%.*]], i64 512
+; CHECK-NEXT:    [[UGLYGEP1:%.*]] = getelementptr i8, ptr [[BSD:%.*]], i64 512
+; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ult ptr [[ASD]], [[UGLYGEP1]]
+; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ult ptr [[BSD]], [[UGLYGEP]]
 ; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
 ; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH:%.*]], label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[PRED_SDIV_CONTINUE8:%.*]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[PRED_SDIV_CONTINUE4:%.*]] ], [ 0, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[ASD]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <2 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i32>, <2 x i32>* [[TMP3]], align 4, !alias.scope !19, !noalias !22
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[BSD]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <2 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD6:%.*]] = load <2 x i32>, <2 x i32>* [[TMP6]], align 4, !alias.scope !22
-; CHECK-NEXT:    [[TMP7:%.*]] = add nsw <2 x i32> [[WIDE_LOAD]], <i32 23, i32 23>
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp slt <2 x i32> [[WIDE_LOAD]], <i32 100, i32 100>
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x i1> [[TMP8]], i32 0
-; CHECK-NEXT:    br i1 [[TMP9]], label [[PRED_SDIV_IF:%.*]], label [[PRED_SDIV_CONTINUE:%.*]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[ASD]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP2]], align 4, !alias.scope !19, !noalias !22
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[BSD]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <2 x i32>, ptr [[TMP4]], align 4, !alias.scope !22
+; CHECK-NEXT:    [[TMP5:%.*]] = add nsw <2 x i32> [[WIDE_LOAD]], <i32 23, i32 23>
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp slt <2 x i32> [[WIDE_LOAD]], <i32 100, i32 100>
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x i1> [[TMP6]], i32 0
+; CHECK-NEXT:    br i1 [[TMP7]], label [[PRED_SDIV_IF:%.*]], label [[PRED_SDIV_CONTINUE:%.*]]
 ; CHECK:       pred.sdiv.if:
-; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <2 x i32> [[TMP7]], i32 0
-; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 0
-; CHECK-NEXT:    [[TMP12:%.*]] = sdiv i32 [[TMP10]], [[TMP11]]
-; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <2 x i32> [[WIDE_LOAD6]], i32 0
-; CHECK-NEXT:    [[TMP14:%.*]] = sdiv i32 [[TMP13]], [[TMP12]]
-; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <2 x i32> poison, i32 [[TMP14]], i32 0
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x i32> [[TMP5]], i32 0
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 0
+; CHECK-NEXT:    [[TMP10:%.*]] = sdiv i32 [[TMP8]], [[TMP9]]
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <2 x i32> [[WIDE_LOAD2]], i32 0
+; CHECK-NEXT:    [[TMP12:%.*]] = sdiv i32 [[TMP11]], [[TMP10]]
+; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <2 x i32> poison, i32 [[TMP12]], i32 0
 ; CHECK-NEXT:    br label [[PRED_SDIV_CONTINUE]]
 ; CHECK:       pred.sdiv.continue:
-; CHECK-NEXT:    [[TMP16:%.*]] = phi i32 [ poison, [[VECTOR_BODY]] ], [ [[TMP12]], [[PRED_SDIV_IF]] ]
-; CHECK-NEXT:    [[TMP17:%.*]] = phi <2 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP15]], [[PRED_SDIV_IF]] ]
-; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <2 x i1> [[TMP8]], i32 1
-; CHECK-NEXT:    br i1 [[TMP18]], label [[PRED_SDIV_IF7:%.*]], label [[PRED_SDIV_CONTINUE8]]
-; CHECK:       pred.sdiv.if7:
-; CHECK-NEXT:    [[TMP19:%.*]] = extractelement <2 x i32> [[TMP7]], i32 1
-; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 1
-; CHECK-NEXT:    [[TMP21:%.*]] = sdiv i32 [[TMP19]], [[TMP20]]
-; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <2 x i32> [[WIDE_LOAD6]], i32 1
-; CHECK-NEXT:    [[TMP23:%.*]] = sdiv i32 [[TMP22]], [[TMP21]]
-; CHECK-NEXT:    [[TMP24:%.*]] = insertelement <2 x i32> [[TMP17]], i32 [[TMP23]], i32 1
-; CHECK-NEXT:    br label [[PRED_SDIV_CONTINUE8]]
-; CHECK:       pred.sdiv.continue8:
-; CHECK-NEXT:    [[TMP25:%.*]] = phi i32 [ poison, [[PRED_SDIV_CONTINUE]] ], [ [[TMP21]], [[PRED_SDIV_IF7]] ]
-; CHECK-NEXT:    [[TMP26:%.*]] = phi <2 x i32> [ [[TMP17]], [[PRED_SDIV_CONTINUE]] ], [ [[TMP24]], [[PRED_SDIV_IF7]] ]
-; CHECK-NEXT:    [[TMP27:%.*]] = xor <2 x i1> [[TMP8]], <i1 true, i1 true>
-; CHECK-NEXT:    [[PREDPHI:%.*]] = select <2 x i1> [[TMP27]], <2 x i32> [[TMP7]], <2 x i32> [[TMP26]]
-; CHECK-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <2 x i32>*
-; CHECK-NEXT:    store <2 x i32> [[PREDPHI]], <2 x i32>* [[TMP29]], align 4, !alias.scope !19, !noalias !22
+; CHECK-NEXT:    [[TMP14:%.*]] = phi i32 [ poison, [[VECTOR_BODY]] ], [ [[TMP10]], [[PRED_SDIV_IF]] ]
+; CHECK-NEXT:    [[TMP15:%.*]] = phi <2 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP13]], [[PRED_SDIV_IF]] ]
+; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <2 x i1> [[TMP6]], i32 1
+; CHECK-NEXT:    br i1 [[TMP16]], label [[PRED_SDIV_IF3:%.*]], label [[PRED_SDIV_CONTINUE4]]
+; CHECK:       pred.sdiv.if3:
+; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <2 x i32> [[TMP5]], i32 1
+; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 1
+; CHECK-NEXT:    [[TMP19:%.*]] = sdiv i32 [[TMP17]], [[TMP18]]
+; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <2 x i32> [[WIDE_LOAD2]], i32 1
+; CHECK-NEXT:    [[TMP21:%.*]] = sdiv i32 [[TMP20]], [[TMP19]]
+; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <2 x i32> [[TMP15]], i32 [[TMP21]], i32 1
+; CHECK-NEXT:    br label [[PRED_SDIV_CONTINUE4]]
+; CHECK:       pred.sdiv.continue4:
+; CHECK-NEXT:    [[TMP23:%.*]] = phi i32 [ poison, [[PRED_SDIV_CONTINUE]] ], [ [[TMP19]], [[PRED_SDIV_IF3]] ]
+; CHECK-NEXT:    [[TMP24:%.*]] = phi <2 x i32> [ [[TMP15]], [[PRED_SDIV_CONTINUE]] ], [ [[TMP22]], [[PRED_SDIV_IF3]] ]
+; CHECK-NEXT:    [[TMP25:%.*]] = xor <2 x i1> [[TMP6]], <i1 true, i1 true>
+; CHECK-NEXT:    [[PREDPHI:%.*]] = select <2 x i1> [[TMP25]], <2 x i32> [[TMP5]], <2 x i32> [[TMP24]]
+; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
+; CHECK-NEXT:    store <2 x i32> [[PREDPHI]], ptr [[TMP26]], align 4, !alias.scope !19, !noalias !22
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128
-; CHECK-NEXT:    br i1 [[TMP30]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
+; CHECK-NEXT:    [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128
+; CHECK-NEXT:    br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 128, 128
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
@@ -458,10 +427,10 @@ define void @test_scalar2scalar(i32* nocapture %asd, i32* nocapture %bsd) {
 ; CHECK-NEXT:    ret void
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[IF_END:%.*]] ]
-; CHECK-NEXT:    [[ISD:%.*]] = getelementptr inbounds i32, i32* [[ASD]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[LSD:%.*]] = load i32, i32* [[ISD]], align 4
-; CHECK-NEXT:    [[ISD_B:%.*]] = getelementptr inbounds i32, i32* [[BSD]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[LSD_B:%.*]] = load i32, i32* [[ISD_B]], align 4
+; CHECK-NEXT:    [[ISD:%.*]] = getelementptr inbounds i32, ptr [[ASD]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[LSD:%.*]] = load i32, ptr [[ISD]], align 4
+; CHECK-NEXT:    [[ISD_B:%.*]] = getelementptr inbounds i32, ptr [[BSD]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[LSD_B:%.*]] = load i32, ptr [[ISD_B]], align 4
 ; CHECK-NEXT:    [[PSD:%.*]] = add nsw i32 [[LSD]], 23
 ; CHECK-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[LSD]], 100
 ; CHECK-NEXT:    br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[IF_END]]
@@ -471,68 +440,64 @@ define void @test_scalar2scalar(i32* nocapture %asd, i32* nocapture %bsd) {
 ; CHECK-NEXT:    br label [[IF_END]]
 ; CHECK:       if.end:
 ; CHECK-NEXT:    [[YSD_0:%.*]] = phi i32 [ [[RSD]], [[IF_THEN]] ], [ [[PSD]], [[FOR_BODY]] ]
-; CHECK-NEXT:    store i32 [[YSD_0]], i32* [[ISD]], align 4
+; CHECK-NEXT:    store i32 [[YSD_0]], ptr [[ISD]], align 4
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 128
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]]
 ;
 ; UNROLL-NO-VF-LABEL: @test_scalar2scalar(
 ; UNROLL-NO-VF-NEXT:  entry:
-; UNROLL-NO-VF-NEXT:    [[ASD1:%.*]] = bitcast i32* [[ASD:%.*]] to i8*
-; UNROLL-NO-VF-NEXT:    [[BSD3:%.*]] = bitcast i32* [[BSD:%.*]] to i8*
 ; UNROLL-NO-VF-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
 ; UNROLL-NO-VF:       vector.memcheck:
-; UNROLL-NO-VF-NEXT:    [[SCEVGEP:%.*]] = getelementptr i32, i32* [[ASD]], i64 128
-; UNROLL-NO-VF-NEXT:    [[SCEVGEP2:%.*]] = bitcast i32* [[SCEVGEP]] to i8*
-; UNROLL-NO-VF-NEXT:    [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[BSD]], i64 128
-; UNROLL-NO-VF-NEXT:    [[SCEVGEP45:%.*]] = bitcast i32* [[SCEVGEP4]] to i8*
-; UNROLL-NO-VF-NEXT:    [[BOUND0:%.*]] = icmp ult i8* [[ASD1]], [[SCEVGEP45]]
-; UNROLL-NO-VF-NEXT:    [[BOUND1:%.*]] = icmp ult i8* [[BSD3]], [[SCEVGEP2]]
+; UNROLL-NO-VF-NEXT:    [[UGLYGEP:%.*]] = getelementptr i8, ptr [[ASD:%.*]], i64 512
+; UNROLL-NO-VF-NEXT:    [[UGLYGEP1:%.*]] = getelementptr i8, ptr [[BSD:%.*]], i64 512
+; UNROLL-NO-VF-NEXT:    [[BOUND0:%.*]] = icmp ult ptr [[ASD]], [[UGLYGEP1]]
+; UNROLL-NO-VF-NEXT:    [[BOUND1:%.*]] = icmp ult ptr [[BSD]], [[UGLYGEP]]
 ; UNROLL-NO-VF-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
 ; UNROLL-NO-VF-NEXT:    br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
 ; UNROLL-NO-VF:       vector.ph:
 ; UNROLL-NO-VF-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; UNROLL-NO-VF:       vector.body:
-; UNROLL-NO-VF-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_SDIV_CONTINUE8:%.*]] ]
-; UNROLL-NO-VF-NEXT:    [[INDUCTION:%.*]] = add i64 [[INDEX]], 0
-; UNROLL-NO-VF-NEXT:    [[INDUCTION6:%.*]] = add i64 [[INDEX]], 1
-; UNROLL-NO-VF-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[ASD]], i64 [[INDUCTION]]
-; UNROLL-NO-VF-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[ASD]], i64 [[INDUCTION6]]
-; UNROLL-NO-VF-NEXT:    [[TMP2:%.*]] = load i32, i32* [[TMP0]], align 4, !alias.scope !19, !noalias !22
-; UNROLL-NO-VF-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP1]], align 4, !alias.scope !19, !noalias !22
-; UNROLL-NO-VF-NEXT:    [[TMP4:%.*]] = add nsw i32 [[TMP2]], 23
-; UNROLL-NO-VF-NEXT:    [[TMP5:%.*]] = add nsw i32 [[TMP3]], 23
-; UNROLL-NO-VF-NEXT:    [[TMP6:%.*]] = icmp slt i32 [[TMP2]], 100
-; UNROLL-NO-VF-NEXT:    [[TMP7:%.*]] = icmp slt i32 [[TMP3]], 100
-; UNROLL-NO-VF-NEXT:    br i1 [[TMP6]], label [[PRED_SDIV_IF:%.*]], label [[PRED_SDIV_CONTINUE:%.*]]
+; UNROLL-NO-VF-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_SDIV_CONTINUE3:%.*]] ]
+; UNROLL-NO-VF-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; UNROLL-NO-VF-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 1
+; UNROLL-NO-VF-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[ASD]], i64 [[TMP0]]
+; UNROLL-NO-VF-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[ASD]], i64 [[TMP1]]
+; UNROLL-NO-VF-NEXT:    [[TMP4:%.*]] = load i32, ptr [[TMP2]], align 4, !alias.scope !19, !noalias !22
+; UNROLL-NO-VF-NEXT:    [[TMP5:%.*]] = load i32, ptr [[TMP3]], align 4, !alias.scope !19, !noalias !22
+; UNROLL-NO-VF-NEXT:    [[TMP6:%.*]] = add nsw i32 [[TMP4]], 23
+; UNROLL-NO-VF-NEXT:    [[TMP7:%.*]] = add nsw i32 [[TMP5]], 23
+; UNROLL-NO-VF-NEXT:    [[TMP8:%.*]] = icmp slt i32 [[TMP4]], 100
+; UNROLL-NO-VF-NEXT:    [[TMP9:%.*]] = icmp slt i32 [[TMP5]], 100
+; UNROLL-NO-VF-NEXT:    br i1 [[TMP8]], label [[PRED_SDIV_IF:%.*]], label [[PRED_SDIV_CONTINUE:%.*]]
 ; UNROLL-NO-VF:       pred.sdiv.if:
-; UNROLL-NO-VF-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[BSD]], i64 [[INDUCTION]]
-; UNROLL-NO-VF-NEXT:    [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4, !alias.scope !22
-; UNROLL-NO-VF-NEXT:    [[TMP10:%.*]] = sdiv i32 [[TMP4]], [[TMP2]]
-; UNROLL-NO-VF-NEXT:    [[TMP11:%.*]] = sdiv i32 [[TMP9]], [[TMP10]]
+; UNROLL-NO-VF-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[BSD]], i64 [[TMP0]]
+; UNROLL-NO-VF-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP10]], align 4, !alias.scope !22
+; UNROLL-NO-VF-NEXT:    [[TMP12:%.*]] = sdiv i32 [[TMP6]], [[TMP4]]
+; UNROLL-NO-VF-NEXT:    [[TMP13:%.*]] = sdiv i32 [[TMP11]], [[TMP12]]
 ; UNROLL-NO-VF-NEXT:    br label [[PRED_SDIV_CONTINUE]]
 ; UNROLL-NO-VF:       pred.sdiv.continue:
-; UNROLL-NO-VF-NEXT:    [[TMP12:%.*]] = phi i32 [ poison, [[VECTOR_BODY]] ], [ [[TMP10]], [[PRED_SDIV_IF]] ]
-; UNROLL-NO-VF-NEXT:    [[TMP13:%.*]] = phi i32 [ poison, [[VECTOR_BODY]] ], [ [[TMP11]], [[PRED_SDIV_IF]] ]
-; UNROLL-NO-VF-NEXT:    br i1 [[TMP7]], label [[PRED_SDIV_IF7:%.*]], label [[PRED_SDIV_CONTINUE8]]
-; UNROLL-NO-VF:       pred.sdiv.if6:
-; UNROLL-NO-VF-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[BSD]], i64 [[INDUCTION6]]
-; UNROLL-NO-VF-NEXT:    [[TMP15:%.*]] = load i32, i32* [[TMP14]], align 4, !alias.scope !22
-; UNROLL-NO-VF-NEXT:    [[TMP16:%.*]] = sdiv i32 [[TMP5]], [[TMP3]]
-; UNROLL-NO-VF-NEXT:    [[TMP17:%.*]] = sdiv i32 [[TMP15]], [[TMP16]]
-; UNROLL-NO-VF-NEXT:    br label [[PRED_SDIV_CONTINUE8]]
-; UNROLL-NO-VF:       pred.sdiv.continue7:
-; UNROLL-NO-VF-NEXT:    [[TMP18:%.*]] = phi i32 [ poison, [[PRED_SDIV_CONTINUE]] ], [ [[TMP16]], [[PRED_SDIV_IF7]] ]
-; UNROLL-NO-VF-NEXT:    [[TMP19:%.*]] = phi i32 [ poison, [[PRED_SDIV_CONTINUE]] ], [ [[TMP17]], [[PRED_SDIV_IF7]] ]
-; UNROLL-NO-VF-NEXT:    [[TMP20:%.*]] = xor i1 [[TMP6]], true
-; UNROLL-NO-VF-NEXT:    [[TMP21:%.*]] = xor i1 [[TMP7]], true
-; UNROLL-NO-VF-NEXT:    [[PREDPHI:%.*]] = select i1 [[TMP20]], i32 [[TMP4]], i32 [[TMP13]]
-; UNROLL-NO-VF-NEXT:    [[PREDPHI9:%.*]] = select i1 [[TMP21]], i32 [[TMP5]], i32 [[TMP19]]
-; UNROLL-NO-VF-NEXT:    store i32 [[PREDPHI]], i32* [[TMP0]], align 4, !alias.scope !19, !noalias !22
-; UNROLL-NO-VF-NEXT:    store i32 [[PREDPHI9]], i32* [[TMP1]], align 4, !alias.scope !19, !noalias !22
+; UNROLL-NO-VF-NEXT:    [[TMP14:%.*]] = phi i32 [ poison, [[VECTOR_BODY]] ], [ [[TMP12]], [[PRED_SDIV_IF]] ]
+; UNROLL-NO-VF-NEXT:    [[TMP15:%.*]] = phi i32 [ poison, [[VECTOR_BODY]] ], [ [[TMP13]], [[PRED_SDIV_IF]] ]
+; UNROLL-NO-VF-NEXT:    br i1 [[TMP9]], label [[PRED_SDIV_IF2:%.*]], label [[PRED_SDIV_CONTINUE3]]
+; UNROLL-NO-VF:       pred.sdiv.if2:
+; UNROLL-NO-VF-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[BSD]], i64 [[TMP1]]
+; UNROLL-NO-VF-NEXT:    [[TMP17:%.*]] = load i32, ptr [[TMP16]], align 4, !alias.scope !22
+; UNROLL-NO-VF-NEXT:    [[TMP18:%.*]] = sdiv i32 [[TMP7]], [[TMP5]]
+; UNROLL-NO-VF-NEXT:    [[TMP19:%.*]] = sdiv i32 [[TMP17]], [[TMP18]]
+; UNROLL-NO-VF-NEXT:    br label [[PRED_SDIV_CONTINUE3]]
+; UNROLL-NO-VF:       pred.sdiv.continue3:
+; UNROLL-NO-VF-NEXT:    [[TMP20:%.*]] = phi i32 [ poison, [[PRED_SDIV_CONTINUE]] ], [ [[TMP18]], [[PRED_SDIV_IF2]] ]
+; UNROLL-NO-VF-NEXT:    [[TMP21:%.*]] = phi i32 [ poison, [[PRED_SDIV_CONTINUE]] ], [ [[TMP19]], [[PRED_SDIV_IF2]] ]
+; UNROLL-NO-VF-NEXT:    [[TMP22:%.*]] = xor i1 [[TMP8]], true
+; UNROLL-NO-VF-NEXT:    [[TMP23:%.*]] = xor i1 [[TMP9]], true
+; UNROLL-NO-VF-NEXT:    [[PREDPHI:%.*]] = select i1 [[TMP22]], i32 [[TMP6]], i32 [[TMP15]]
+; UNROLL-NO-VF-NEXT:    [[PREDPHI4:%.*]] = select i1 [[TMP23]], i32 [[TMP7]], i32 [[TMP21]]
+; UNROLL-NO-VF-NEXT:    store i32 [[PREDPHI]], ptr [[TMP2]], align 4, !alias.scope !19, !noalias !22
+; UNROLL-NO-VF-NEXT:    store i32 [[PREDPHI4]], ptr [[TMP3]], align 4, !alias.scope !19, !noalias !22
 ; UNROLL-NO-VF-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; UNROLL-NO-VF-NEXT:    [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128
-; UNROLL-NO-VF-NEXT:    br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
+; UNROLL-NO-VF-NEXT:    [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128
+; UNROLL-NO-VF-NEXT:    br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
 ; UNROLL-NO-VF:       middle.block:
 ; UNROLL-NO-VF-NEXT:    [[CMP_N:%.*]] = icmp eq i64 128, 128
 ; UNROLL-NO-VF-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
@@ -543,10 +508,10 @@ define void @test_scalar2scalar(i32* nocapture %asd, i32* nocapture %bsd) {
 ; UNROLL-NO-VF-NEXT:    ret void
 ; UNROLL-NO-VF:       for.body:
 ; UNROLL-NO-VF-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[IF_END:%.*]] ]
-; UNROLL-NO-VF-NEXT:    [[ISD:%.*]] = getelementptr inbounds i32, i32* [[ASD]], i64 [[INDVARS_IV]]
-; UNROLL-NO-VF-NEXT:    [[LSD:%.*]] = load i32, i32* [[ISD]], align 4
-; UNROLL-NO-VF-NEXT:    [[ISD_B:%.*]] = getelementptr inbounds i32, i32* [[BSD]], i64 [[INDVARS_IV]]
-; UNROLL-NO-VF-NEXT:    [[LSD_B:%.*]] = load i32, i32* [[ISD_B]], align 4
+; UNROLL-NO-VF-NEXT:    [[ISD:%.*]] = getelementptr inbounds i32, ptr [[ASD]], i64 [[INDVARS_IV]]
+; UNROLL-NO-VF-NEXT:    [[LSD:%.*]] = load i32, ptr [[ISD]], align 4
+; UNROLL-NO-VF-NEXT:    [[ISD_B:%.*]] = getelementptr inbounds i32, ptr [[BSD]], i64 [[INDVARS_IV]]
+; UNROLL-NO-VF-NEXT:    [[LSD_B:%.*]] = load i32, ptr [[ISD_B]], align 4
 ; UNROLL-NO-VF-NEXT:    [[PSD:%.*]] = add nsw i32 [[LSD]], 23
 ; UNROLL-NO-VF-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[LSD]], 100
 ; UNROLL-NO-VF-NEXT:    br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[IF_END]]
@@ -556,7 +521,7 @@ define void @test_scalar2scalar(i32* nocapture %asd, i32* nocapture %bsd) {
 ; UNROLL-NO-VF-NEXT:    br label [[IF_END]]
 ; UNROLL-NO-VF:       if.end:
 ; UNROLL-NO-VF-NEXT:    [[YSD_0:%.*]] = phi i32 [ [[RSD]], [[IF_THEN]] ], [ [[PSD]], [[FOR_BODY]] ]
-; UNROLL-NO-VF-NEXT:    store i32 [[YSD_0]], i32* [[ISD]], align 4
+; UNROLL-NO-VF-NEXT:    store i32 [[YSD_0]], ptr [[ISD]], align 4
 ; UNROLL-NO-VF-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; UNROLL-NO-VF-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 128
 ; UNROLL-NO-VF-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]]
@@ -570,10 +535,10 @@ for.cond.cleanup:                                 ; preds = %if.end
 
 for.body:                                         ; preds = %if.end, %entry
   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %if.end ]
-  %isd = getelementptr inbounds i32, i32* %asd, i64 %indvars.iv
-  %lsd = load i32, i32* %isd, align 4
-  %isd.b = getelementptr inbounds i32, i32* %bsd, i64 %indvars.iv
-  %lsd.b = load i32, i32* %isd.b, align 4
+  %isd = getelementptr inbounds i32, ptr %asd, i64 %indvars.iv
+  %lsd = load i32, ptr %isd, align 4
+  %isd.b = getelementptr inbounds i32, ptr %bsd, i64 %indvars.iv
+  %lsd.b = load i32, ptr %isd.b, align 4
   %psd = add nsw i32 %lsd, 23
   %cmp1 = icmp slt i32 %lsd, 100
   br i1 %cmp1, label %if.then, label %if.end
@@ -585,77 +550,70 @@ if.then:                                          ; preds = %for.body
 
 if.end:                                           ; preds = %if.then, %for.body
   %ysd.0 = phi i32 [ %rsd, %if.then ], [ %psd, %for.body ]
-  store i32 %ysd.0, i32* %isd, align 4
+  store i32 %ysd.0, ptr %isd, align 4
   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
   %exitcond = icmp eq i64 %indvars.iv.next, 128
   br i1 %exitcond, label %for.cond.cleanup, label %for.body
 }
 
-define void @pr30172(i32* nocapture %asd, i32* nocapture %bsd) !dbg !5 {;
+define void @pr30172(ptr nocapture %asd, ptr nocapture %bsd) !dbg !5 {;
 ; CHECK-LABEL: @pr30172(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[ASD1:%.*]] = bitcast i32* [[ASD:%.*]] to i8*
-; CHECK-NEXT:    [[BSD3:%.*]] = bitcast i32* [[BSD:%.*]] to i8*
-; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i32, i32* [[ASD]], i64 128
-; CHECK-NEXT:    [[SCEVGEP2:%.*]] = bitcast i32* [[SCEVGEP]] to i8*
-; CHECK-NEXT:    [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[BSD]], i64 128
-; CHECK-NEXT:    [[SCEVGEP45:%.*]] = bitcast i32* [[SCEVGEP4]] to i8*
-; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ult i8* [[ASD1]], [[SCEVGEP45]]
-; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ult i8* [[BSD3]], [[SCEVGEP2]]
+; CHECK-NEXT:    [[UGLYGEP:%.*]] = getelementptr i8, ptr [[ASD:%.*]], i64 512
+; CHECK-NEXT:    [[UGLYGEP1:%.*]] = getelementptr i8, ptr [[BSD:%.*]], i64 512
+; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ult ptr [[ASD]], [[UGLYGEP1]]
+; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ult ptr [[BSD]], [[UGLYGEP]]
 ; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
 ; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH:%.*]], label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[PRED_SDIV_CONTINUE8:%.*]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[PRED_SDIV_CONTINUE4:%.*]] ], [ 0, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[ASD]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <2 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i32>, <2 x i32>* [[TMP3]], align 4, !alias.scope !28, !noalias !31
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[BSD]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <2 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD6:%.*]] = load <2 x i32>, <2 x i32>* [[TMP6]], align 4, !alias.scope !31
-; CHECK-NEXT:    [[TMP7:%.*]] = add nsw <2 x i32> [[WIDE_LOAD]], <i32 23, i32 23>
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp slt <2 x i32> [[WIDE_LOAD]], <i32 100, i32 100>
-; CHECK-NEXT:    [[TMP9:%.*]] = icmp sge <2 x i32> [[WIDE_LOAD]], <i32 200, i32 200>
-; CHECK-NEXT:    [[TMP10:%.*]] = xor <2 x i1> [[TMP8]], <i1 true, i1 true>, !dbg [[DBG33:![0-9]+]]
-; CHECK-NEXT:    [[TMP11:%.*]] = select <2 x i1> [[TMP10]], <2 x i1> [[TMP9]], <2 x i1> zeroinitializer, !dbg [[DBG34:![0-9]+]]
-; CHECK-NEXT:    [[TMP12:%.*]] = or <2 x i1> [[TMP11]], [[TMP8]]
-; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <2 x i1> [[TMP12]], i32 0
-; CHECK-NEXT:    br i1 [[TMP13]], label [[PRED_SDIV_IF:%.*]], label [[PRED_SDIV_CONTINUE:%.*]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[ASD]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP2]], align 4, !alias.scope !28, !noalias !31
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[BSD]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <2 x i32>, ptr [[TMP4]], align 4, !alias.scope !31
+; CHECK-NEXT:    [[TMP5:%.*]] = add nsw <2 x i32> [[WIDE_LOAD]], <i32 23, i32 23>
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp slt <2 x i32> [[WIDE_LOAD]], <i32 100, i32 100>
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp sge <2 x i32> [[WIDE_LOAD]], <i32 200, i32 200>
+; CHECK-NEXT:    [[TMP8:%.*]] = xor <2 x i1> [[TMP6]], <i1 true, i1 true>, !dbg [[DBG33:![0-9]+]]
+; CHECK-NEXT:    [[TMP9:%.*]] = select <2 x i1> [[TMP8]], <2 x i1> [[TMP7]], <2 x i1> zeroinitializer, !dbg [[DBG34:![0-9]+]]
+; CHECK-NEXT:    [[TMP10:%.*]] = or <2 x i1> [[TMP9]], [[TMP6]]
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <2 x i1> [[TMP10]], i32 0
+; CHECK-NEXT:    br i1 [[TMP11]], label [[PRED_SDIV_IF:%.*]], label [[PRED_SDIV_CONTINUE:%.*]]
 ; CHECK:       pred.sdiv.if:
-; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <2 x i32> [[TMP7]], i32 0
-; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 0
-; CHECK-NEXT:    [[TMP16:%.*]] = sdiv i32 [[TMP14]], [[TMP15]]
-; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <2 x i32> [[WIDE_LOAD6]], i32 0
-; CHECK-NEXT:    [[TMP18:%.*]] = sdiv i32 [[TMP17]], [[TMP16]]
-; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <2 x i32> poison, i32 [[TMP18]], i32 0
+; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <2 x i32> [[TMP5]], i32 0
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 0
+; CHECK-NEXT:    [[TMP14:%.*]] = sdiv i32 [[TMP12]], [[TMP13]]
+; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <2 x i32> [[WIDE_LOAD2]], i32 0
+; CHECK-NEXT:    [[TMP16:%.*]] = sdiv i32 [[TMP15]], [[TMP14]]
+; CHECK-NEXT:    [[TMP17:%.*]] = insertelement <2 x i32> poison, i32 [[TMP16]], i32 0
 ; CHECK-NEXT:    br label [[PRED_SDIV_CONTINUE]]
 ; CHECK:       pred.sdiv.continue:
-; CHECK-NEXT:    [[TMP20:%.*]] = phi i32 [ poison, [[VECTOR_BODY]] ], [ [[TMP16]], [[PRED_SDIV_IF]] ]
-; CHECK-NEXT:    [[TMP21:%.*]] = phi <2 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP19]], [[PRED_SDIV_IF]] ]
-; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <2 x i1> [[TMP12]], i32 1
-; CHECK-NEXT:    br i1 [[TMP22]], label [[PRED_SDIV_IF7:%.*]], label [[PRED_SDIV_CONTINUE8]]
-; CHECK:       pred.sdiv.if7:
-; CHECK-NEXT:    [[TMP23:%.*]] = extractelement <2 x i32> [[TMP7]], i32 1
-; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 1
-; CHECK-NEXT:    [[TMP25:%.*]] = sdiv i32 [[TMP23]], [[TMP24]]
-; CHECK-NEXT:    [[TMP26:%.*]] = extractelement <2 x i32> [[WIDE_LOAD6]], i32 1
-; CHECK-NEXT:    [[TMP27:%.*]] = sdiv i32 [[TMP26]], [[TMP25]]
-; CHECK-NEXT:    [[TMP28:%.*]] = insertelement <2 x i32> [[TMP21]], i32 [[TMP27]], i32 1
-; CHECK-NEXT:    br label [[PRED_SDIV_CONTINUE8]]
-; CHECK:       pred.sdiv.continue8:
-; CHECK-NEXT:    [[TMP29:%.*]] = phi i32 [ poison, [[PRED_SDIV_CONTINUE]] ], [ [[TMP25]], [[PRED_SDIV_IF7]] ]
-; CHECK-NEXT:    [[TMP30:%.*]] = phi <2 x i32> [ [[TMP21]], [[PRED_SDIV_CONTINUE]] ], [ [[TMP28]], [[PRED_SDIV_IF7]] ]
-; CHECK-NEXT:    [[TMP31:%.*]] = xor <2 x i1> [[TMP9]], <i1 true, i1 true>, !dbg [[DBG34]]
-; CHECK-NEXT:    [[TMP32:%.*]] = select <2 x i1> [[TMP10]], <2 x i1> [[TMP31]], <2 x i1> zeroinitializer, !dbg [[DBG34]]
-; CHECK-NEXT:    [[PREDPHI:%.*]] = select <2 x i1> [[TMP32]], <2 x i32> [[TMP7]], <2 x i32> [[TMP30]]
-; CHECK-NEXT:    [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP34:%.*]] = bitcast i32* [[TMP33]] to <2 x i32>*
-; CHECK-NEXT:    store <2 x i32> [[PREDPHI]], <2 x i32>* [[TMP34]], align 4, !alias.scope !28, !noalias !31
+; CHECK-NEXT:    [[TMP18:%.*]] = phi i32 [ poison, [[VECTOR_BODY]] ], [ [[TMP14]], [[PRED_SDIV_IF]] ]
+; CHECK-NEXT:    [[TMP19:%.*]] = phi <2 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP17]], [[PRED_SDIV_IF]] ]
+; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <2 x i1> [[TMP10]], i32 1
+; CHECK-NEXT:    br i1 [[TMP20]], label [[PRED_SDIV_IF3:%.*]], label [[PRED_SDIV_CONTINUE4]]
+; CHECK:       pred.sdiv.if3:
+; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <2 x i32> [[TMP5]], i32 1
+; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 1
+; CHECK-NEXT:    [[TMP23:%.*]] = sdiv i32 [[TMP21]], [[TMP22]]
+; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <2 x i32> [[WIDE_LOAD2]], i32 1
+; CHECK-NEXT:    [[TMP25:%.*]] = sdiv i32 [[TMP24]], [[TMP23]]
+; CHECK-NEXT:    [[TMP26:%.*]] = insertelement <2 x i32> [[TMP19]], i32 [[TMP25]], i32 1
+; CHECK-NEXT:    br label [[PRED_SDIV_CONTINUE4]]
+; CHECK:       pred.sdiv.continue4:
+; CHECK-NEXT:    [[TMP27:%.*]] = phi i32 [ poison, [[PRED_SDIV_CONTINUE]] ], [ [[TMP23]], [[PRED_SDIV_IF3]] ]
+; CHECK-NEXT:    [[TMP28:%.*]] = phi <2 x i32> [ [[TMP19]], [[PRED_SDIV_CONTINUE]] ], [ [[TMP26]], [[PRED_SDIV_IF3]] ]
+; CHECK-NEXT:    [[TMP29:%.*]] = xor <2 x i1> [[TMP7]], <i1 true, i1 true>, !dbg [[DBG34]]
+; CHECK-NEXT:    [[TMP30:%.*]] = select <2 x i1> [[TMP8]], <2 x i1> [[TMP29]], <2 x i1> zeroinitializer, !dbg [[DBG34]]
+; CHECK-NEXT:    [[PREDPHI:%.*]] = select <2 x i1> [[TMP30]], <2 x i32> [[TMP5]], <2 x i32> [[TMP28]]
+; CHECK-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
+; CHECK-NEXT:    store <2 x i32> [[PREDPHI]], ptr [[TMP31]], align 4, !alias.scope !28, !noalias !31
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP35:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128
-; CHECK-NEXT:    br i1 [[TMP35]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP35:![0-9]+]]
+; CHECK-NEXT:    [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128
+; CHECK-NEXT:    br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP35:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 128, 128
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
@@ -666,10 +624,10 @@ define void @pr30172(i32* nocapture %asd, i32* nocapture %bsd) !dbg !5 {;
 ; CHECK-NEXT:    ret void
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[IF_END:%.*]] ]
-; CHECK-NEXT:    [[ISD:%.*]] = getelementptr inbounds i32, i32* [[ASD]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[LSD:%.*]] = load i32, i32* [[ISD]], align 4
-; CHECK-NEXT:    [[ISD_B:%.*]] = getelementptr inbounds i32, i32* [[BSD]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[LSD_B:%.*]] = load i32, i32* [[ISD_B]], align 4
+; CHECK-NEXT:    [[ISD:%.*]] = getelementptr inbounds i32, ptr [[ASD]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[LSD:%.*]] = load i32, ptr [[ISD]], align 4
+; CHECK-NEXT:    [[ISD_B:%.*]] = getelementptr inbounds i32, ptr [[BSD]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[LSD_B:%.*]] = load i32, ptr [[ISD_B]], align 4
 ; CHECK-NEXT:    [[PSD:%.*]] = add nsw i32 [[LSD]], 23
 ; CHECK-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[LSD]], 100
 ; CHECK-NEXT:    [[CMP2:%.*]] = icmp sge i32 [[LSD]], 200
@@ -681,78 +639,74 @@ define void @pr30172(i32* nocapture %asd, i32* nocapture %bsd) !dbg !5 {;
 ; CHECK-NEXT:    br label [[IF_END]]
 ; CHECK:       if.end:
 ; CHECK-NEXT:    [[YSD_0:%.*]] = phi i32 [ [[RSD]], [[IF_THEN]] ], [ [[PSD]], [[FOR_BODY]] ]
-; CHECK-NEXT:    store i32 [[YSD_0]], i32* [[ISD]], align 4
+; CHECK-NEXT:    store i32 [[YSD_0]], ptr [[ISD]], align 4
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 128
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]]
 ;
 ; UNROLL-NO-VF-LABEL: @pr30172(
 ; UNROLL-NO-VF-NEXT:  entry:
-; UNROLL-NO-VF-NEXT:    [[ASD1:%.*]] = bitcast i32* [[ASD:%.*]] to i8*
-; UNROLL-NO-VF-NEXT:    [[BSD3:%.*]] = bitcast i32* [[BSD:%.*]] to i8*
 ; UNROLL-NO-VF-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
 ; UNROLL-NO-VF:       vector.memcheck:
-; UNROLL-NO-VF-NEXT:    [[SCEVGEP:%.*]] = getelementptr i32, i32* [[ASD]], i64 128
-; UNROLL-NO-VF-NEXT:    [[SCEVGEP2:%.*]] = bitcast i32* [[SCEVGEP]] to i8*
-; UNROLL-NO-VF-NEXT:    [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[BSD]], i64 128
-; UNROLL-NO-VF-NEXT:    [[SCEVGEP45:%.*]] = bitcast i32* [[SCEVGEP4]] to i8*
-; UNROLL-NO-VF-NEXT:    [[BOUND0:%.*]] = icmp ult i8* [[ASD1]], [[SCEVGEP45]]
-; UNROLL-NO-VF-NEXT:    [[BOUND1:%.*]] = icmp ult i8* [[BSD3]], [[SCEVGEP2]]
+; UNROLL-NO-VF-NEXT:    [[UGLYGEP:%.*]] = getelementptr i8, ptr [[ASD:%.*]], i64 512
+; UNROLL-NO-VF-NEXT:    [[UGLYGEP1:%.*]] = getelementptr i8, ptr [[BSD:%.*]], i64 512
+; UNROLL-NO-VF-NEXT:    [[BOUND0:%.*]] = icmp ult ptr [[ASD]], [[UGLYGEP1]]
+; UNROLL-NO-VF-NEXT:    [[BOUND1:%.*]] = icmp ult ptr [[BSD]], [[UGLYGEP]]
 ; UNROLL-NO-VF-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
 ; UNROLL-NO-VF-NEXT:    br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
 ; UNROLL-NO-VF:       vector.ph:
 ; UNROLL-NO-VF-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; UNROLL-NO-VF:       vector.body:
-; UNROLL-NO-VF-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_SDIV_CONTINUE8:%.*]] ]
-; UNROLL-NO-VF-NEXT:    [[INDUCTION:%.*]] = add i64 [[INDEX]], 0
-; UNROLL-NO-VF-NEXT:    [[INDUCTION6:%.*]] = add i64 [[INDEX]], 1
-; UNROLL-NO-VF-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[ASD]], i64 [[INDUCTION]]
-; UNROLL-NO-VF-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[ASD]], i64 [[INDUCTION6]]
-; UNROLL-NO-VF-NEXT:    [[TMP2:%.*]] = load i32, i32* [[TMP0]], align 4, !alias.scope !28, !noalias !31
-; UNROLL-NO-VF-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP1]], align 4, !alias.scope !28, !noalias !31
-; UNROLL-NO-VF-NEXT:    [[TMP4:%.*]] = add nsw i32 [[TMP2]], 23
-; UNROLL-NO-VF-NEXT:    [[TMP5:%.*]] = add nsw i32 [[TMP3]], 23
-; UNROLL-NO-VF-NEXT:    [[TMP6:%.*]] = icmp slt i32 [[TMP2]], 100
-; UNROLL-NO-VF-NEXT:    [[TMP7:%.*]] = icmp slt i32 [[TMP3]], 100
-; UNROLL-NO-VF-NEXT:    [[TMP8:%.*]] = icmp sge i32 [[TMP2]], 200
-; UNROLL-NO-VF-NEXT:    [[TMP9:%.*]] = icmp sge i32 [[TMP3]], 200
-; UNROLL-NO-VF-NEXT:    [[TMP10:%.*]] = xor i1 [[TMP6]], true, !dbg [[DBG33:![0-9]+]]
-; UNROLL-NO-VF-NEXT:    [[TMP11:%.*]] = xor i1 [[TMP7]], true, !dbg [[DBG33]]
-; UNROLL-NO-VF-NEXT:    [[TMP12:%.*]] = select i1 [[TMP10]], i1 [[TMP8]], i1 false, !dbg [[DBG34:![0-9]+]]
-; UNROLL-NO-VF-NEXT:    [[TMP13:%.*]] = select i1 [[TMP11]], i1 [[TMP9]], i1 false, !dbg [[DBG34]]
-; UNROLL-NO-VF-NEXT:    [[TMP14:%.*]] = or i1 [[TMP12]], [[TMP6]]
-; UNROLL-NO-VF-NEXT:    [[TMP15:%.*]] = or i1 [[TMP13]], [[TMP7]]
-; UNROLL-NO-VF-NEXT:    br i1 [[TMP14]], label [[PRED_SDIV_IF:%.*]], label [[PRED_SDIV_CONTINUE:%.*]]
+; UNROLL-NO-VF-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_SDIV_CONTINUE3:%.*]] ]
+; UNROLL-NO-VF-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; UNROLL-NO-VF-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 1
+; UNROLL-NO-VF-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[ASD]], i64 [[TMP0]]
+; UNROLL-NO-VF-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[ASD]], i64 [[TMP1]]
+; UNROLL-NO-VF-NEXT:    [[TMP4:%.*]] = load i32, ptr [[TMP2]], align 4, !alias.scope !28, !noalias !31
+; UNROLL-NO-VF-NEXT:    [[TMP5:%.*]] = load i32, ptr [[TMP3]], align 4, !alias.scope !28, !noalias !31
+; UNROLL-NO-VF-NEXT:    [[TMP6:%.*]] = add nsw i32 [[TMP4]], 23
+; UNROLL-NO-VF-NEXT:    [[TMP7:%.*]] = add nsw i32 [[TMP5]], 23
+; UNROLL-NO-VF-NEXT:    [[TMP8:%.*]] = icmp slt i32 [[TMP4]], 100
+; UNROLL-NO-VF-NEXT:    [[TMP9:%.*]] = icmp slt i32 [[TMP5]], 100
+; UNROLL-NO-VF-NEXT:    [[TMP10:%.*]] = icmp sge i32 [[TMP4]], 200
+; UNROLL-NO-VF-NEXT:    [[TMP11:%.*]] = icmp sge i32 [[TMP5]], 200
+; UNROLL-NO-VF-NEXT:    [[TMP12:%.*]] = xor i1 [[TMP8]], true, !dbg [[DBG33:![0-9]+]]
+; UNROLL-NO-VF-NEXT:    [[TMP13:%.*]] = xor i1 [[TMP9]], true, !dbg [[DBG33]]
+; UNROLL-NO-VF-NEXT:    [[TMP14:%.*]] = select i1 [[TMP12]], i1 [[TMP10]], i1 false, !dbg [[DBG34:![0-9]+]]
+; UNROLL-NO-VF-NEXT:    [[TMP15:%.*]] = select i1 [[TMP13]], i1 [[TMP11]], i1 false, !dbg [[DBG34]]
+; UNROLL-NO-VF-NEXT:    [[TMP16:%.*]] = or i1 [[TMP14]], [[TMP8]]
+; UNROLL-NO-VF-NEXT:    [[TMP17:%.*]] = or i1 [[TMP15]], [[TMP9]]
+; UNROLL-NO-VF-NEXT:    br i1 [[TMP16]], label [[PRED_SDIV_IF:%.*]], label [[PRED_SDIV_CONTINUE:%.*]]
 ; UNROLL-NO-VF:       pred.sdiv.if:
-; UNROLL-NO-VF-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[BSD]], i64 [[INDUCTION]]
-; UNROLL-NO-VF-NEXT:    [[TMP17:%.*]] = load i32, i32* [[TMP16]], align 4, !alias.scope !31
-; UNROLL-NO-VF-NEXT:    [[TMP18:%.*]] = sdiv i32 [[TMP4]], [[TMP2]]
-; UNROLL-NO-VF-NEXT:    [[TMP19:%.*]] = sdiv i32 [[TMP17]], [[TMP18]]
+; UNROLL-NO-VF-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[BSD]], i64 [[TMP0]]
+; UNROLL-NO-VF-NEXT:    [[TMP19:%.*]] = load i32, ptr [[TMP18]], align 4, !alias.scope !31
+; UNROLL-NO-VF-NEXT:    [[TMP20:%.*]] = sdiv i32 [[TMP6]], [[TMP4]]
+; UNROLL-NO-VF-NEXT:    [[TMP21:%.*]] = sdiv i32 [[TMP19]], [[TMP20]]
 ; UNROLL-NO-VF-NEXT:    br label [[PRED_SDIV_CONTINUE]]
 ; UNROLL-NO-VF:       pred.sdiv.continue:
-; UNROLL-NO-VF-NEXT:    [[TMP20:%.*]] = phi i32 [ poison, [[VECTOR_BODY]] ], [ [[TMP18]], [[PRED_SDIV_IF]] ]
-; UNROLL-NO-VF-NEXT:    [[TMP21:%.*]] = phi i32 [ poison, [[VECTOR_BODY]] ], [ [[TMP19]], [[PRED_SDIV_IF]] ]
-; UNROLL-NO-VF-NEXT:    br i1 [[TMP15]], label [[PRED_SDIV_IF7:%.*]], label [[PRED_SDIV_CONTINUE8]]
-; UNROLL-NO-VF:       pred.sdiv.if6:
-; UNROLL-NO-VF-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i32, i32* [[BSD]], i64 [[INDUCTION6]]
-; UNROLL-NO-VF-NEXT:    [[TMP23:%.*]] = load i32, i32* [[TMP22]], align 4, !alias.scope !31
-; UNROLL-NO-VF-NEXT:    [[TMP24:%.*]] = sdiv i32 [[TMP5]], [[TMP3]]
-; UNROLL-NO-VF-NEXT:    [[TMP25:%.*]] = sdiv i32 [[TMP23]], [[TMP24]]
-; UNROLL-NO-VF-NEXT:    br label [[PRED_SDIV_CONTINUE8]]
-; UNROLL-NO-VF:       pred.sdiv.continue7:
-; UNROLL-NO-VF-NEXT:    [[TMP26:%.*]] = phi i32 [ poison, [[PRED_SDIV_CONTINUE]] ], [ [[TMP24]], [[PRED_SDIV_IF7]] ]
-; UNROLL-NO-VF-NEXT:    [[TMP27:%.*]] = phi i32 [ poison, [[PRED_SDIV_CONTINUE]] ], [ [[TMP25]], [[PRED_SDIV_IF7]] ]
-; UNROLL-NO-VF-NEXT:    [[TMP28:%.*]] = xor i1 [[TMP8]], true, !dbg [[DBG34]]
-; UNROLL-NO-VF-NEXT:    [[TMP29:%.*]] = xor i1 [[TMP9]], true, !dbg [[DBG34]]
-; UNROLL-NO-VF-NEXT:    [[TMP30:%.*]] = select i1 [[TMP10]], i1 [[TMP28]], i1 false, !dbg [[DBG34]]
-; UNROLL-NO-VF-NEXT:    [[TMP31:%.*]] = select i1 [[TMP11]], i1 [[TMP29]], i1 false, !dbg [[DBG34]]
-; UNROLL-NO-VF-NEXT:    [[PREDPHI:%.*]] = select i1 [[TMP30]], i32 [[TMP4]], i32 [[TMP21]]
-; UNROLL-NO-VF-NEXT:    [[PREDPHI9:%.*]] = select i1 [[TMP31]], i32 [[TMP5]], i32 [[TMP27]]
-; UNROLL-NO-VF-NEXT:    store i32 [[PREDPHI]], i32* [[TMP0]], align 4, !alias.scope !28, !noalias !31
-; UNROLL-NO-VF-NEXT:    store i32 [[PREDPHI9]], i32* [[TMP1]], align 4, !alias.scope !28, !noalias !31
+; UNROLL-NO-VF-NEXT:    [[TMP22:%.*]] = phi i32 [ poison, [[VECTOR_BODY]] ], [ [[TMP20]], [[PRED_SDIV_IF]] ]
+; UNROLL-NO-VF-NEXT:    [[TMP23:%.*]] = phi i32 [ poison, [[VECTOR_BODY]] ], [ [[TMP21]], [[PRED_SDIV_IF]] ]
+; UNROLL-NO-VF-NEXT:    br i1 [[TMP17]], label [[PRED_SDIV_IF2:%.*]], label [[PRED_SDIV_CONTINUE3]]
+; UNROLL-NO-VF:       pred.sdiv.if2:
+; UNROLL-NO-VF-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[BSD]], i64 [[TMP1]]
+; UNROLL-NO-VF-NEXT:    [[TMP25:%.*]] = load i32, ptr [[TMP24]], align 4, !alias.scope !31
+; UNROLL-NO-VF-NEXT:    [[TMP26:%.*]] = sdiv i32 [[TMP7]], [[TMP5]]
+; UNROLL-NO-VF-NEXT:    [[TMP27:%.*]] = sdiv i32 [[TMP25]], [[TMP26]]
+; UNROLL-NO-VF-NEXT:    br label [[PRED_SDIV_CONTINUE3]]
+; UNROLL-NO-VF:       pred.sdiv.continue3:
+; UNROLL-NO-VF-NEXT:    [[TMP28:%.*]] = phi i32 [ poison, [[PRED_SDIV_CONTINUE]] ], [ [[TMP26]], [[PRED_SDIV_IF2]] ]
+; UNROLL-NO-VF-NEXT:    [[TMP29:%.*]] = phi i32 [ poison, [[PRED_SDIV_CONTINUE]] ], [ [[TMP27]], [[PRED_SDIV_IF2]] ]
+; UNROLL-NO-VF-NEXT:    [[TMP30:%.*]] = xor i1 [[TMP10]], true, !dbg [[DBG34]]
+; UNROLL-NO-VF-NEXT:    [[TMP31:%.*]] = xor i1 [[TMP11]], true, !dbg [[DBG34]]
+; UNROLL-NO-VF-NEXT:    [[TMP32:%.*]] = select i1 [[TMP12]], i1 [[TMP30]], i1 false, !dbg [[DBG34]]
+; UNROLL-NO-VF-NEXT:    [[TMP33:%.*]] = select i1 [[TMP13]], i1 [[TMP31]], i1 false, !dbg [[DBG34]]
+; UNROLL-NO-VF-NEXT:    [[PREDPHI:%.*]] = select i1 [[TMP32]], i32 [[TMP6]], i32 [[TMP23]]
+; UNROLL-NO-VF-NEXT:    [[PREDPHI4:%.*]] = select i1 [[TMP33]], i32 [[TMP7]], i32 [[TMP29]]
+; UNROLL-NO-VF-NEXT:    store i32 [[PREDPHI]], ptr [[TMP2]], align 4, !alias.scope !28, !noalias !31
+; UNROLL-NO-VF-NEXT:    store i32 [[PREDPHI4]], ptr [[TMP3]], align 4, !alias.scope !28, !noalias !31
 ; UNROLL-NO-VF-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; UNROLL-NO-VF-NEXT:    [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128
-; UNROLL-NO-VF-NEXT:    br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP35:![0-9]+]]
+; UNROLL-NO-VF-NEXT:    [[TMP34:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128
+; UNROLL-NO-VF-NEXT:    br i1 [[TMP34]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP35:![0-9]+]]
 ; UNROLL-NO-VF:       middle.block:
 ; UNROLL-NO-VF-NEXT:    [[CMP_N:%.*]] = icmp eq i64 128, 128
 ; UNROLL-NO-VF-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
@@ -763,10 +717,10 @@ define void @pr30172(i32* nocapture %asd, i32* nocapture %bsd) !dbg !5 {;
 ; UNROLL-NO-VF-NEXT:    ret void
 ; UNROLL-NO-VF:       for.body:
 ; UNROLL-NO-VF-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[IF_END:%.*]] ]
-; UNROLL-NO-VF-NEXT:    [[ISD:%.*]] = getelementptr inbounds i32, i32* [[ASD]], i64 [[INDVARS_IV]]
-; UNROLL-NO-VF-NEXT:    [[LSD:%.*]] = load i32, i32* [[ISD]], align 4
-; UNROLL-NO-VF-NEXT:    [[ISD_B:%.*]] = getelementptr inbounds i32, i32* [[BSD]], i64 [[INDVARS_IV]]
-; UNROLL-NO-VF-NEXT:    [[LSD_B:%.*]] = load i32, i32* [[ISD_B]], align 4
+; UNROLL-NO-VF-NEXT:    [[ISD:%.*]] = getelementptr inbounds i32, ptr [[ASD]], i64 [[INDVARS_IV]]
+; UNROLL-NO-VF-NEXT:    [[LSD:%.*]] = load i32, ptr [[ISD]], align 4
+; UNROLL-NO-VF-NEXT:    [[ISD_B:%.*]] = getelementptr inbounds i32, ptr [[BSD]], i64 [[INDVARS_IV]]
+; UNROLL-NO-VF-NEXT:    [[LSD_B:%.*]] = load i32, ptr [[ISD_B]], align 4
 ; UNROLL-NO-VF-NEXT:    [[PSD:%.*]] = add nsw i32 [[LSD]], 23
 ; UNROLL-NO-VF-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[LSD]], 100
 ; UNROLL-NO-VF-NEXT:    br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[CHECKBB:%.*]], !dbg [[DBG33]]
@@ -779,7 +733,7 @@ define void @pr30172(i32* nocapture %asd, i32* nocapture %bsd) !dbg !5 {;
 ; UNROLL-NO-VF-NEXT:    br label [[IF_END]]
 ; UNROLL-NO-VF:       if.end:
 ; UNROLL-NO-VF-NEXT:    [[YSD_0:%.*]] = phi i32 [ [[RSD]], [[IF_THEN]] ], [ [[PSD]], [[CHECKBB]] ]
-; UNROLL-NO-VF-NEXT:    store i32 [[YSD_0]], i32* [[ISD]], align 4
+; UNROLL-NO-VF-NEXT:    store i32 [[YSD_0]], ptr [[ISD]], align 4
 ; UNROLL-NO-VF-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; UNROLL-NO-VF-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 128
 ; UNROLL-NO-VF-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]]
@@ -792,10 +746,10 @@ for.cond.cleanup:                                 ; preds = %if.end
 
 for.body:                                         ; preds = %if.end, %entry
   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %if.end ]
-  %isd = getelementptr inbounds i32, i32* %asd, i64 %indvars.iv
-  %lsd = load i32, i32* %isd, align 4
-  %isd.b = getelementptr inbounds i32, i32* %bsd, i64 %indvars.iv
-  %lsd.b = load i32, i32* %isd.b, align 4
+  %isd = getelementptr inbounds i32, ptr %asd, i64 %indvars.iv
+  %lsd = load i32, ptr %isd, align 4
+  %isd.b = getelementptr inbounds i32, ptr %bsd, i64 %indvars.iv
+  %lsd.b = load i32, ptr %isd.b, align 4
   %psd = add nsw i32 %lsd, 23
   %cmp1 = icmp slt i32 %lsd, 100
   br i1 %cmp1, label %if.then, label %checkbb, !dbg !7
@@ -811,13 +765,13 @@ if.then:                                          ; preds = %checkbb, %for.body
 
 if.end:                                           ; preds = %if.then, %checkbb
   %ysd.0 = phi i32 [ %rsd, %if.then ], [ %psd, %checkbb ]
-  store i32 %ysd.0, i32* %isd, align 4
+  store i32 %ysd.0, ptr %isd, align 4
   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
   %exitcond = icmp eq i64 %indvars.iv.next, 128
   br i1 %exitcond, label %for.cond.cleanup, label %for.body
 }
 
-define i32 @predicated_udiv_scalarized_operand(i32* %a, i1 %c, i32 %x, i64 %n) {
+define i32 @predicated_udiv_scalarized_operand(ptr %a, i1 %c, i32 %x, i64 %n) {
 ; CHECK-LABEL: @predicated_udiv_scalarized_operand(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1)
@@ -831,53 +785,52 @@ define i32 @predicated_udiv_scalarized_operand(i32* %a, i1 %c, i32 %x, i64 %n) {
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_UDIV_CONTINUE2:%.*]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[PRED_UDIV_CONTINUE2]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[PRED_UDIV_CONTINUE2]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <2 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i32>, <2 x i32>* [[TMP3]], align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT]], i32 0
-; CHECK-NEXT:    br i1 [[TMP4]], label [[PRED_UDIV_IF:%.*]], label [[PRED_UDIV_CONTINUE:%.*]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP2]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT]], i32 0
+; CHECK-NEXT:    br i1 [[TMP3]], label [[PRED_UDIV_IF:%.*]], label [[PRED_UDIV_CONTINUE:%.*]]
 ; CHECK:       pred.udiv.if:
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = add nsw i32 [[TMP5]], [[X:%.*]]
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 0
-; CHECK-NEXT:    [[TMP8:%.*]] = udiv i32 [[TMP7]], [[TMP6]]
-; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <2 x i32> poison, i32 [[TMP8]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = add nsw i32 [[TMP4]], [[X:%.*]]
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = udiv i32 [[TMP6]], [[TMP5]]
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x i32> poison, i32 [[TMP7]], i32 0
 ; CHECK-NEXT:    br label [[PRED_UDIV_CONTINUE]]
 ; CHECK:       pred.udiv.continue:
-; CHECK-NEXT:    [[TMP10:%.*]] = phi <2 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP9]], [[PRED_UDIV_IF]] ]
-; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT]], i32 1
-; CHECK-NEXT:    br i1 [[TMP11]], label [[PRED_UDIV_IF1:%.*]], label [[PRED_UDIV_CONTINUE2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = phi <2 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP8]], [[PRED_UDIV_IF]] ]
+; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT]], i32 1
+; CHECK-NEXT:    br i1 [[TMP10]], label [[PRED_UDIV_IF1:%.*]], label [[PRED_UDIV_CONTINUE2]]
 ; CHECK:       pred.udiv.if1:
-; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 1
-; CHECK-NEXT:    [[TMP13:%.*]] = add nsw i32 [[TMP12]], [[X]]
-; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 1
-; CHECK-NEXT:    [[TMP15:%.*]] = udiv i32 [[TMP14]], [[TMP13]]
-; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <2 x i32> [[TMP10]], i32 [[TMP15]], i32 1
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 1
+; CHECK-NEXT:    [[TMP12:%.*]] = add nsw i32 [[TMP11]], [[X]]
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 1
+; CHECK-NEXT:    [[TMP14:%.*]] = udiv i32 [[TMP13]], [[TMP12]]
+; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <2 x i32> [[TMP9]], i32 [[TMP14]], i32 1
 ; CHECK-NEXT:    br label [[PRED_UDIV_CONTINUE2]]
 ; CHECK:       pred.udiv.continue2:
-; CHECK-NEXT:    [[TMP17:%.*]] = phi <2 x i32> [ [[TMP10]], [[PRED_UDIV_CONTINUE]] ], [ [[TMP16]], [[PRED_UDIV_IF1]] ]
-; CHECK-NEXT:    [[TMP18:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT]], <i1 true, i1 true>
-; CHECK-NEXT:    [[PREDPHI:%.*]] = select <2 x i1> [[BROADCAST_SPLAT]], <2 x i32> [[TMP17]], <2 x i32> [[WIDE_LOAD]]
-; CHECK-NEXT:    [[TMP19]] = add <2 x i32> [[VEC_PHI]], [[PREDPHI]]
+; CHECK-NEXT:    [[TMP16:%.*]] = phi <2 x i32> [ [[TMP9]], [[PRED_UDIV_CONTINUE]] ], [ [[TMP15]], [[PRED_UDIV_IF1]] ]
+; CHECK-NEXT:    [[TMP17:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT]], <i1 true, i1 true>
+; CHECK-NEXT:    [[PREDPHI:%.*]] = select <2 x i1> [[BROADCAST_SPLAT]], <2 x i32> [[TMP16]], <2 x i32> [[WIDE_LOAD]]
+; CHECK-NEXT:    [[TMP18]] = add <2 x i32> [[VEC_PHI]], [[PREDPHI]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP37:![0-9]+]]
+; CHECK-NEXT:    [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP37:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[TMP19]])
+; CHECK-NEXT:    [[TMP20:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[TMP18]])
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[SMAX]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP20]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[FOR_INC:%.*]] ]
 ; CHECK-NEXT:    [[R:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[T6:%.*]], [[FOR_INC]] ]
-; CHECK-NEXT:    [[T0:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[I]]
-; CHECK-NEXT:    [[T2:%.*]] = load i32, i32* [[T0]], align 4
+; CHECK-NEXT:    [[T0:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[I]]
+; CHECK-NEXT:    [[T2:%.*]] = load i32, ptr [[T0]], align 4
 ; CHECK-NEXT:    br i1 [[C]], label [[IF_THEN:%.*]], label [[FOR_INC]]
 ; CHECK:       if.then:
 ; CHECK-NEXT:    [[T3:%.*]] = add nsw i32 [[T2]], [[X]]
@@ -890,7 +843,7 @@ define i32 @predicated_udiv_scalarized_operand(i32* %a, i1 %c, i32 %x, i64 %n) {
 ; CHECK-NEXT:    [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
 ; CHECK-NEXT:    br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP38:![0-9]+]]
 ; CHECK:       for.end:
-; CHECK-NEXT:    [[T7:%.*]] = phi i32 [ [[T6]], [[FOR_INC]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[T7:%.*]] = phi i32 [ [[T6]], [[FOR_INC]] ], [ [[TMP20]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    ret i32 [[T7]]
 ;
 ; UNROLL-NO-VF-LABEL: @predicated_udiv_scalarized_operand(
@@ -903,40 +856,40 @@ define i32 @predicated_udiv_scalarized_operand(i32* %a, i1 %c, i32 %x, i64 %n) {
 ; UNROLL-NO-VF-NEXT:    [[N_VEC:%.*]] = sub i64 [[SMAX]], [[N_MOD_VF]]
 ; UNROLL-NO-VF-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; UNROLL-NO-VF:       vector.body:
-; UNROLL-NO-VF-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_UDIV_CONTINUE4:%.*]] ]
-; UNROLL-NO-VF-NEXT:    [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[PRED_UDIV_CONTINUE4]] ]
-; UNROLL-NO-VF-NEXT:    [[VEC_PHI1:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[PRED_UDIV_CONTINUE4]] ]
-; UNROLL-NO-VF-NEXT:    [[INDUCTION:%.*]] = add i64 [[INDEX]], 0
-; UNROLL-NO-VF-NEXT:    [[INDUCTION2:%.*]] = add i64 [[INDEX]], 1
-; UNROLL-NO-VF-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDUCTION]]
-; UNROLL-NO-VF-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDUCTION2]]
-; UNROLL-NO-VF-NEXT:    [[TMP2:%.*]] = load i32, i32* [[TMP0]], align 4
-; UNROLL-NO-VF-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP1]], align 4
+; UNROLL-NO-VF-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_UDIV_CONTINUE3:%.*]] ]
+; UNROLL-NO-VF-NEXT:    [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[PRED_UDIV_CONTINUE3]] ]
+; UNROLL-NO-VF-NEXT:    [[VEC_PHI1:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[PRED_UDIV_CONTINUE3]] ]
+; UNROLL-NO-VF-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; UNROLL-NO-VF-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 1
+; UNROLL-NO-VF-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP0]]
+; UNROLL-NO-VF-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP1]]
+; UNROLL-NO-VF-NEXT:    [[TMP4:%.*]] = load i32, ptr [[TMP2]], align 4
+; UNROLL-NO-VF-NEXT:    [[TMP5:%.*]] = load i32, ptr [[TMP3]], align 4
 ; UNROLL-NO-VF-NEXT:    br i1 [[C:%.*]], label [[PRED_UDIV_IF:%.*]], label [[PRED_UDIV_CONTINUE:%.*]]
 ; UNROLL-NO-VF:       pred.udiv.if:
-; UNROLL-NO-VF-NEXT:    [[TMP4:%.*]] = add nsw i32 [[TMP2]], [[X:%.*]]
-; UNROLL-NO-VF-NEXT:    [[TMP5:%.*]] = udiv i32 [[TMP2]], [[TMP4]]
+; UNROLL-NO-VF-NEXT:    [[TMP6:%.*]] = add nsw i32 [[TMP4]], [[X:%.*]]
+; UNROLL-NO-VF-NEXT:    [[TMP7:%.*]] = udiv i32 [[TMP4]], [[TMP6]]
 ; UNROLL-NO-VF-NEXT:    br label [[PRED_UDIV_CONTINUE]]
 ; UNROLL-NO-VF:       pred.udiv.continue:
-; UNROLL-NO-VF-NEXT:    [[TMP6:%.*]] = phi i32 [ poison, [[VECTOR_BODY]] ], [ [[TMP5]], [[PRED_UDIV_IF]] ]
-; UNROLL-NO-VF-NEXT:    br i1 [[C]], label [[PRED_UDIV_IF3:%.*]], label [[PRED_UDIV_CONTINUE4]]
+; UNROLL-NO-VF-NEXT:    [[TMP8:%.*]] = phi i32 [ poison, [[VECTOR_BODY]] ], [ [[TMP7]], [[PRED_UDIV_IF]] ]
+; UNROLL-NO-VF-NEXT:    br i1 [[C]], label [[PRED_UDIV_IF2:%.*]], label [[PRED_UDIV_CONTINUE3]]
 ; UNROLL-NO-VF:       pred.udiv.if2:
-; UNROLL-NO-VF-NEXT:    [[TMP7:%.*]] = add nsw i32 [[TMP3]], [[X]]
-; UNROLL-NO-VF-NEXT:    [[TMP8:%.*]] = udiv i32 [[TMP3]], [[TMP7]]
-; UNROLL-NO-VF-NEXT:    br label [[PRED_UDIV_CONTINUE4]]
+; UNROLL-NO-VF-NEXT:    [[TMP9:%.*]] = add nsw i32 [[TMP5]], [[X]]
+; UNROLL-NO-VF-NEXT:    [[TMP10:%.*]] = udiv i32 [[TMP5]], [[TMP9]]
+; UNROLL-NO-VF-NEXT:    br label [[PRED_UDIV_CONTINUE3]]
 ; UNROLL-NO-VF:       pred.udiv.continue3:
-; UNROLL-NO-VF-NEXT:    [[TMP9:%.*]] = phi i32 [ poison, [[PRED_UDIV_CONTINUE]] ], [ [[TMP8]], [[PRED_UDIV_IF3]] ]
-; UNROLL-NO-VF-NEXT:    [[TMP10:%.*]] = xor i1 [[C]], true
-; UNROLL-NO-VF-NEXT:    [[TMP11:%.*]] = xor i1 [[C]], true
-; UNROLL-NO-VF-NEXT:    [[PREDPHI:%.*]] = select i1 [[C]], i32 [[TMP6]], i32 [[TMP2]]
-; UNROLL-NO-VF-NEXT:    [[PREDPHI5:%.*]] = select i1 [[C]], i32 [[TMP9]], i32 [[TMP3]]
-; UNROLL-NO-VF-NEXT:    [[TMP12]] = add i32 [[VEC_PHI]], [[PREDPHI]]
-; UNROLL-NO-VF-NEXT:    [[TMP13]] = add i32 [[VEC_PHI1]], [[PREDPHI5]]
+; UNROLL-NO-VF-NEXT:    [[TMP11:%.*]] = phi i32 [ poison, [[PRED_UDIV_CONTINUE]] ], [ [[TMP10]], [[PRED_UDIV_IF2]] ]
+; UNROLL-NO-VF-NEXT:    [[TMP12:%.*]] = xor i1 [[C]], true
+; UNROLL-NO-VF-NEXT:    [[TMP13:%.*]] = xor i1 [[C]], true
+; UNROLL-NO-VF-NEXT:    [[PREDPHI:%.*]] = select i1 [[C]], i32 [[TMP8]], i32 [[TMP4]]
+; UNROLL-NO-VF-NEXT:    [[PREDPHI4:%.*]] = select i1 [[C]], i32 [[TMP11]], i32 [[TMP5]]
+; UNROLL-NO-VF-NEXT:    [[TMP14]] = add i32 [[VEC_PHI]], [[PREDPHI]]
+; UNROLL-NO-VF-NEXT:    [[TMP15]] = add i32 [[VEC_PHI1]], [[PREDPHI4]]
 ; UNROLL-NO-VF-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; UNROLL-NO-VF-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; UNROLL-NO-VF-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP37:![0-9]+]]
+; UNROLL-NO-VF-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; UNROLL-NO-VF-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP37:![0-9]+]]
 ; UNROLL-NO-VF:       middle.block:
-; UNROLL-NO-VF-NEXT:    [[BIN_RDX:%.*]] = add i32 [[TMP13]], [[TMP12]]
+; UNROLL-NO-VF-NEXT:    [[BIN_RDX:%.*]] = add i32 [[TMP15]], [[TMP14]]
 ; UNROLL-NO-VF-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[SMAX]], [[N_VEC]]
 ; UNROLL-NO-VF-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; UNROLL-NO-VF:       scalar.ph:
@@ -946,8 +899,8 @@ define i32 @predicated_udiv_scalarized_operand(i32* %a, i1 %c, i32 %x, i64 %n) {
 ; UNROLL-NO-VF:       for.body:
 ; UNROLL-NO-VF-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[FOR_INC:%.*]] ]
 ; UNROLL-NO-VF-NEXT:    [[R:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[T6:%.*]], [[FOR_INC]] ]
-; UNROLL-NO-VF-NEXT:    [[T0:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[I]]
-; UNROLL-NO-VF-NEXT:    [[T2:%.*]] = load i32, i32* [[T0]], align 4
+; UNROLL-NO-VF-NEXT:    [[T0:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[I]]
+; UNROLL-NO-VF-NEXT:    [[T2:%.*]] = load i32, ptr [[T0]], align 4
 ; UNROLL-NO-VF-NEXT:    br i1 [[C]], label [[IF_THEN:%.*]], label [[FOR_INC]]
 ; UNROLL-NO-VF:       if.then:
 ; UNROLL-NO-VF-NEXT:    [[T3:%.*]] = add nsw i32 [[T2]], [[X]]
@@ -972,8 +925,8 @@ entry:
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.inc ]
   %r = phi i32 [ 0, %entry ], [ %t6, %for.inc ]
-  %t0 = getelementptr inbounds i32, i32* %a, i64 %i
-  %t2 = load i32, i32* %t0, align 4
+  %t0 = getelementptr inbounds i32, ptr %a, i64 %i
+  %t2 = load i32, ptr %t0, align 4
   br i1 %c, label %if.then, label %for.inc
 
 if.then:

diff  --git a/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll b/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll
index 6cb0bc662398..a742e28dea71 100644
--- a/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll
+++ b/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll
@@ -6,47 +6,47 @@
 target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
 
 ; Test predication of stores.
-define i32 @test(i32* nocapture %f) #0 {
+define i32 @test(ptr nocapture %f) #0 {
 ; UNROLL-LABEL: @test(
 ; UNROLL-NEXT:  entry:
 ; UNROLL-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; UNROLL:       vector.body:
-; UNROLL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE3:%.*]] ]
-; UNROLL-NEXT:    [[INDUCTION:%.*]] = add i64 [[INDEX]], 0
-; UNROLL-NEXT:    [[INDUCTION1:%.*]] = add i64 [[INDEX]], 1
-; UNROLL-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[F:%.*]], i64 [[INDUCTION]]
-; UNROLL-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[F]], i64 [[INDUCTION1]]
-; UNROLL-NEXT:    [[TMP2:%.*]] = load i32, i32* [[TMP0]], align 4
-; UNROLL-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP1]], align 4
-; UNROLL-NEXT:    [[TMP4:%.*]] = icmp sgt i32 [[TMP2]], 100
-; UNROLL-NEXT:    [[TMP5:%.*]] = icmp sgt i32 [[TMP3]], 100
-; UNROLL-NEXT:    br i1 [[TMP4]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; UNROLL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE2:%.*]] ]
+; UNROLL-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; UNROLL-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 1
+; UNROLL-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[F:%.*]], i64 [[TMP0]]
+; UNROLL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[F]], i64 [[TMP1]]
+; UNROLL-NEXT:    [[TMP4:%.*]] = load i32, ptr [[TMP2]], align 4
+; UNROLL-NEXT:    [[TMP5:%.*]] = load i32, ptr [[TMP3]], align 4
+; UNROLL-NEXT:    [[TMP6:%.*]] = icmp sgt i32 [[TMP4]], 100
+; UNROLL-NEXT:    [[TMP7:%.*]] = icmp sgt i32 [[TMP5]], 100
+; UNROLL-NEXT:    br i1 [[TMP6]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
 ; UNROLL:       pred.store.if:
-; UNROLL-NEXT:    [[TMP6:%.*]] = add nsw i32 [[TMP2]], 20
-; UNROLL-NEXT:    store i32 [[TMP6]], i32* [[TMP0]], align 4
+; UNROLL-NEXT:    [[TMP8:%.*]] = add nsw i32 [[TMP4]], 20
+; UNROLL-NEXT:    store i32 [[TMP8]], ptr [[TMP2]], align 4
 ; UNROLL-NEXT:    br label [[PRED_STORE_CONTINUE]]
 ; UNROLL:       pred.store.continue:
-; UNROLL-NEXT:    br i1 [[TMP5]], label [[PRED_STORE_IF2:%.*]], label [[PRED_STORE_CONTINUE3]]
+; UNROLL-NEXT:    br i1 [[TMP7]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2]]
 ; UNROLL:       pred.store.if1:
-; UNROLL-NEXT:    [[TMP7:%.*]] = add nsw i32 [[TMP3]], 20
-; UNROLL-NEXT:    store i32 [[TMP7]], i32* [[TMP1]], align 4
-; UNROLL-NEXT:    br label [[PRED_STORE_CONTINUE3]]
+; UNROLL-NEXT:    [[TMP9:%.*]] = add nsw i32 [[TMP5]], 20
+; UNROLL-NEXT:    store i32 [[TMP9]], ptr [[TMP3]], align 4
+; UNROLL-NEXT:    br label [[PRED_STORE_CONTINUE2]]
 ; UNROLL:       pred.store.continue2:
 ; UNROLL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; UNROLL-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128
-; UNROLL-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; UNROLL-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128
+; UNROLL-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; UNROLL:       middle.block:
 ; UNROLL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 128, 128
 ; UNROLL-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[FOR_BODY:%.*]]
 ; UNROLL:       for.body:
 ; UNROLL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ], [ 128, [[MIDDLE_BLOCK]] ]
-; UNROLL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[F]], i64 [[INDVARS_IV]]
-; UNROLL-NEXT:    [[TMP9:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; UNROLL-NEXT:    [[CMP1:%.*]] = icmp sgt i32 [[TMP9]], 100
+; UNROLL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[F]], i64 [[INDVARS_IV]]
+; UNROLL-NEXT:    [[TMP11:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; UNROLL-NEXT:    [[CMP1:%.*]] = icmp sgt i32 [[TMP11]], 100
 ; UNROLL-NEXT:    br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
 ; UNROLL:       if.then:
-; UNROLL-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP9]], 20
-; UNROLL-NEXT:    store i32 [[ADD]], i32* [[ARRAYIDX]], align 4
+; UNROLL-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP11]], 20
+; UNROLL-NEXT:    store i32 [[ADD]], ptr [[ARRAYIDX]], align 4
 ; UNROLL-NEXT:    br label [[FOR_INC]]
 ; UNROLL:       for.inc:
 ; UNROLL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
@@ -61,30 +61,30 @@ define i32 @test(i32* nocapture %f) #0 {
 ; UNROLL-NOSIMPLIFY:       vector.ph:
 ; UNROLL-NOSIMPLIFY-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; UNROLL-NOSIMPLIFY:       vector.body:
-; UNROLL-NOSIMPLIFY-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE3:%.*]] ]
-; UNROLL-NOSIMPLIFY-NEXT:    [[INDUCTION:%.*]] = add i64 [[INDEX]], 0
-; UNROLL-NOSIMPLIFY-NEXT:    [[INDUCTION1:%.*]] = add i64 [[INDEX]], 1
-; UNROLL-NOSIMPLIFY-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[F:%.*]], i64 [[INDUCTION]]
-; UNROLL-NOSIMPLIFY-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[F]], i64 [[INDUCTION1]]
-; UNROLL-NOSIMPLIFY-NEXT:    [[TMP2:%.*]] = load i32, i32* [[TMP0]], align 4
-; UNROLL-NOSIMPLIFY-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP1]], align 4
-; UNROLL-NOSIMPLIFY-NEXT:    [[TMP4:%.*]] = icmp sgt i32 [[TMP2]], 100
-; UNROLL-NOSIMPLIFY-NEXT:    [[TMP5:%.*]] = icmp sgt i32 [[TMP3]], 100
-; UNROLL-NOSIMPLIFY-NEXT:    br i1 [[TMP4]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; UNROLL-NOSIMPLIFY-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE2:%.*]] ]
+; UNROLL-NOSIMPLIFY-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; UNROLL-NOSIMPLIFY-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 1
+; UNROLL-NOSIMPLIFY-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[F:%.*]], i64 [[TMP0]]
+; UNROLL-NOSIMPLIFY-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[F]], i64 [[TMP1]]
+; UNROLL-NOSIMPLIFY-NEXT:    [[TMP4:%.*]] = load i32, ptr [[TMP2]], align 4
+; UNROLL-NOSIMPLIFY-NEXT:    [[TMP5:%.*]] = load i32, ptr [[TMP3]], align 4
+; UNROLL-NOSIMPLIFY-NEXT:    [[TMP6:%.*]] = icmp sgt i32 [[TMP4]], 100
+; UNROLL-NOSIMPLIFY-NEXT:    [[TMP7:%.*]] = icmp sgt i32 [[TMP5]], 100
+; UNROLL-NOSIMPLIFY-NEXT:    br i1 [[TMP6]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
 ; UNROLL-NOSIMPLIFY:       pred.store.if:
-; UNROLL-NOSIMPLIFY-NEXT:    [[TMP6:%.*]] = add nsw i32 [[TMP2]], 20
-; UNROLL-NOSIMPLIFY-NEXT:    store i32 [[TMP6]], i32* [[TMP0]], align 4
+; UNROLL-NOSIMPLIFY-NEXT:    [[TMP8:%.*]] = add nsw i32 [[TMP4]], 20
+; UNROLL-NOSIMPLIFY-NEXT:    store i32 [[TMP8]], ptr [[TMP2]], align 4
 ; UNROLL-NOSIMPLIFY-NEXT:    br label [[PRED_STORE_CONTINUE]]
 ; UNROLL-NOSIMPLIFY:       pred.store.continue:
-; UNROLL-NOSIMPLIFY-NEXT:    br i1 [[TMP5]], label [[PRED_STORE_IF2:%.*]], label [[PRED_STORE_CONTINUE3]]
+; UNROLL-NOSIMPLIFY-NEXT:    br i1 [[TMP7]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2]]
 ; UNROLL-NOSIMPLIFY:       pred.store.if1:
-; UNROLL-NOSIMPLIFY-NEXT:    [[TMP7:%.*]] = add nsw i32 [[TMP3]], 20
-; UNROLL-NOSIMPLIFY-NEXT:    store i32 [[TMP7]], i32* [[TMP1]], align 4
-; UNROLL-NOSIMPLIFY-NEXT:    br label [[PRED_STORE_CONTINUE3]]
+; UNROLL-NOSIMPLIFY-NEXT:    [[TMP9:%.*]] = add nsw i32 [[TMP5]], 20
+; UNROLL-NOSIMPLIFY-NEXT:    store i32 [[TMP9]], ptr [[TMP3]], align 4
+; UNROLL-NOSIMPLIFY-NEXT:    br label [[PRED_STORE_CONTINUE2]]
 ; UNROLL-NOSIMPLIFY:       pred.store.continue2:
 ; UNROLL-NOSIMPLIFY-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; UNROLL-NOSIMPLIFY-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128
-; UNROLL-NOSIMPLIFY-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; UNROLL-NOSIMPLIFY-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128
+; UNROLL-NOSIMPLIFY-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; UNROLL-NOSIMPLIFY:       middle.block:
 ; UNROLL-NOSIMPLIFY-NEXT:    [[CMP_N:%.*]] = icmp eq i64 128, 128
 ; UNROLL-NOSIMPLIFY-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -93,13 +93,13 @@ define i32 @test(i32* nocapture %f) #0 {
 ; UNROLL-NOSIMPLIFY-NEXT:    br label [[FOR_BODY:%.*]]
 ; UNROLL-NOSIMPLIFY:       for.body:
 ; UNROLL-NOSIMPLIFY-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
-; UNROLL-NOSIMPLIFY-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[F]], i64 [[INDVARS_IV]]
-; UNROLL-NOSIMPLIFY-NEXT:    [[TMP9:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; UNROLL-NOSIMPLIFY-NEXT:    [[CMP1:%.*]] = icmp sgt i32 [[TMP9]], 100
+; UNROLL-NOSIMPLIFY-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[F]], i64 [[INDVARS_IV]]
+; UNROLL-NOSIMPLIFY-NEXT:    [[TMP11:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; UNROLL-NOSIMPLIFY-NEXT:    [[CMP1:%.*]] = icmp sgt i32 [[TMP11]], 100
 ; UNROLL-NOSIMPLIFY-NEXT:    br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
 ; UNROLL-NOSIMPLIFY:       if.then:
-; UNROLL-NOSIMPLIFY-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP9]], 20
-; UNROLL-NOSIMPLIFY-NEXT:    store i32 [[ADD]], i32* [[ARRAYIDX]], align 4
+; UNROLL-NOSIMPLIFY-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP11]], 20
+; UNROLL-NOSIMPLIFY-NEXT:    store i32 [[ADD]], ptr [[ARRAYIDX]], align 4
 ; UNROLL-NOSIMPLIFY-NEXT:    br label [[FOR_INC]]
 ; UNROLL-NOSIMPLIFY:       for.inc:
 ; UNROLL-NOSIMPLIFY-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
@@ -114,45 +114,44 @@ define i32 @test(i32* nocapture %f) #0 {
 ; VEC:       vector.body:
 ; VEC-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE2:%.*]] ]
 ; VEC-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; VEC-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[F:%.*]], i64 [[TMP0]]
-; VEC-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0
-; VEC-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <2 x i32>*
-; VEC-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i32>, <2 x i32>* [[TMP3]], align 4
-; VEC-NEXT:    [[TMP4:%.*]] = icmp sgt <2 x i32> [[WIDE_LOAD]], <i32 100, i32 100>
-; VEC-NEXT:    [[TMP5:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0
-; VEC-NEXT:    br i1 [[TMP5]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; VEC-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[F:%.*]], i64 [[TMP0]]
+; VEC-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
+; VEC-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP2]], align 4
+; VEC-NEXT:    [[TMP3:%.*]] = icmp sgt <2 x i32> [[WIDE_LOAD]], <i32 100, i32 100>
+; VEC-NEXT:    [[TMP4:%.*]] = extractelement <2 x i1> [[TMP3]], i32 0
+; VEC-NEXT:    br i1 [[TMP4]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
 ; VEC:       pred.store.if:
-; VEC-NEXT:    [[TMP6:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 0
-; VEC-NEXT:    [[TMP7:%.*]] = add nsw i32 [[TMP6]], 20
-; VEC-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[F]], i64 [[TMP0]]
-; VEC-NEXT:    store i32 [[TMP7]], i32* [[TMP8]], align 4
+; VEC-NEXT:    [[TMP5:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 0
+; VEC-NEXT:    [[TMP6:%.*]] = add nsw i32 [[TMP5]], 20
+; VEC-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[F]], i64 [[TMP0]]
+; VEC-NEXT:    store i32 [[TMP6]], ptr [[TMP7]], align 4
 ; VEC-NEXT:    br label [[PRED_STORE_CONTINUE]]
 ; VEC:       pred.store.continue:
-; VEC-NEXT:    [[TMP9:%.*]] = extractelement <2 x i1> [[TMP4]], i32 1
-; VEC-NEXT:    br i1 [[TMP9]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2]]
+; VEC-NEXT:    [[TMP8:%.*]] = extractelement <2 x i1> [[TMP3]], i32 1
+; VEC-NEXT:    br i1 [[TMP8]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2]]
 ; VEC:       pred.store.if1:
-; VEC-NEXT:    [[TMP10:%.*]] = add i64 [[INDEX]], 1
-; VEC-NEXT:    [[TMP11:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 1
-; VEC-NEXT:    [[TMP12:%.*]] = add nsw i32 [[TMP11]], 20
-; VEC-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[F]], i64 [[TMP10]]
-; VEC-NEXT:    store i32 [[TMP12]], i32* [[TMP13]], align 4
+; VEC-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX]], 1
+; VEC-NEXT:    [[TMP10:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 1
+; VEC-NEXT:    [[TMP11:%.*]] = add nsw i32 [[TMP10]], 20
+; VEC-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[F]], i64 [[TMP9]]
+; VEC-NEXT:    store i32 [[TMP11]], ptr [[TMP12]], align 4
 ; VEC-NEXT:    br label [[PRED_STORE_CONTINUE2]]
 ; VEC:       pred.store.continue2:
 ; VEC-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; VEC-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128
-; VEC-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; VEC-NEXT:    [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128
+; VEC-NEXT:    br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; VEC:       middle.block:
 ; VEC-NEXT:    [[CMP_N:%.*]] = icmp eq i64 128, 128
 ; VEC-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[FOR_BODY:%.*]]
 ; VEC:       for.body:
 ; VEC-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ], [ 128, [[MIDDLE_BLOCK]] ]
-; VEC-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[F]], i64 [[INDVARS_IV]]
-; VEC-NEXT:    [[TMP15:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; VEC-NEXT:    [[CMP1:%.*]] = icmp sgt i32 [[TMP15]], 100
+; VEC-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[F]], i64 [[INDVARS_IV]]
+; VEC-NEXT:    [[TMP14:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; VEC-NEXT:    [[CMP1:%.*]] = icmp sgt i32 [[TMP14]], 100
 ; VEC-NEXT:    br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
 ; VEC:       if.then:
-; VEC-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP15]], 20
-; VEC-NEXT:    store i32 [[ADD]], i32* [[ARRAYIDX]], align 4
+; VEC-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP14]], 20
+; VEC-NEXT:    store i32 [[ADD]], ptr [[ARRAYIDX]], align 4
 ; VEC-NEXT:    br label [[FOR_INC]]
 ; VEC:       for.inc:
 ; VEC-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
@@ -168,14 +167,14 @@ entry:
 
 for.body:
   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ]
-  %arrayidx = getelementptr inbounds i32, i32* %f, i64 %indvars.iv
-  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx = getelementptr inbounds i32, ptr %f, i64 %indvars.iv
+  %0 = load i32, ptr %arrayidx, align 4
   %cmp1 = icmp sgt i32 %0, 100
   br i1 %cmp1, label %if.then, label %for.inc
 
 if.then:
   %add = add nsw i32 %0, 20
-  store i32 %add, i32* %arrayidx, align 4
+  store i32 %add, ptr %arrayidx, align 4
   br label %for.inc
 
 for.inc:
@@ -192,7 +191,7 @@ for.end:
 ; vectorized loop body.
 ; PR18724
 
-define void @bug18724(i1 %cond, [768 x i32]* %ptr, i1 %cond.2, i64 %v.1, i32 %v.2) {
+define void @bug18724(i1 %cond, ptr %ptr, i1 %cond.2, i64 %v.1, i32 %v.2) {
 ; UNROLL-LABEL: @bug18724(
 ; UNROLL-NEXT:  entry:
 ; UNROLL-NEXT:    [[TMP0:%.*]] = xor i1 [[COND:%.*]], true
@@ -210,36 +209,36 @@ define void @bug18724(i1 %cond, [768 x i32]* %ptr, i1 %cond.2, i64 %v.1, i32 %v.
 ; UNROLL-NEXT:    [[IND_END:%.*]] = add i64 [[V_1]], [[N_VEC]]
 ; UNROLL-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; UNROLL:       vector.body:
-; UNROLL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE4:%.*]] ]
-; UNROLL-NEXT:    [[VEC_PHI:%.*]] = phi i32 [ [[V_2:%.*]], [[VECTOR_PH]] ], [ [[PREDPHI:%.*]], [[PRED_STORE_CONTINUE4]] ]
-; UNROLL-NEXT:    [[VEC_PHI2:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[PREDPHI5:%.*]], [[PRED_STORE_CONTINUE4]] ]
+; UNROLL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE3:%.*]] ]
+; UNROLL-NEXT:    [[VEC_PHI:%.*]] = phi i32 [ [[V_2:%.*]], [[VECTOR_PH]] ], [ [[PREDPHI:%.*]], [[PRED_STORE_CONTINUE3]] ]
+; UNROLL-NEXT:    [[VEC_PHI1:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[PREDPHI4:%.*]], [[PRED_STORE_CONTINUE3]] ]
 ; UNROLL-NEXT:    [[OFFSET_IDX:%.*]] = add i64 [[V_1]], [[INDEX]]
-; UNROLL-NEXT:    br i1 [[COND_2:%.*]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE4]]
+; UNROLL-NEXT:    br i1 [[COND_2:%.*]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE3]]
 ; UNROLL:       pred.store.if:
-; UNROLL-NEXT:    [[INDUCTION:%.*]] = add i64 [[OFFSET_IDX]], 0
-; UNROLL-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [768 x i32], [768 x i32]* [[PTR:%.*]], i64 0, i64 [[INDUCTION]]
-; UNROLL-NEXT:    [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4
-; UNROLL-NEXT:    store i32 [[TMP6]], i32* [[TMP5]], align 4
-; UNROLL-NEXT:    [[INDUCTION1:%.*]] = add i64 [[OFFSET_IDX]], 1
-; UNROLL-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [768 x i32], [768 x i32]* [[PTR]], i64 0, i64 [[INDUCTION1]]
-; UNROLL-NEXT:    [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4
-; UNROLL-NEXT:    store i32 [[TMP8]], i32* [[TMP7]], align 4
-; UNROLL-NEXT:    br label [[PRED_STORE_CONTINUE4]]
+; UNROLL-NEXT:    [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 0
+; UNROLL-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [768 x i32], ptr [[PTR:%.*]], i64 0, i64 [[TMP5]]
+; UNROLL-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4
+; UNROLL-NEXT:    store i32 [[TMP7]], ptr [[TMP6]], align 4
+; UNROLL-NEXT:    [[TMP8:%.*]] = add i64 [[OFFSET_IDX]], 1
+; UNROLL-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [768 x i32], ptr [[PTR]], i64 0, i64 [[TMP8]]
+; UNROLL-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4
+; UNROLL-NEXT:    store i32 [[TMP10]], ptr [[TMP9]], align 4
+; UNROLL-NEXT:    br label [[PRED_STORE_CONTINUE3]]
 ; UNROLL:       pred.store.continue3:
-; UNROLL-NEXT:    [[TMP9:%.*]] = add i32 [[VEC_PHI]], 1
-; UNROLL-NEXT:    [[TMP10:%.*]] = add i32 [[VEC_PHI2]], 1
-; UNROLL-NEXT:    [[TMP11:%.*]] = xor i1 [[COND_2]], true
-; UNROLL-NEXT:    [[TMP12:%.*]] = xor i1 [[COND_2]], true
-; UNROLL-NEXT:    [[PREDPHI]] = select i1 [[TMP11]], i32 [[VEC_PHI]], i32 [[TMP9]]
-; UNROLL-NEXT:    [[PREDPHI5]] = select i1 [[TMP12]], i32 [[VEC_PHI2]], i32 [[TMP10]]
+; UNROLL-NEXT:    [[TMP11:%.*]] = add i32 [[VEC_PHI]], 1
+; UNROLL-NEXT:    [[TMP12:%.*]] = add i32 [[VEC_PHI1]], 1
+; UNROLL-NEXT:    [[TMP13:%.*]] = xor i1 [[COND_2]], true
+; UNROLL-NEXT:    [[TMP14:%.*]] = xor i1 [[COND_2]], true
+; UNROLL-NEXT:    [[PREDPHI]] = select i1 [[TMP13]], i32 [[VEC_PHI]], i32 [[TMP11]]
+; UNROLL-NEXT:    [[PREDPHI4]] = select i1 [[TMP14]], i32 [[VEC_PHI1]], i32 [[TMP12]]
 ; UNROLL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; UNROLL-NEXT:    [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; UNROLL-NEXT:    br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; UNROLL-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; UNROLL-NEXT:    br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
 ; UNROLL:       middle.block:
-; UNROLL-NEXT:    [[BIN_RDX:%.*]] = add i32 [[PREDPHI5]], [[PREDPHI]]
+; UNROLL-NEXT:    [[BIN_RDX:%.*]] = add i32 [[PREDPHI4]], [[PREDPHI]]
 ; UNROLL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP4]], [[N_VEC]]
-; UNROLL-NEXT:    [[TMP14:%.*]] = xor i1 [[CMP_N]], true
-; UNROLL-NEXT:    call void @llvm.assume(i1 [[TMP14]])
+; UNROLL-NEXT:    [[TMP16:%.*]] = xor i1 [[CMP_N]], true
+; UNROLL-NEXT:    call void @llvm.assume(i1 [[TMP16]])
 ; UNROLL-NEXT:    br label [[SCALAR_PH]]
 ; UNROLL:       scalar.ph:
 ; UNROLL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[V_1]], [[ENTRY:%.*]] ]
@@ -248,11 +247,11 @@ define void @bug18724(i1 %cond, [768 x i32]* %ptr, i1 %cond.2, i64 %v.1, i32 %v.
 ; UNROLL:       for.body14:
 ; UNROLL-NEXT:    [[INDVARS_IV3:%.*]] = phi i64 [ [[INDVARS_IV_NEXT4:%.*]], [[FOR_INC23:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; UNROLL-NEXT:    [[INEWCHUNKS_120:%.*]] = phi i32 [ [[INEWCHUNKS_2:%.*]], [[FOR_INC23]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; UNROLL-NEXT:    [[ARRAYIDX16:%.*]] = getelementptr inbounds [768 x i32], [768 x i32]* [[PTR]], i64 0, i64 [[INDVARS_IV3]]
-; UNROLL-NEXT:    [[TMP:%.*]] = load i32, i32* [[ARRAYIDX16]], align 4
+; UNROLL-NEXT:    [[ARRAYIDX16:%.*]] = getelementptr inbounds [768 x i32], ptr [[PTR]], i64 0, i64 [[INDVARS_IV3]]
+; UNROLL-NEXT:    [[TMP:%.*]] = load i32, ptr [[ARRAYIDX16]], align 4
 ; UNROLL-NEXT:    br i1 [[COND_2]], label [[IF_THEN18:%.*]], label [[FOR_INC23]]
 ; UNROLL:       if.then18:
-; UNROLL-NEXT:    store i32 [[TMP]], i32* [[ARRAYIDX16]], align 4
+; UNROLL-NEXT:    store i32 [[TMP]], ptr [[ARRAYIDX16]], align 4
 ; UNROLL-NEXT:    [[INC21:%.*]] = add nsw i32 [[INEWCHUNKS_120]], 1
 ; UNROLL-NEXT:    br label [[FOR_INC23]]
 ; UNROLL:       for.inc23:
@@ -282,37 +281,37 @@ define void @bug18724(i1 %cond, [768 x i32]* %ptr, i1 %cond.2, i64 %v.1, i32 %v.
 ; UNROLL-NOSIMPLIFY-NEXT:    [[IND_END:%.*]] = add i64 [[V_1]], [[N_VEC]]
 ; UNROLL-NOSIMPLIFY-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; UNROLL-NOSIMPLIFY:       vector.body:
-; UNROLL-NOSIMPLIFY-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE4:%.*]] ]
-; UNROLL-NOSIMPLIFY-NEXT:    [[VEC_PHI:%.*]] = phi i32 [ [[V_2:%.*]], [[VECTOR_PH]] ], [ [[PREDPHI:%.*]], [[PRED_STORE_CONTINUE4]] ]
-; UNROLL-NOSIMPLIFY-NEXT:    [[VEC_PHI2:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[PREDPHI5:%.*]], [[PRED_STORE_CONTINUE4]] ]
+; UNROLL-NOSIMPLIFY-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE3:%.*]] ]
+; UNROLL-NOSIMPLIFY-NEXT:    [[VEC_PHI:%.*]] = phi i32 [ [[V_2:%.*]], [[VECTOR_PH]] ], [ [[PREDPHI:%.*]], [[PRED_STORE_CONTINUE3]] ]
+; UNROLL-NOSIMPLIFY-NEXT:    [[VEC_PHI1:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[PREDPHI4:%.*]], [[PRED_STORE_CONTINUE3]] ]
 ; UNROLL-NOSIMPLIFY-NEXT:    [[OFFSET_IDX:%.*]] = add i64 [[V_1]], [[INDEX]]
 ; UNROLL-NOSIMPLIFY-NEXT:    br i1 [[COND_2:%.*]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
 ; UNROLL-NOSIMPLIFY:       pred.store.if:
-; UNROLL-NOSIMPLIFY-NEXT:    [[INDUCTION:%.*]] = add i64 [[OFFSET_IDX]], 0
-; UNROLL-NOSIMPLIFY-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [768 x i32], [768 x i32]* [[PTR:%.*]], i64 0, i64 [[INDUCTION]]
-; UNROLL-NOSIMPLIFY-NEXT:    [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4
-; UNROLL-NOSIMPLIFY-NEXT:    store i32 [[TMP5]], i32* [[TMP4]], align 4
+; UNROLL-NOSIMPLIFY-NEXT:    [[TMP4:%.*]] = add i64 [[OFFSET_IDX]], 0
+; UNROLL-NOSIMPLIFY-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [768 x i32], ptr [[PTR:%.*]], i64 0, i64 [[TMP4]]
+; UNROLL-NOSIMPLIFY-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4
+; UNROLL-NOSIMPLIFY-NEXT:    store i32 [[TMP6]], ptr [[TMP5]], align 4
 ; UNROLL-NOSIMPLIFY-NEXT:    br label [[PRED_STORE_CONTINUE]]
 ; UNROLL-NOSIMPLIFY:       pred.store.continue:
-; UNROLL-NOSIMPLIFY-NEXT:    br i1 [[COND_2]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4]]
+; UNROLL-NOSIMPLIFY-NEXT:    br i1 [[COND_2]], label [[PRED_STORE_IF2:%.*]], label [[PRED_STORE_CONTINUE3]]
 ; UNROLL-NOSIMPLIFY:       pred.store.if2:
-; UNROLL-NOSIMPLIFY-NEXT:    [[INDUCTION1:%.*]] = add i64 [[OFFSET_IDX]], 1
-; UNROLL-NOSIMPLIFY-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [768 x i32], [768 x i32]* [[PTR]], i64 0, i64 [[INDUCTION1]]
-; UNROLL-NOSIMPLIFY-NEXT:    [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4
-; UNROLL-NOSIMPLIFY-NEXT:    store i32 [[TMP7]], i32* [[TMP6]], align 4
-; UNROLL-NOSIMPLIFY-NEXT:    br label [[PRED_STORE_CONTINUE4]]
+; UNROLL-NOSIMPLIFY-NEXT:    [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], 1
+; UNROLL-NOSIMPLIFY-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [768 x i32], ptr [[PTR]], i64 0, i64 [[TMP7]]
+; UNROLL-NOSIMPLIFY-NEXT:    [[TMP9:%.*]] = load i32, ptr [[TMP8]], align 4
+; UNROLL-NOSIMPLIFY-NEXT:    store i32 [[TMP9]], ptr [[TMP8]], align 4
+; UNROLL-NOSIMPLIFY-NEXT:    br label [[PRED_STORE_CONTINUE3]]
 ; UNROLL-NOSIMPLIFY:       pred.store.continue3:
-; UNROLL-NOSIMPLIFY-NEXT:    [[TMP8:%.*]] = add i32 [[VEC_PHI]], 1
-; UNROLL-NOSIMPLIFY-NEXT:    [[TMP9:%.*]] = add i32 [[VEC_PHI2]], 1
-; UNROLL-NOSIMPLIFY-NEXT:    [[TMP10:%.*]] = xor i1 [[COND_2]], true
-; UNROLL-NOSIMPLIFY-NEXT:    [[TMP11:%.*]] = xor i1 [[COND_2]], true
-; UNROLL-NOSIMPLIFY-NEXT:    [[PREDPHI]] = select i1 [[TMP10]], i32 [[VEC_PHI]], i32 [[TMP8]]
-; UNROLL-NOSIMPLIFY-NEXT:    [[PREDPHI5]] = select i1 [[TMP11]], i32 [[VEC_PHI2]], i32 [[TMP9]]
+; UNROLL-NOSIMPLIFY-NEXT:    [[TMP10:%.*]] = add i32 [[VEC_PHI]], 1
+; UNROLL-NOSIMPLIFY-NEXT:    [[TMP11:%.*]] = add i32 [[VEC_PHI1]], 1
+; UNROLL-NOSIMPLIFY-NEXT:    [[TMP12:%.*]] = xor i1 [[COND_2]], true
+; UNROLL-NOSIMPLIFY-NEXT:    [[TMP13:%.*]] = xor i1 [[COND_2]], true
+; UNROLL-NOSIMPLIFY-NEXT:    [[PREDPHI]] = select i1 [[TMP12]], i32 [[VEC_PHI]], i32 [[TMP10]]
+; UNROLL-NOSIMPLIFY-NEXT:    [[PREDPHI4]] = select i1 [[TMP13]], i32 [[VEC_PHI1]], i32 [[TMP11]]
 ; UNROLL-NOSIMPLIFY-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; UNROLL-NOSIMPLIFY-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; UNROLL-NOSIMPLIFY-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; UNROLL-NOSIMPLIFY-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; UNROLL-NOSIMPLIFY-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
 ; UNROLL-NOSIMPLIFY:       middle.block:
-; UNROLL-NOSIMPLIFY-NEXT:    [[BIN_RDX:%.*]] = add i32 [[PREDPHI5]], [[PREDPHI]]
+; UNROLL-NOSIMPLIFY-NEXT:    [[BIN_RDX:%.*]] = add i32 [[PREDPHI4]], [[PREDPHI]]
 ; UNROLL-NOSIMPLIFY-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]]
 ; UNROLL-NOSIMPLIFY-NEXT:    br i1 [[CMP_N]], label [[FOR_INC26_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; UNROLL-NOSIMPLIFY:       scalar.ph:
@@ -322,11 +321,11 @@ define void @bug18724(i1 %cond, [768 x i32]* %ptr, i1 %cond.2, i64 %v.1, i32 %v.
 ; UNROLL-NOSIMPLIFY:       for.body14:
 ; UNROLL-NOSIMPLIFY-NEXT:    [[INDVARS_IV3:%.*]] = phi i64 [ [[INDVARS_IV_NEXT4:%.*]], [[FOR_INC23:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; UNROLL-NOSIMPLIFY-NEXT:    [[INEWCHUNKS_120:%.*]] = phi i32 [ [[INEWCHUNKS_2:%.*]], [[FOR_INC23]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; UNROLL-NOSIMPLIFY-NEXT:    [[ARRAYIDX16:%.*]] = getelementptr inbounds [768 x i32], [768 x i32]* [[PTR]], i64 0, i64 [[INDVARS_IV3]]
-; UNROLL-NOSIMPLIFY-NEXT:    [[TMP:%.*]] = load i32, i32* [[ARRAYIDX16]], align 4
+; UNROLL-NOSIMPLIFY-NEXT:    [[ARRAYIDX16:%.*]] = getelementptr inbounds [768 x i32], ptr [[PTR]], i64 0, i64 [[INDVARS_IV3]]
+; UNROLL-NOSIMPLIFY-NEXT:    [[TMP:%.*]] = load i32, ptr [[ARRAYIDX16]], align 4
 ; UNROLL-NOSIMPLIFY-NEXT:    br i1 [[COND_2]], label [[IF_THEN18:%.*]], label [[FOR_INC23]]
 ; UNROLL-NOSIMPLIFY:       if.then18:
-; UNROLL-NOSIMPLIFY-NEXT:    store i32 [[TMP]], i32* [[ARRAYIDX16]], align 4
+; UNROLL-NOSIMPLIFY-NEXT:    store i32 [[TMP]], ptr [[ARRAYIDX16]], align 4
 ; UNROLL-NOSIMPLIFY-NEXT:    [[INC21:%.*]] = add nsw i32 [[INEWCHUNKS_120]], 1
 ; UNROLL-NOSIMPLIFY-NEXT:    br label [[FOR_INC23]]
 ; UNROLL-NOSIMPLIFY:       for.inc23:
@@ -366,51 +365,50 @@ define void @bug18724(i1 %cond, [768 x i32]* %ptr, i1 %cond.2, i64 %v.1, i32 %v.
 ; VEC-NEXT:    [[VEC_PHI:%.*]] = phi <2 x i32> [ [[TMP5]], [[VECTOR_PH]] ], [ [[PREDPHI:%.*]], [[PRED_STORE_CONTINUE2]] ]
 ; VEC-NEXT:    [[OFFSET_IDX:%.*]] = add i64 [[V_1]], [[INDEX]]
 ; VEC-NEXT:    [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 0
-; VEC-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [768 x i32], [768 x i32]* [[PTR:%.*]], i64 0, i64 [[TMP6]]
-; VEC-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP7]], i32 0
-; VEC-NEXT:    [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <2 x i32>*
-; VEC-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i32>, <2 x i32>* [[TMP9]], align 4
-; VEC-NEXT:    [[TMP10:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT]], i32 0
-; VEC-NEXT:    br i1 [[TMP10]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; VEC-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [768 x i32], ptr [[PTR:%.*]], i64 0, i64 [[TMP6]]
+; VEC-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
+; VEC-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP8]], align 4
+; VEC-NEXT:    [[TMP9:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT]], i32 0
+; VEC-NEXT:    br i1 [[TMP9]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
 ; VEC:       pred.store.if:
-; VEC-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [768 x i32], [768 x i32]* [[PTR]], i64 0, i64 [[TMP6]]
-; VEC-NEXT:    [[TMP12:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 0
-; VEC-NEXT:    store i32 [[TMP12]], i32* [[TMP11]], align 4
+; VEC-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [768 x i32], ptr [[PTR]], i64 0, i64 [[TMP6]]
+; VEC-NEXT:    [[TMP11:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 0
+; VEC-NEXT:    store i32 [[TMP11]], ptr [[TMP10]], align 4
 ; VEC-NEXT:    br label [[PRED_STORE_CONTINUE]]
 ; VEC:       pred.store.continue:
-; VEC-NEXT:    [[TMP13:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT]], i32 1
-; VEC-NEXT:    br i1 [[TMP13]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2]]
+; VEC-NEXT:    [[TMP12:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT]], i32 1
+; VEC-NEXT:    br i1 [[TMP12]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2]]
 ; VEC:       pred.store.if1:
-; VEC-NEXT:    [[TMP14:%.*]] = add i64 [[OFFSET_IDX]], 1
-; VEC-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [768 x i32], [768 x i32]* [[PTR]], i64 0, i64 [[TMP14]]
-; VEC-NEXT:    [[TMP16:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 1
-; VEC-NEXT:    store i32 [[TMP16]], i32* [[TMP15]], align 4
+; VEC-NEXT:    [[TMP13:%.*]] = add i64 [[OFFSET_IDX]], 1
+; VEC-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [768 x i32], ptr [[PTR]], i64 0, i64 [[TMP13]]
+; VEC-NEXT:    [[TMP15:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 1
+; VEC-NEXT:    store i32 [[TMP15]], ptr [[TMP14]], align 4
 ; VEC-NEXT:    br label [[PRED_STORE_CONTINUE2]]
 ; VEC:       pred.store.continue2:
-; VEC-NEXT:    [[TMP17:%.*]] = add <2 x i32> [[VEC_PHI]], <i32 1, i32 1>
-; VEC-NEXT:    [[TMP18:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT]], <i1 true, i1 true>
-; VEC-NEXT:    [[PREDPHI]] = select <2 x i1> [[TMP18]], <2 x i32> [[VEC_PHI]], <2 x i32> [[TMP17]]
+; VEC-NEXT:    [[TMP16:%.*]] = add <2 x i32> [[VEC_PHI]], <i32 1, i32 1>
+; VEC-NEXT:    [[TMP17:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT]], <i1 true, i1 true>
+; VEC-NEXT:    [[PREDPHI]] = select <2 x i1> [[TMP17]], <2 x i32> [[VEC_PHI]], <2 x i32> [[TMP16]]
 ; VEC-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; VEC-NEXT:    [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; VEC-NEXT:    br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; VEC-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; VEC-NEXT:    br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; VEC:       middle.block:
-; VEC-NEXT:    [[TMP20:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[PREDPHI]])
+; VEC-NEXT:    [[TMP19:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[PREDPHI]])
 ; VEC-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP4]], [[N_VEC]]
-; VEC-NEXT:    [[TMP21:%.*]] = xor i1 [[CMP_N]], true
-; VEC-NEXT:    call void @llvm.assume(i1 [[TMP21]])
+; VEC-NEXT:    [[TMP20:%.*]] = xor i1 [[CMP_N]], true
+; VEC-NEXT:    call void @llvm.assume(i1 [[TMP20]])
 ; VEC-NEXT:    br label [[SCALAR_PH]]
 ; VEC:       scalar.ph:
 ; VEC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[V_1]], [[ENTRY:%.*]] ]
-; VEC-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[V_2]], [[ENTRY]] ], [ [[TMP20]], [[MIDDLE_BLOCK]] ]
+; VEC-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[V_2]], [[ENTRY]] ], [ [[TMP19]], [[MIDDLE_BLOCK]] ]
 ; VEC-NEXT:    br label [[FOR_BODY14:%.*]]
 ; VEC:       for.body14:
 ; VEC-NEXT:    [[INDVARS_IV3:%.*]] = phi i64 [ [[INDVARS_IV_NEXT4:%.*]], [[FOR_INC23:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; VEC-NEXT:    [[INEWCHUNKS_120:%.*]] = phi i32 [ [[INEWCHUNKS_2:%.*]], [[FOR_INC23]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; VEC-NEXT:    [[ARRAYIDX16:%.*]] = getelementptr inbounds [768 x i32], [768 x i32]* [[PTR]], i64 0, i64 [[INDVARS_IV3]]
-; VEC-NEXT:    [[TMP:%.*]] = load i32, i32* [[ARRAYIDX16]], align 4
+; VEC-NEXT:    [[ARRAYIDX16:%.*]] = getelementptr inbounds [768 x i32], ptr [[PTR]], i64 0, i64 [[INDVARS_IV3]]
+; VEC-NEXT:    [[TMP:%.*]] = load i32, ptr [[ARRAYIDX16]], align 4
 ; VEC-NEXT:    br i1 [[COND_2]], label [[IF_THEN18:%.*]], label [[FOR_INC23]]
 ; VEC:       if.then18:
-; VEC-NEXT:    store i32 [[TMP]], i32* [[ARRAYIDX16]], align 4
+; VEC-NEXT:    store i32 [[TMP]], ptr [[ARRAYIDX16]], align 4
 ; VEC-NEXT:    [[INC21:%.*]] = add nsw i32 [[INEWCHUNKS_120]], 1
 ; VEC-NEXT:    br label [[FOR_INC23]]
 ; VEC:       for.inc23:
@@ -430,12 +428,12 @@ for.body9:
 for.body14:
   %indvars.iv3 = phi i64 [ %indvars.iv.next4, %for.inc23 ], [ %v.1, %for.body9 ]
   %iNewChunks.120 = phi i32 [ %iNewChunks.2, %for.inc23 ], [ %v.2, %for.body9 ]
-  %arrayidx16 = getelementptr inbounds [768 x i32], [768 x i32]* %ptr, i64 0, i64 %indvars.iv3
-  %tmp = load i32, i32* %arrayidx16, align 4
+  %arrayidx16 = getelementptr inbounds [768 x i32], ptr %ptr, i64 0, i64 %indvars.iv3
+  %tmp = load i32, ptr %arrayidx16, align 4
   br i1 %cond.2, label %if.then18, label %for.inc23
 
 if.then18:
-  store i32 %tmp, i32* %arrayidx16, align 4
+  store i32 %tmp, ptr %arrayidx16, align 4
   %inc21 = add nsw i32 %iNewChunks.120, 1
   br label %for.inc23
 
@@ -460,39 +458,39 @@ define void @minimal_bit_widths(i1 %c) {
 ; UNROLL-NEXT:  entry:
 ; UNROLL-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; UNROLL:       vector.body:
-; UNROLL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE6:%.*]] ]
-; UNROLL-NEXT:    br i1 [[C:%.*]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE6]]
+; UNROLL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE3:%.*]] ]
+; UNROLL-NEXT:    br i1 [[C:%.*]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE3]]
 ; UNROLL:       pred.store.if:
-; UNROLL-NEXT:    [[INDUCTION3:%.*]] = add i64 [[INDEX]], 0
-; UNROLL-NEXT:    [[TMP0:%.*]] = getelementptr i8, i8* undef, i64 [[INDUCTION3]]
-; UNROLL-NEXT:    [[TMP1:%.*]] = load i8, i8* [[TMP0]], align 1
-; UNROLL-NEXT:    [[TMP2:%.*]] = zext i8 [[TMP1]] to i32
-; UNROLL-NEXT:    [[TMP3:%.*]] = trunc i32 [[TMP2]] to i8
-; UNROLL-NEXT:    store i8 [[TMP3]], i8* [[TMP0]], align 1
-; UNROLL-NEXT:    [[INDUCTION4:%.*]] = add i64 [[INDEX]], 1
-; UNROLL-NEXT:    [[TMP4:%.*]] = getelementptr i8, i8* undef, i64 [[INDUCTION4]]
-; UNROLL-NEXT:    [[TMP5:%.*]] = load i8, i8* [[TMP4]], align 1
-; UNROLL-NEXT:    [[TMP6:%.*]] = zext i8 [[TMP5]] to i32
-; UNROLL-NEXT:    [[TMP7:%.*]] = trunc i32 [[TMP6]] to i8
-; UNROLL-NEXT:    store i8 [[TMP7]], i8* [[TMP4]], align 1
-; UNROLL-NEXT:    br label [[PRED_STORE_CONTINUE6]]
+; UNROLL-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; UNROLL-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr undef, i64 [[TMP0]]
+; UNROLL-NEXT:    [[TMP2:%.*]] = load i8, ptr [[TMP1]], align 1
+; UNROLL-NEXT:    [[TMP3:%.*]] = zext i8 [[TMP2]] to i32
+; UNROLL-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i8
+; UNROLL-NEXT:    store i8 [[TMP4]], ptr [[TMP1]], align 1
+; UNROLL-NEXT:    [[TMP5:%.*]] = add i64 [[INDEX]], 1
+; UNROLL-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr undef, i64 [[TMP5]]
+; UNROLL-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP6]], align 1
+; UNROLL-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP7]] to i32
+; UNROLL-NEXT:    [[TMP9:%.*]] = trunc i32 [[TMP8]] to i8
+; UNROLL-NEXT:    store i8 [[TMP9]], ptr [[TMP6]], align 1
+; UNROLL-NEXT:    br label [[PRED_STORE_CONTINUE3]]
 ; UNROLL:       pred.store.continue3:
 ; UNROLL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; UNROLL-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], undef
-; UNROLL-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; UNROLL-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], undef
+; UNROLL-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; UNROLL:       middle.block:
 ; UNROLL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 undef, undef
 ; UNROLL-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[FOR_BODY:%.*]]
 ; UNROLL:       for.body:
 ; UNROLL-NEXT:    [[TMP0:%.*]] = phi i64 [ [[TMP6:%.*]], [[FOR_INC:%.*]] ], [ undef, [[MIDDLE_BLOCK]] ]
 ; UNROLL-NEXT:    [[TMP1:%.*]] = phi i64 [ [[TMP7:%.*]], [[FOR_INC]] ], [ undef, [[MIDDLE_BLOCK]] ]
-; UNROLL-NEXT:    [[TMP2:%.*]] = getelementptr i8, i8* undef, i64 [[TMP0]]
-; UNROLL-NEXT:    [[TMP3:%.*]] = load i8, i8* [[TMP2]], align 1
+; UNROLL-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr undef, i64 [[TMP0]]
+; UNROLL-NEXT:    [[TMP3:%.*]] = load i8, ptr [[TMP2]], align 1
 ; UNROLL-NEXT:    br i1 [[C]], label [[IF_THEN:%.*]], label [[FOR_INC]]
 ; UNROLL:       if.then:
 ; UNROLL-NEXT:    [[TMP4:%.*]] = zext i8 [[TMP3]] to i32
 ; UNROLL-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8
-; UNROLL-NEXT:    store i8 [[TMP5]], i8* [[TMP2]], align 1
+; UNROLL-NEXT:    store i8 [[TMP5]], ptr [[TMP2]], align 1
 ; UNROLL-NEXT:    br label [[FOR_INC]]
 ; UNROLL:       for.inc:
 ; UNROLL-NEXT:    [[TMP6]] = add nuw nsw i64 [[TMP0]], 1
@@ -508,30 +506,30 @@ define void @minimal_bit_widths(i1 %c) {
 ; UNROLL-NOSIMPLIFY:       vector.ph:
 ; UNROLL-NOSIMPLIFY-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; UNROLL-NOSIMPLIFY:       vector.body:
-; UNROLL-NOSIMPLIFY-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE6:%.*]] ]
+; UNROLL-NOSIMPLIFY-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE3:%.*]] ]
 ; UNROLL-NOSIMPLIFY-NEXT:    br i1 [[C:%.*]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
 ; UNROLL-NOSIMPLIFY:       pred.store.if:
-; UNROLL-NOSIMPLIFY-NEXT:    [[INDUCTION3:%.*]] = add i64 [[INDEX]], 0
-; UNROLL-NOSIMPLIFY-NEXT:    [[TMP0:%.*]] = getelementptr i8, i8* undef, i64 [[INDUCTION3]]
-; UNROLL-NOSIMPLIFY-NEXT:    [[TMP1:%.*]] = load i8, i8* [[TMP0]], align 1
-; UNROLL-NOSIMPLIFY-NEXT:    [[TMP2:%.*]] = zext i8 [[TMP1]] to i32
-; UNROLL-NOSIMPLIFY-NEXT:    [[TMP3:%.*]] = trunc i32 [[TMP2]] to i8
-; UNROLL-NOSIMPLIFY-NEXT:    store i8 [[TMP3]], i8* [[TMP0]], align 1
+; UNROLL-NOSIMPLIFY-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; UNROLL-NOSIMPLIFY-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr undef, i64 [[TMP0]]
+; UNROLL-NOSIMPLIFY-NEXT:    [[TMP2:%.*]] = load i8, ptr [[TMP1]], align 1
+; UNROLL-NOSIMPLIFY-NEXT:    [[TMP3:%.*]] = zext i8 [[TMP2]] to i32
+; UNROLL-NOSIMPLIFY-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i8
+; UNROLL-NOSIMPLIFY-NEXT:    store i8 [[TMP4]], ptr [[TMP1]], align 1
 ; UNROLL-NOSIMPLIFY-NEXT:    br label [[PRED_STORE_CONTINUE]]
 ; UNROLL-NOSIMPLIFY:       pred.store.continue:
-; UNROLL-NOSIMPLIFY-NEXT:    br i1 [[C]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6]]
+; UNROLL-NOSIMPLIFY-NEXT:    br i1 [[C]], label [[PRED_STORE_IF2:%.*]], label [[PRED_STORE_CONTINUE3]]
 ; UNROLL-NOSIMPLIFY:       pred.store.if2:
-; UNROLL-NOSIMPLIFY-NEXT:    [[INDUCTION2:%.*]] = add i64 [[INDEX]], 1
-; UNROLL-NOSIMPLIFY-NEXT:    [[TMP4:%.*]] = getelementptr i8, i8* undef, i64 [[INDUCTION2]]
-; UNROLL-NOSIMPLIFY-NEXT:    [[TMP5:%.*]] = load i8, i8* [[TMP4]], align 1
-; UNROLL-NOSIMPLIFY-NEXT:    [[TMP6:%.*]] = zext i8 [[TMP5]] to i32
-; UNROLL-NOSIMPLIFY-NEXT:    [[TMP7:%.*]] = trunc i32 [[TMP6]] to i8
-; UNROLL-NOSIMPLIFY-NEXT:    store i8 [[TMP7]], i8* [[TMP4]], align 1
-; UNROLL-NOSIMPLIFY-NEXT:    br label [[PRED_STORE_CONTINUE6]]
+; UNROLL-NOSIMPLIFY-NEXT:    [[TMP5:%.*]] = add i64 [[INDEX]], 1
+; UNROLL-NOSIMPLIFY-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr undef, i64 [[TMP5]]
+; UNROLL-NOSIMPLIFY-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP6]], align 1
+; UNROLL-NOSIMPLIFY-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP7]] to i32
+; UNROLL-NOSIMPLIFY-NEXT:    [[TMP9:%.*]] = trunc i32 [[TMP8]] to i8
+; UNROLL-NOSIMPLIFY-NEXT:    store i8 [[TMP9]], ptr [[TMP6]], align 1
+; UNROLL-NOSIMPLIFY-NEXT:    br label [[PRED_STORE_CONTINUE3]]
 ; UNROLL-NOSIMPLIFY:       pred.store.continue3:
 ; UNROLL-NOSIMPLIFY-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; UNROLL-NOSIMPLIFY-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], undef
-; UNROLL-NOSIMPLIFY-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; UNROLL-NOSIMPLIFY-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], undef
+; UNROLL-NOSIMPLIFY-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; UNROLL-NOSIMPLIFY:       middle.block:
 ; UNROLL-NOSIMPLIFY-NEXT:    [[CMP_N:%.*]] = icmp eq i64 undef, undef
 ; UNROLL-NOSIMPLIFY-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -542,13 +540,13 @@ define void @minimal_bit_widths(i1 %c) {
 ; UNROLL-NOSIMPLIFY:       for.body:
 ; UNROLL-NOSIMPLIFY-NEXT:    [[TMP0:%.*]] = phi i64 [ [[TMP6:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; UNROLL-NOSIMPLIFY-NEXT:    [[TMP1:%.*]] = phi i64 [ [[TMP7:%.*]], [[FOR_INC]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ]
-; UNROLL-NOSIMPLIFY-NEXT:    [[TMP2:%.*]] = getelementptr i8, i8* undef, i64 [[TMP0]]
-; UNROLL-NOSIMPLIFY-NEXT:    [[TMP3:%.*]] = load i8, i8* [[TMP2]], align 1
+; UNROLL-NOSIMPLIFY-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr undef, i64 [[TMP0]]
+; UNROLL-NOSIMPLIFY-NEXT:    [[TMP3:%.*]] = load i8, ptr [[TMP2]], align 1
 ; UNROLL-NOSIMPLIFY-NEXT:    br i1 [[C]], label [[IF_THEN:%.*]], label [[FOR_INC]]
 ; UNROLL-NOSIMPLIFY:       if.then:
 ; UNROLL-NOSIMPLIFY-NEXT:    [[TMP4:%.*]] = zext i8 [[TMP3]] to i32
 ; UNROLL-NOSIMPLIFY-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8
-; UNROLL-NOSIMPLIFY-NEXT:    store i8 [[TMP5]], i8* [[TMP2]], align 1
+; UNROLL-NOSIMPLIFY-NEXT:    store i8 [[TMP5]], ptr [[TMP2]], align 1
 ; UNROLL-NOSIMPLIFY-NEXT:    br label [[FOR_INC]]
 ; UNROLL-NOSIMPLIFY:       for.inc:
 ; UNROLL-NOSIMPLIFY-NEXT:    [[TMP6]] = add nuw nsw i64 [[TMP0]], 1
@@ -566,47 +564,46 @@ define void @minimal_bit_widths(i1 %c) {
 ; VEC:       vector.body:
 ; VEC-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE3:%.*]] ]
 ; VEC-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; VEC-NEXT:    [[TMP2:%.*]] = getelementptr i8, i8* undef, i64 [[TMP0]]
-; VEC-NEXT:    [[TMP3:%.*]] = getelementptr i8, i8* [[TMP2]], i32 0
-; VEC-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to <2 x i8>*
-; VEC-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i8>, <2 x i8>* [[TMP4]], align 1
-; VEC-NEXT:    [[TMP5:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT]], i32 0
-; VEC-NEXT:    br i1 [[TMP5]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; VEC-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr undef, i64 [[TMP0]]
+; VEC-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0
+; VEC-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i8>, ptr [[TMP2]], align 1
+; VEC-NEXT:    [[TMP3:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT]], i32 0
+; VEC-NEXT:    br i1 [[TMP3]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
 ; VEC:       pred.store.if:
-; VEC-NEXT:    [[TMP6:%.*]] = extractelement <2 x i8> [[WIDE_LOAD]], i32 0
-; VEC-NEXT:    [[TMP7:%.*]] = zext i8 [[TMP6]] to i32
-; VEC-NEXT:    [[TMP8:%.*]] = trunc i32 [[TMP7]] to i8
-; VEC-NEXT:    [[TMP9:%.*]] = getelementptr i8, i8* undef, i64 [[TMP0]]
-; VEC-NEXT:    store i8 [[TMP8]], i8* [[TMP9]], align 1
+; VEC-NEXT:    [[TMP4:%.*]] = extractelement <2 x i8> [[WIDE_LOAD]], i32 0
+; VEC-NEXT:    [[TMP5:%.*]] = zext i8 [[TMP4]] to i32
+; VEC-NEXT:    [[TMP6:%.*]] = trunc i32 [[TMP5]] to i8
+; VEC-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr undef, i64 [[TMP0]]
+; VEC-NEXT:    store i8 [[TMP6]], ptr [[TMP7]], align 1
 ; VEC-NEXT:    br label [[PRED_STORE_CONTINUE]]
 ; VEC:       pred.store.continue:
-; VEC-NEXT:    [[TMP10:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT]], i32 1
-; VEC-NEXT:    br i1 [[TMP10]], label [[PRED_STORE_IF2:%.*]], label [[PRED_STORE_CONTINUE3]]
+; VEC-NEXT:    [[TMP8:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT]], i32 1
+; VEC-NEXT:    br i1 [[TMP8]], label [[PRED_STORE_IF2:%.*]], label [[PRED_STORE_CONTINUE3]]
 ; VEC:       pred.store.if2:
-; VEC-NEXT:    [[TMP11:%.*]] = add i64 [[INDEX]], 1
-; VEC-NEXT:    [[TMP12:%.*]] = extractelement <2 x i8> [[WIDE_LOAD]], i32 1
-; VEC-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP12]] to i32
-; VEC-NEXT:    [[TMP14:%.*]] = trunc i32 [[TMP13]] to i8
-; VEC-NEXT:    [[TMP15:%.*]] = getelementptr i8, i8* undef, i64 [[TMP11]]
-; VEC-NEXT:    store i8 [[TMP14]], i8* [[TMP15]], align 1
+; VEC-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX]], 1
+; VEC-NEXT:    [[TMP10:%.*]] = extractelement <2 x i8> [[WIDE_LOAD]], i32 1
+; VEC-NEXT:    [[TMP11:%.*]] = zext i8 [[TMP10]] to i32
+; VEC-NEXT:    [[TMP12:%.*]] = trunc i32 [[TMP11]] to i8
+; VEC-NEXT:    [[TMP13:%.*]] = getelementptr i8, ptr undef, i64 [[TMP9]]
+; VEC-NEXT:    store i8 [[TMP12]], ptr [[TMP13]], align 1
 ; VEC-NEXT:    br label [[PRED_STORE_CONTINUE3]]
 ; VEC:       pred.store.continue3:
 ; VEC-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; VEC-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], undef
-; VEC-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; VEC-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], undef
+; VEC-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; VEC:       middle.block:
 ; VEC-NEXT:    [[CMP_N:%.*]] = icmp eq i64 undef, undef
 ; VEC-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[FOR_BODY:%.*]]
 ; VEC:       for.body:
 ; VEC-NEXT:    [[TMP0:%.*]] = phi i64 [ [[TMP6:%.*]], [[FOR_INC:%.*]] ], [ undef, [[MIDDLE_BLOCK]] ]
 ; VEC-NEXT:    [[TMP1:%.*]] = phi i64 [ [[TMP7:%.*]], [[FOR_INC]] ], [ undef, [[MIDDLE_BLOCK]] ]
-; VEC-NEXT:    [[TMP2:%.*]] = getelementptr i8, i8* undef, i64 [[TMP0]]
-; VEC-NEXT:    [[TMP3:%.*]] = load i8, i8* [[TMP2]], align 1
+; VEC-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr undef, i64 [[TMP0]]
+; VEC-NEXT:    [[TMP3:%.*]] = load i8, ptr [[TMP2]], align 1
 ; VEC-NEXT:    br i1 [[C]], label [[IF_THEN:%.*]], label [[FOR_INC]]
 ; VEC:       if.then:
 ; VEC-NEXT:    [[TMP4:%.*]] = zext i8 [[TMP3]] to i32
 ; VEC-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8
-; VEC-NEXT:    store i8 [[TMP5]], i8* [[TMP2]], align 1
+; VEC-NEXT:    store i8 [[TMP5]], ptr [[TMP2]], align 1
 ; VEC-NEXT:    br label [[FOR_INC]]
 ; VEC:       for.inc:
 ; VEC-NEXT:    [[TMP6]] = add nuw nsw i64 [[TMP0]], 1
@@ -622,14 +619,14 @@ entry:
 for.body:
   %tmp0 = phi i64 [ %tmp6, %for.inc ], [ 0, %entry ]
   %tmp1 = phi i64 [ %tmp7, %for.inc ], [ undef, %entry ]
-  %tmp2 = getelementptr i8, i8* undef, i64 %tmp0
-  %tmp3 = load i8, i8* %tmp2, align 1
+  %tmp2 = getelementptr i8, ptr undef, i64 %tmp0
+  %tmp3 = load i8, ptr %tmp2, align 1
   br i1 %c, label %if.then, label %for.inc
 
 if.then:
   %tmp4 = zext i8 %tmp3 to i32
   %tmp5 = trunc i32 %tmp4 to i8
-  store i8 %tmp5, i8* %tmp2, align 1
+  store i8 %tmp5, ptr %tmp2, align 1
   br label %for.inc
 
 for.inc:
@@ -642,21 +639,21 @@ for.end:
   ret void
 }
 
-define void @minimal_bit_widths_with_aliasing_store(i1 %c, i8* %ptr) {
+define void @minimal_bit_widths_with_aliasing_store(i1 %c, ptr %ptr) {
 ; UNROLL-LABEL: @minimal_bit_widths_with_aliasing_store(
 ; UNROLL-NEXT:  entry:
 ; UNROLL-NEXT:    br label [[FOR_BODY:%.*]]
 ; UNROLL:       for.body:
 ; UNROLL-NEXT:    [[TMP0:%.*]] = phi i64 [ [[TMP6:%.*]], [[FOR_INC:%.*]] ], [ 0, [[ENTRY:%.*]] ]
 ; UNROLL-NEXT:    [[TMP1:%.*]] = phi i64 [ [[TMP7:%.*]], [[FOR_INC]] ], [ 0, [[ENTRY]] ]
-; UNROLL-NEXT:    [[TMP2:%.*]] = getelementptr i8, i8* [[PTR:%.*]], i64 [[TMP0]]
-; UNROLL-NEXT:    [[TMP3:%.*]] = load i8, i8* [[TMP2]], align 1
-; UNROLL-NEXT:    store i8 0, i8* [[TMP2]], align 1
+; UNROLL-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[PTR:%.*]], i64 [[TMP0]]
+; UNROLL-NEXT:    [[TMP3:%.*]] = load i8, ptr [[TMP2]], align 1
+; UNROLL-NEXT:    store i8 0, ptr [[TMP2]], align 1
 ; UNROLL-NEXT:    br i1 [[C:%.*]], label [[IF_THEN:%.*]], label [[FOR_INC]]
 ; UNROLL:       if.then:
 ; UNROLL-NEXT:    [[TMP4:%.*]] = zext i8 [[TMP3]] to i32
 ; UNROLL-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8
-; UNROLL-NEXT:    store i8 [[TMP5]], i8* [[TMP2]], align 1
+; UNROLL-NEXT:    store i8 [[TMP5]], ptr [[TMP2]], align 1
 ; UNROLL-NEXT:    br label [[FOR_INC]]
 ; UNROLL:       for.inc:
 ; UNROLL-NEXT:    [[TMP6]] = add nuw nsw i64 [[TMP0]], 1
@@ -672,32 +669,32 @@ define void @minimal_bit_widths_with_aliasing_store(i1 %c, i8* %ptr) {
 ; UNROLL-NOSIMPLIFY:       vector.ph:
 ; UNROLL-NOSIMPLIFY-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; UNROLL-NOSIMPLIFY:       vector.body:
-; UNROLL-NOSIMPLIFY-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE6:%.*]] ]
-; UNROLL-NOSIMPLIFY-NEXT:    [[INDUCTION:%.*]] = add i64 [[INDEX]], 0
-; UNROLL-NOSIMPLIFY-NEXT:    [[INDUCTION2:%.*]] = add i64 [[INDEX]], 1
-; UNROLL-NOSIMPLIFY-NEXT:    [[TMP0:%.*]] = getelementptr i8, i8* [[PTR:%.*]], i64 [[INDUCTION]]
-; UNROLL-NOSIMPLIFY-NEXT:    [[TMP1:%.*]] = getelementptr i8, i8* [[PTR]], i64 [[INDUCTION2]]
-; UNROLL-NOSIMPLIFY-NEXT:    store i8 0, i8* [[TMP0]], align 1
-; UNROLL-NOSIMPLIFY-NEXT:    store i8 0, i8* [[TMP1]], align 1
+; UNROLL-NOSIMPLIFY-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE3:%.*]] ]
+; UNROLL-NOSIMPLIFY-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; UNROLL-NOSIMPLIFY-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 1
+; UNROLL-NOSIMPLIFY-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[PTR:%.*]], i64 [[TMP0]]
+; UNROLL-NOSIMPLIFY-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[TMP1]]
+; UNROLL-NOSIMPLIFY-NEXT:    store i8 0, ptr [[TMP2]], align 1
+; UNROLL-NOSIMPLIFY-NEXT:    store i8 0, ptr [[TMP3]], align 1
 ; UNROLL-NOSIMPLIFY-NEXT:    br i1 [[C:%.*]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
 ; UNROLL-NOSIMPLIFY:       pred.store.if:
-; UNROLL-NOSIMPLIFY-NEXT:    [[TMP2:%.*]] = load i8, i8* [[TMP0]], align 1
-; UNROLL-NOSIMPLIFY-NEXT:    [[TMP3:%.*]] = zext i8 [[TMP2]] to i32
-; UNROLL-NOSIMPLIFY-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i8
-; UNROLL-NOSIMPLIFY-NEXT:    store i8 [[TMP4]], i8* [[TMP0]], align 1
+; UNROLL-NOSIMPLIFY-NEXT:    [[TMP4:%.*]] = load i8, ptr [[TMP2]], align 1
+; UNROLL-NOSIMPLIFY-NEXT:    [[TMP5:%.*]] = zext i8 [[TMP4]] to i32
+; UNROLL-NOSIMPLIFY-NEXT:    [[TMP6:%.*]] = trunc i32 [[TMP5]] to i8
+; UNROLL-NOSIMPLIFY-NEXT:    store i8 [[TMP6]], ptr [[TMP2]], align 1
 ; UNROLL-NOSIMPLIFY-NEXT:    br label [[PRED_STORE_CONTINUE]]
 ; UNROLL-NOSIMPLIFY:       pred.store.continue:
-; UNROLL-NOSIMPLIFY-NEXT:    br i1 [[C]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6]]
+; UNROLL-NOSIMPLIFY-NEXT:    br i1 [[C]], label [[PRED_STORE_IF2:%.*]], label [[PRED_STORE_CONTINUE3]]
 ; UNROLL-NOSIMPLIFY:       pred.store.if2:
-; UNROLL-NOSIMPLIFY-NEXT:    [[TMP5:%.*]] = load i8, i8* [[TMP1]], align 1
-; UNROLL-NOSIMPLIFY-NEXT:    [[TMP6:%.*]] = zext i8 [[TMP5]] to i32
-; UNROLL-NOSIMPLIFY-NEXT:    [[TMP7:%.*]] = trunc i32 [[TMP6]] to i8
-; UNROLL-NOSIMPLIFY-NEXT:    store i8 [[TMP7]], i8* [[TMP1]], align 1
-; UNROLL-NOSIMPLIFY-NEXT:    br label [[PRED_STORE_CONTINUE6]]
+; UNROLL-NOSIMPLIFY-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP3]], align 1
+; UNROLL-NOSIMPLIFY-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP7]] to i32
+; UNROLL-NOSIMPLIFY-NEXT:    [[TMP9:%.*]] = trunc i32 [[TMP8]] to i8
+; UNROLL-NOSIMPLIFY-NEXT:    store i8 [[TMP9]], ptr [[TMP3]], align 1
+; UNROLL-NOSIMPLIFY-NEXT:    br label [[PRED_STORE_CONTINUE3]]
 ; UNROLL-NOSIMPLIFY:       pred.store.continue3:
 ; UNROLL-NOSIMPLIFY-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; UNROLL-NOSIMPLIFY-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0
-; UNROLL-NOSIMPLIFY-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; UNROLL-NOSIMPLIFY-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0
+; UNROLL-NOSIMPLIFY-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
 ; UNROLL-NOSIMPLIFY:       middle.block:
 ; UNROLL-NOSIMPLIFY-NEXT:    [[CMP_N:%.*]] = icmp eq i64 0, 0
 ; UNROLL-NOSIMPLIFY-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -708,14 +705,14 @@ define void @minimal_bit_widths_with_aliasing_store(i1 %c, i8* %ptr) {
 ; UNROLL-NOSIMPLIFY:       for.body:
 ; UNROLL-NOSIMPLIFY-NEXT:    [[TMP0:%.*]] = phi i64 [ [[TMP6:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; UNROLL-NOSIMPLIFY-NEXT:    [[TMP1:%.*]] = phi i64 [ [[TMP7:%.*]], [[FOR_INC]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ]
-; UNROLL-NOSIMPLIFY-NEXT:    [[TMP2:%.*]] = getelementptr i8, i8* [[PTR]], i64 [[TMP0]]
-; UNROLL-NOSIMPLIFY-NEXT:    [[TMP3:%.*]] = load i8, i8* [[TMP2]], align 1
-; UNROLL-NOSIMPLIFY-NEXT:    store i8 0, i8* [[TMP2]], align 1
+; UNROLL-NOSIMPLIFY-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[TMP0]]
+; UNROLL-NOSIMPLIFY-NEXT:    [[TMP3:%.*]] = load i8, ptr [[TMP2]], align 1
+; UNROLL-NOSIMPLIFY-NEXT:    store i8 0, ptr [[TMP2]], align 1
 ; UNROLL-NOSIMPLIFY-NEXT:    br i1 [[C]], label [[IF_THEN:%.*]], label [[FOR_INC]]
 ; UNROLL-NOSIMPLIFY:       if.then:
 ; UNROLL-NOSIMPLIFY-NEXT:    [[TMP4:%.*]] = zext i8 [[TMP3]] to i32
 ; UNROLL-NOSIMPLIFY-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8
-; UNROLL-NOSIMPLIFY-NEXT:    store i8 [[TMP5]], i8* [[TMP2]], align 1
+; UNROLL-NOSIMPLIFY-NEXT:    store i8 [[TMP5]], ptr [[TMP2]], align 1
 ; UNROLL-NOSIMPLIFY-NEXT:    br label [[FOR_INC]]
 ; UNROLL-NOSIMPLIFY:       for.inc:
 ; UNROLL-NOSIMPLIFY-NEXT:    [[TMP6]] = add nuw nsw i64 [[TMP0]], 1
@@ -731,14 +728,14 @@ define void @minimal_bit_widths_with_aliasing_store(i1 %c, i8* %ptr) {
 ; VEC:       for.body:
 ; VEC-NEXT:    [[TMP0:%.*]] = phi i64 [ [[TMP6:%.*]], [[FOR_INC:%.*]] ], [ 0, [[ENTRY:%.*]] ]
 ; VEC-NEXT:    [[TMP1:%.*]] = phi i64 [ [[TMP7:%.*]], [[FOR_INC]] ], [ 0, [[ENTRY]] ]
-; VEC-NEXT:    [[TMP2:%.*]] = getelementptr i8, i8* [[PTR:%.*]], i64 [[TMP0]]
-; VEC-NEXT:    [[TMP3:%.*]] = load i8, i8* [[TMP2]], align 1
-; VEC-NEXT:    store i8 0, i8* [[TMP2]], align 1
+; VEC-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[PTR:%.*]], i64 [[TMP0]]
+; VEC-NEXT:    [[TMP3:%.*]] = load i8, ptr [[TMP2]], align 1
+; VEC-NEXT:    store i8 0, ptr [[TMP2]], align 1
 ; VEC-NEXT:    br i1 [[C:%.*]], label [[IF_THEN:%.*]], label [[FOR_INC]]
 ; VEC:       if.then:
 ; VEC-NEXT:    [[TMP4:%.*]] = zext i8 [[TMP3]] to i32
 ; VEC-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8
-; VEC-NEXT:    store i8 [[TMP5]], i8* [[TMP2]], align 1
+; VEC-NEXT:    store i8 [[TMP5]], ptr [[TMP2]], align 1
 ; VEC-NEXT:    br label [[FOR_INC]]
 ; VEC:       for.inc:
 ; VEC-NEXT:    [[TMP6]] = add nuw nsw i64 [[TMP0]], 1
@@ -754,15 +751,15 @@ entry:
 for.body:
   %tmp0 = phi i64 [ %tmp6, %for.inc ], [ 0, %entry ]
   %tmp1 = phi i64 [ %tmp7, %for.inc ], [ 0, %entry ]
-  %tmp2 = getelementptr i8, i8* %ptr, i64 %tmp0
-  %tmp3 = load i8, i8* %tmp2, align 1
-  store i8 0, i8* %tmp2
+  %tmp2 = getelementptr i8, ptr %ptr, i64 %tmp0
+  %tmp3 = load i8, ptr %tmp2, align 1
+  store i8 0, ptr %tmp2
   br i1 %c, label %if.then, label %for.inc
 
 if.then:
   %tmp4 = zext i8 %tmp3 to i32
   %tmp5 = trunc i32 %tmp4 to i8
-  store i8 %tmp5, i8* %tmp2, align 1
+  store i8 %tmp5, ptr %tmp2, align 1
   br label %for.inc
 
 for.inc:
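
As a minimal sketch of the conversion reflected in the regenerated checks above (the names %p, %i and %v here are illustrative only, not taken from any test in this patch): with typed pointers, a widened memory access needs an explicit bitcast to the vector pointer type,

    %gep = getelementptr inbounds i32, i32* %p, i64 %i
    %cast = bitcast i32* %gep to <2 x i32>*
    %v = load <2 x i32>, <2 x i32>* %cast, align 4

while with opaque pointers the same access is simply

    %gep = getelementptr inbounds i32, ptr %p, i64 %i
    %v = load <2 x i32>, ptr %gep, align 4

Dropping the now-redundant pointer bitcasts removes instructions from the output, which shifts the numbering of the unnamed %N values and therefore the [[TMPn]] FileCheck variables in the check lines.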

diff --git a/llvm/test/Transforms/LoopVectorize/induction.ll b/llvm/test/Transforms/LoopVectorize/induction.ll
index 6b7f09a37a48..c496642e499e 100644
--- a/llvm/test/Transforms/LoopVectorize/induction.ll
+++ b/llvm/test/Transforms/LoopVectorize/induction.ll
@@ -9,7 +9,7 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
 
 ; Make sure that we can handle multiple integer induction variables.
 ;
-define void @multi_int_induction(i32* %A, i32 %N) {
+define void @multi_int_induction(ptr %A, i32 %N) {
 ; CHECK-LABEL: @multi_int_induction(
 ; CHECK-NEXT:  for.body.lr.ph:
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[N:%.*]], -1
@@ -20,21 +20,20 @@ define void @multi_int_induction(i32* %A, i32 %N) {
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 2
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[CAST_VTC:%.*]] = trunc i64 [[N_VEC]] to i32
-; CHECK-NEXT:    [[IND_END:%.*]] = add i32 190, [[CAST_VTC]]
+; CHECK-NEXT:    [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32
+; CHECK-NEXT:    [[IND_END:%.*]] = add i32 190, [[DOTCAST]]
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 190, i32 191>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <2 x i32>*
-; CHECK-NEXT:    store <2 x i32> [[VEC_IND]], <2 x i32>* [[TMP6]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0
+; CHECK-NEXT:    store <2 x i32> [[VEC_IND]], ptr [[TMP5]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], <i32 2, i32 2>
-; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -45,8 +44,8 @@ define void @multi_int_induction(i32* %A, i32 %N) {
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
 ; CHECK-NEXT:    [[COUNT_09:%.*]] = phi i32 [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    store i32 [[COUNT_09]], i32* [[ARRAYIDX2]], align 4
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    store i32 [[COUNT_09]], ptr [[ARRAYIDX2]], align 4
 ; CHECK-NEXT:    [[INC]] = add nsw i32 [[COUNT_09]], 1
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT:    [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
@@ -64,19 +63,18 @@ define void @multi_int_induction(i32* %A, i32 %N) {
 ; IND-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; IND:       vector.ph:
 ; IND-NEXT:    [[N_VEC:%.*]] = and i64 [[TMP2]], -2
-; IND-NEXT:    [[CAST_VTC:%.*]] = trunc i64 [[N_VEC]] to i32
-; IND-NEXT:    [[IND_END:%.*]] = add i32 [[CAST_VTC]], 190
+; IND-NEXT:    [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32
+; IND-NEXT:    [[IND_END:%.*]] = add i32 [[DOTCAST]], 190
 ; IND-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; IND:       vector.body:
 ; IND-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; IND-NEXT:    [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 190, i32 191>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; IND-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]]
-; IND-NEXT:    [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <2 x i32>*
-; IND-NEXT:    store <2 x i32> [[VEC_IND]], <2 x i32>* [[TMP4]], align 4
+; IND-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]]
+; IND-NEXT:    store <2 x i32> [[VEC_IND]], ptr [[TMP3]], align 4
 ; IND-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; IND-NEXT:    [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], <i32 2, i32 2>
-; IND-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; IND-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; IND-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IND-NEXT:    br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; IND:       middle.block:
 ; IND-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
 ; IND-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -87,8 +85,8 @@ define void @multi_int_induction(i32* %A, i32 %N) {
 ; IND:       for.body:
 ; IND-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
 ; IND-NEXT:    [[COUNT_09:%.*]] = phi i32 [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
-; IND-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
-; IND-NEXT:    store i32 [[COUNT_09]], i32* [[ARRAYIDX2]], align 4
+; IND-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]]
+; IND-NEXT:    store i32 [[COUNT_09]], ptr [[ARRAYIDX2]], align 4
 ; IND-NEXT:    [[INC]] = add nsw i32 [[COUNT_09]], 1
 ; IND-NEXT:    [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1
 ; IND-NEXT:    [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
@@ -106,23 +104,21 @@ define void @multi_int_induction(i32* %A, i32 %N) {
 ; UNROLL-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; UNROLL:       vector.ph:
 ; UNROLL-NEXT:    [[N_VEC:%.*]] = and i64 [[TMP2]], -4
-; UNROLL-NEXT:    [[CAST_VTC:%.*]] = trunc i64 [[N_VEC]] to i32
-; UNROLL-NEXT:    [[IND_END:%.*]] = add i32 [[CAST_VTC]], 190
+; UNROLL-NEXT:    [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32
+; UNROLL-NEXT:    [[IND_END:%.*]] = add i32 [[DOTCAST]], 190
 ; UNROLL-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; UNROLL:       vector.body:
 ; UNROLL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; UNROLL-NEXT:    [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 190, i32 191>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; UNROLL-NEXT:    [[STEP_ADD:%.*]] = add <2 x i32> [[VEC_IND]], <i32 2, i32 2>
-; UNROLL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]]
-; UNROLL-NEXT:    [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <2 x i32>*
-; UNROLL-NEXT:    store <2 x i32> [[VEC_IND]], <2 x i32>* [[TMP4]], align 4
-; UNROLL-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP3]], i64 2
-; UNROLL-NEXT:    [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <2 x i32>*
-; UNROLL-NEXT:    store <2 x i32> [[STEP_ADD]], <2 x i32>* [[TMP6]], align 4
+; UNROLL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]]
+; UNROLL-NEXT:    store <2 x i32> [[VEC_IND]], ptr [[TMP3]], align 4
+; UNROLL-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i64 2
+; UNROLL-NEXT:    store <2 x i32> [[STEP_ADD]], ptr [[TMP4]], align 4
 ; UNROLL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; UNROLL-NEXT:    [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], <i32 4, i32 4>
-; UNROLL-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; UNROLL-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; UNROLL-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; UNROLL-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; UNROLL:       middle.block:
 ; UNROLL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
 ; UNROLL-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -133,8 +129,8 @@ define void @multi_int_induction(i32* %A, i32 %N) {
 ; UNROLL:       for.body:
 ; UNROLL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
 ; UNROLL-NEXT:    [[COUNT_09:%.*]] = phi i32 [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
-; UNROLL-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
-; UNROLL-NEXT:    store i32 [[COUNT_09]], i32* [[ARRAYIDX2]], align 4
+; UNROLL-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]]
+; UNROLL-NEXT:    store i32 [[COUNT_09]], ptr [[ARRAYIDX2]], align 4
 ; UNROLL-NEXT:    [[INC]] = add nsw i32 [[COUNT_09]], 1
 ; UNROLL-NEXT:    [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1
 ; UNROLL-NEXT:    [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
@@ -153,8 +149,8 @@ define void @multi_int_induction(i32* %A, i32 %N) {
 ; UNROLL-NO-IC:       vector.ph:
 ; UNROLL-NO-IC-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 4
 ; UNROLL-NO-IC-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]]
-; UNROLL-NO-IC-NEXT:    [[CAST_VTC:%.*]] = trunc i64 [[N_VEC]] to i32
-; UNROLL-NO-IC-NEXT:    [[IND_END:%.*]] = add i32 190, [[CAST_VTC]]
+; UNROLL-NO-IC-NEXT:    [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32
+; UNROLL-NO-IC-NEXT:    [[IND_END:%.*]] = add i32 190, [[DOTCAST]]
 ; UNROLL-NO-IC-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; UNROLL-NO-IC:       vector.body:
 ; UNROLL-NO-IC-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -162,18 +158,16 @@ define void @multi_int_induction(i32* %A, i32 %N) {
 ; UNROLL-NO-IC-NEXT:    [[STEP_ADD:%.*]] = add <2 x i32> [[VEC_IND]], <i32 2, i32 2>
 ; UNROLL-NO-IC-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 0
 ; UNROLL-NO-IC-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 2
-; UNROLL-NO-IC-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP3]]
-; UNROLL-NO-IC-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP4]]
-; UNROLL-NO-IC-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i32 0
-; UNROLL-NO-IC-NEXT:    [[TMP8:%.*]] = bitcast i32* [[TMP7]] to <2 x i32>*
-; UNROLL-NO-IC-NEXT:    store <2 x i32> [[VEC_IND]], <2 x i32>* [[TMP8]], align 4
-; UNROLL-NO-IC-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i32 2
-; UNROLL-NO-IC-NEXT:    [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <2 x i32>*
-; UNROLL-NO-IC-NEXT:    store <2 x i32> [[STEP_ADD]], <2 x i32>* [[TMP10]], align 4
+; UNROLL-NO-IC-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP3]]
+; UNROLL-NO-IC-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP4]]
+; UNROLL-NO-IC-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0
+; UNROLL-NO-IC-NEXT:    store <2 x i32> [[VEC_IND]], ptr [[TMP7]], align 4
+; UNROLL-NO-IC-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 2
+; UNROLL-NO-IC-NEXT:    store <2 x i32> [[STEP_ADD]], ptr [[TMP8]], align 4
 ; UNROLL-NO-IC-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; UNROLL-NO-IC-NEXT:    [[VEC_IND_NEXT]] = add <2 x i32> [[STEP_ADD]], <i32 2, i32 2>
-; UNROLL-NO-IC-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; UNROLL-NO-IC-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; UNROLL-NO-IC-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; UNROLL-NO-IC-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; UNROLL-NO-IC:       middle.block:
 ; UNROLL-NO-IC-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
 ; UNROLL-NO-IC-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -184,8 +178,8 @@ define void @multi_int_induction(i32* %A, i32 %N) {
 ; UNROLL-NO-IC:       for.body:
 ; UNROLL-NO-IC-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
 ; UNROLL-NO-IC-NEXT:    [[COUNT_09:%.*]] = phi i32 [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
-; UNROLL-NO-IC-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
-; UNROLL-NO-IC-NEXT:    store i32 [[COUNT_09]], i32* [[ARRAYIDX2]], align 4
+; UNROLL-NO-IC-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]]
+; UNROLL-NO-IC-NEXT:    store i32 [[COUNT_09]], ptr [[ARRAYIDX2]], align 4
 ; UNROLL-NO-IC-NEXT:    [[INC]] = add nsw i32 [[COUNT_09]], 1
 ; UNROLL-NO-IC-NEXT:    [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1
 ; UNROLL-NO-IC-NEXT:    [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
@@ -203,23 +197,21 @@ define void @multi_int_induction(i32* %A, i32 %N) {
 ; INTERLEAVE-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; INTERLEAVE:       vector.ph:
 ; INTERLEAVE-NEXT:    [[N_VEC:%.*]] = and i64 [[TMP2]], -8
-; INTERLEAVE-NEXT:    [[CAST_VTC:%.*]] = trunc i64 [[N_VEC]] to i32
-; INTERLEAVE-NEXT:    [[IND_END:%.*]] = add i32 [[CAST_VTC]], 190
+; INTERLEAVE-NEXT:    [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32
+; INTERLEAVE-NEXT:    [[IND_END:%.*]] = add i32 [[DOTCAST]], 190
 ; INTERLEAVE-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; INTERLEAVE:       vector.body:
 ; INTERLEAVE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; INTERLEAVE-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 190, i32 191, i32 192, i32 193>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; INTERLEAVE-NEXT:    [[STEP_ADD:%.*]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; INTERLEAVE-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]]
-; INTERLEAVE-NEXT:    [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>*
-; INTERLEAVE-NEXT:    store <4 x i32> [[VEC_IND]], <4 x i32>* [[TMP4]], align 4
-; INTERLEAVE-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP3]], i64 4
-; INTERLEAVE-NEXT:    [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>*
-; INTERLEAVE-NEXT:    store <4 x i32> [[STEP_ADD]], <4 x i32>* [[TMP6]], align 4
+; INTERLEAVE-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]]
+; INTERLEAVE-NEXT:    store <4 x i32> [[VEC_IND]], ptr [[TMP3]], align 4
+; INTERLEAVE-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i64 4
+; INTERLEAVE-NEXT:    store <4 x i32> [[STEP_ADD]], ptr [[TMP4]], align 4
 ; INTERLEAVE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
 ; INTERLEAVE-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 8, i32 8, i32 8, i32 8>
-; INTERLEAVE-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; INTERLEAVE-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; INTERLEAVE-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; INTERLEAVE-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; INTERLEAVE:       middle.block:
 ; INTERLEAVE-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
 ; INTERLEAVE-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -230,8 +222,8 @@ define void @multi_int_induction(i32* %A, i32 %N) {
 ; INTERLEAVE:       for.body:
 ; INTERLEAVE-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
 ; INTERLEAVE-NEXT:    [[COUNT_09:%.*]] = phi i32 [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
-; INTERLEAVE-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
-; INTERLEAVE-NEXT:    store i32 [[COUNT_09]], i32* [[ARRAYIDX2]], align 4
+; INTERLEAVE-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]]
+; INTERLEAVE-NEXT:    store i32 [[COUNT_09]], ptr [[ARRAYIDX2]], align 4
 ; INTERLEAVE-NEXT:    [[INC]] = add nsw i32 [[COUNT_09]], 1
 ; INTERLEAVE-NEXT:    [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1
 ; INTERLEAVE-NEXT:    [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
@@ -246,8 +238,8 @@ for.body.lr.ph:
 for.body:
   %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ]
   %count.09 = phi i32 [ 190, %for.body.lr.ph ], [ %inc, %for.body ]
-  %arrayidx2 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
-  store i32 %count.09, i32* %arrayidx2, align 4
+  %arrayidx2 = getelementptr inbounds i32, ptr %A, i64 %indvars.iv
+  store i32 %count.09, ptr %arrayidx2, align 4
   %inc = add nsw i32 %count.09, 1
   %indvars.iv.next = add i64 %indvars.iv, 1
   %lftr.wideiv = trunc i64 %indvars.iv.next to i32
@@ -266,24 +258,23 @@ for.end:
 
 ;   Vectorized induction variable.
 
-define void @scalar_use(float* %a, float %b, i64 %offset, i64 %offset2, i64 %n) {
+define void @scalar_use(ptr %a, float %b, i64 %offset, i64 %offset2, i64 %n) {
 ; CHECK-LABEL: @scalar_use(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 2
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
 ; CHECK:       vector.memcheck:
-; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr float, float* [[A:%.*]], i64 [[OFFSET:%.*]]
-; CHECK-NEXT:    [[SCEVGEP1:%.*]] = bitcast float* [[SCEVGEP]] to i8*
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[N]], [[OFFSET]]
-; CHECK-NEXT:    [[SCEVGEP2:%.*]] = getelementptr float, float* [[A]], i64 [[TMP0]]
-; CHECK-NEXT:    [[SCEVGEP23:%.*]] = bitcast float* [[SCEVGEP2]] to i8*
-; CHECK-NEXT:    [[SCEVGEP4:%.*]] = getelementptr float, float* [[A]], i64 [[OFFSET2:%.*]]
-; CHECK-NEXT:    [[SCEVGEP45:%.*]] = bitcast float* [[SCEVGEP4]] to i8*
-; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[N]], [[OFFSET2]]
-; CHECK-NEXT:    [[SCEVGEP6:%.*]] = getelementptr float, float* [[A]], i64 [[TMP1]]
-; CHECK-NEXT:    [[SCEVGEP67:%.*]] = bitcast float* [[SCEVGEP6]] to i8*
-; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ult i8* [[SCEVGEP1]], [[SCEVGEP67]]
-; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ult i8* [[SCEVGEP45]], [[SCEVGEP23]]
+; CHECK-NEXT:    [[TMP0:%.*]] = shl i64 [[OFFSET:%.*]], 2
+; CHECK-NEXT:    [[UGLYGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[N]], 2
+; CHECK-NEXT:    [[TMP2:%.*]] = add i64 [[TMP1]], [[TMP0]]
+; CHECK-NEXT:    [[UGLYGEP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[OFFSET2:%.*]], 2
+; CHECK-NEXT:    [[UGLYGEP2:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[TMP1]], [[TMP3]]
+; CHECK-NEXT:    [[UGLYGEP3:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP4]]
+; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ult ptr [[UGLYGEP]], [[UGLYGEP3]]
+; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ult ptr [[UGLYGEP2]], [[UGLYGEP1]]
 ; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
 ; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
@@ -294,21 +285,18 @@ define void @scalar_use(float* %a, float %b, i64 %offset, i64 %offset2, i64 %n)
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[TMP2]], [[OFFSET]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast float* [[TMP5]] to <2 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x float>, <2 x float>* [[TMP6]], align 4, !alias.scope !4, !noalias !7
-; CHECK-NEXT:    [[TMP7:%.*]] = add i64 [[TMP2]], [[OFFSET2]]
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP7]]
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds float, float* [[TMP8]], i32 0
-; CHECK-NEXT:    [[TMP10:%.*]] = bitcast float* [[TMP9]] to <2 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD8:%.*]] = load <2 x float>, <2 x float>* [[TMP10]], align 4, !alias.scope !7
-; CHECK-NEXT:    [[TMP11:%.*]] = fmul fast <2 x float> [[BROADCAST_SPLAT]], [[WIDE_LOAD8]]
-; CHECK-NEXT:    [[TMP12:%.*]] = fadd fast <2 x float> [[WIDE_LOAD]], [[TMP11]]
-; CHECK-NEXT:    [[TMP13:%.*]] = bitcast float* [[TMP5]] to <2 x float>*
-; CHECK-NEXT:    store <2 x float> [[TMP12]], <2 x float>* [[TMP13]], align 4, !alias.scope !4, !noalias !7
+; CHECK-NEXT:    [[TMP5:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = add i64 [[TMP5]], [[OFFSET]]
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x float>, ptr [[TMP8]], align 4, !alias.scope !4, !noalias !7
+; CHECK-NEXT:    [[TMP9:%.*]] = add i64 [[TMP5]], [[OFFSET2]]
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP9]]
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds float, ptr [[TMP10]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD4:%.*]] = load <2 x float>, ptr [[TMP11]], align 4, !alias.scope !7
+; CHECK-NEXT:    [[TMP12:%.*]] = fmul fast <2 x float> [[BROADCAST_SPLAT]], [[WIDE_LOAD4]]
+; CHECK-NEXT:    [[TMP13:%.*]] = fadd fast <2 x float> [[WIDE_LOAD]], [[TMP12]]
+; CHECK-NEXT:    store <2 x float> [[TMP13]], ptr [[TMP8]], align 4, !alias.scope !4, !noalias !7
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
@@ -321,14 +309,14 @@ define void @scalar_use(float* %a, float %b, i64 %offset, i64 %offset2, i64 %n)
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
 ; CHECK-NEXT:    [[IND_SUM:%.*]] = add i64 [[IV]], [[OFFSET]]
-; CHECK-NEXT:    [[ARR_IDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IND_SUM]]
-; CHECK-NEXT:    [[L1:%.*]] = load float, float* [[ARR_IDX]], align 4
+; CHECK-NEXT:    [[ARR_IDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IND_SUM]]
+; CHECK-NEXT:    [[L1:%.*]] = load float, ptr [[ARR_IDX]], align 4
 ; CHECK-NEXT:    [[IND_SUM2:%.*]] = add i64 [[IV]], [[OFFSET2]]
-; CHECK-NEXT:    [[ARR_IDX2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IND_SUM2]]
-; CHECK-NEXT:    [[L2:%.*]] = load float, float* [[ARR_IDX2]], align 4
+; CHECK-NEXT:    [[ARR_IDX2:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IND_SUM2]]
+; CHECK-NEXT:    [[L2:%.*]] = load float, ptr [[ARR_IDX2]], align 4
 ; CHECK-NEXT:    [[M:%.*]] = fmul fast float [[B]], [[L2]]
 ; CHECK-NEXT:    [[AD:%.*]] = fadd fast float [[L1]], [[M]]
-; CHECK-NEXT:    store float [[AD]], float* [[ARR_IDX]], align 4
+; CHECK-NEXT:    store float [[AD]], ptr [[ARR_IDX]], align 4
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
@@ -340,14 +328,17 @@ define void @scalar_use(float* %a, float %b, i64 %offset, i64 %offset2, i64 %n)
 ; IND-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 2
 ; IND-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
 ; IND:       vector.memcheck:
-; IND-NEXT:    [[SCEVGEP:%.*]] = getelementptr float, float* [[A:%.*]], i64 [[OFFSET:%.*]]
-; IND-NEXT:    [[TMP0:%.*]] = add i64 [[N]], [[OFFSET]]
-; IND-NEXT:    [[SCEVGEP2:%.*]] = getelementptr float, float* [[A]], i64 [[TMP0]]
-; IND-NEXT:    [[SCEVGEP4:%.*]] = getelementptr float, float* [[A]], i64 [[OFFSET2:%.*]]
-; IND-NEXT:    [[TMP1:%.*]] = add i64 [[N]], [[OFFSET2]]
-; IND-NEXT:    [[SCEVGEP6:%.*]] = getelementptr float, float* [[A]], i64 [[TMP1]]
-; IND-NEXT:    [[BOUND0:%.*]] = icmp ult float* [[SCEVGEP]], [[SCEVGEP6]]
-; IND-NEXT:    [[BOUND1:%.*]] = icmp ult float* [[SCEVGEP4]], [[SCEVGEP2]]
+; IND-NEXT:    [[TMP0:%.*]] = shl i64 [[OFFSET:%.*]], 2
+; IND-NEXT:    [[UGLYGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 [[TMP0]]
+; IND-NEXT:    [[TMP1:%.*]] = shl i64 [[N]], 2
+; IND-NEXT:    [[TMP2:%.*]] = add i64 [[TMP1]], [[TMP0]]
+; IND-NEXT:    [[UGLYGEP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]]
+; IND-NEXT:    [[TMP3:%.*]] = shl i64 [[OFFSET2:%.*]], 2
+; IND-NEXT:    [[UGLYGEP2:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP3]]
+; IND-NEXT:    [[TMP4:%.*]] = add i64 [[TMP1]], [[TMP3]]
+; IND-NEXT:    [[UGLYGEP3:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP4]]
+; IND-NEXT:    [[BOUND0:%.*]] = icmp ult ptr [[UGLYGEP]], [[UGLYGEP3]]
+; IND-NEXT:    [[BOUND1:%.*]] = icmp ult ptr [[UGLYGEP2]], [[UGLYGEP1]]
 ; IND-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
 ; IND-NEXT:    br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
 ; IND:       vector.ph:
@@ -357,18 +348,15 @@ define void @scalar_use(float* %a, float %b, i64 %offset, i64 %offset2, i64 %n)
 ; IND-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; IND:       vector.body:
 ; IND-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; IND-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], [[OFFSET]]
-; IND-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP2]]
-; IND-NEXT:    [[TMP4:%.*]] = bitcast float* [[TMP3]] to <2 x float>*
-; IND-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x float>, <2 x float>* [[TMP4]], align 4, !alias.scope !4, !noalias !7
-; IND-NEXT:    [[TMP5:%.*]] = add i64 [[INDEX]], [[OFFSET2]]
-; IND-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP5]]
-; IND-NEXT:    [[TMP7:%.*]] = bitcast float* [[TMP6]] to <2 x float>*
-; IND-NEXT:    [[WIDE_LOAD8:%.*]] = load <2 x float>, <2 x float>* [[TMP7]], align 4, !alias.scope !7
-; IND-NEXT:    [[TMP8:%.*]] = fmul fast <2 x float> [[BROADCAST_SPLAT]], [[WIDE_LOAD8]]
-; IND-NEXT:    [[TMP9:%.*]] = fadd fast <2 x float> [[WIDE_LOAD]], [[TMP8]]
-; IND-NEXT:    [[TMP10:%.*]] = bitcast float* [[TMP3]] to <2 x float>*
-; IND-NEXT:    store <2 x float> [[TMP9]], <2 x float>* [[TMP10]], align 4, !alias.scope !4, !noalias !7
+; IND-NEXT:    [[TMP5:%.*]] = add i64 [[INDEX]], [[OFFSET]]
+; IND-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP5]]
+; IND-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x float>, ptr [[TMP6]], align 4, !alias.scope !4, !noalias !7
+; IND-NEXT:    [[TMP7:%.*]] = add i64 [[INDEX]], [[OFFSET2]]
+; IND-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP7]]
+; IND-NEXT:    [[WIDE_LOAD4:%.*]] = load <2 x float>, ptr [[TMP8]], align 4, !alias.scope !7
+; IND-NEXT:    [[TMP9:%.*]] = fmul fast <2 x float> [[BROADCAST_SPLAT]], [[WIDE_LOAD4]]
+; IND-NEXT:    [[TMP10:%.*]] = fadd fast <2 x float> [[WIDE_LOAD]], [[TMP9]]
+; IND-NEXT:    store <2 x float> [[TMP10]], ptr [[TMP6]], align 4, !alias.scope !4, !noalias !7
 ; IND-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; IND-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; IND-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
@@ -381,14 +369,14 @@ define void @scalar_use(float* %a, float %b, i64 %offset, i64 %offset2, i64 %n)
 ; IND:       for.body:
 ; IND-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
 ; IND-NEXT:    [[IND_SUM:%.*]] = add i64 [[IV]], [[OFFSET]]
-; IND-NEXT:    [[ARR_IDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IND_SUM]]
-; IND-NEXT:    [[L1:%.*]] = load float, float* [[ARR_IDX]], align 4
+; IND-NEXT:    [[ARR_IDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IND_SUM]]
+; IND-NEXT:    [[L1:%.*]] = load float, ptr [[ARR_IDX]], align 4
 ; IND-NEXT:    [[IND_SUM2:%.*]] = add i64 [[IV]], [[OFFSET2]]
-; IND-NEXT:    [[ARR_IDX2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IND_SUM2]]
-; IND-NEXT:    [[L2:%.*]] = load float, float* [[ARR_IDX2]], align 4
+; IND-NEXT:    [[ARR_IDX2:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IND_SUM2]]
+; IND-NEXT:    [[L2:%.*]] = load float, ptr [[ARR_IDX2]], align 4
 ; IND-NEXT:    [[M:%.*]] = fmul fast float [[L2]], [[B]]
 ; IND-NEXT:    [[AD:%.*]] = fadd fast float [[L1]], [[M]]
-; IND-NEXT:    store float [[AD]], float* [[ARR_IDX]], align 4
+; IND-NEXT:    store float [[AD]], ptr [[ARR_IDX]], align 4
 ; IND-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; IND-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
 ; IND-NEXT:    br i1 [[EXITCOND]], label [[LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
@@ -400,50 +388,47 @@ define void @scalar_use(float* %a, float %b, i64 %offset, i64 %offset2, i64 %n)
 ; UNROLL-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4
 ; UNROLL-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
 ; UNROLL:       vector.memcheck:
-; UNROLL-NEXT:    [[SCEVGEP:%.*]] = getelementptr float, float* [[A:%.*]], i64 [[OFFSET:%.*]]
-; UNROLL-NEXT:    [[TMP0:%.*]] = add i64 [[N]], [[OFFSET]]
-; UNROLL-NEXT:    [[SCEVGEP2:%.*]] = getelementptr float, float* [[A]], i64 [[TMP0]]
-; UNROLL-NEXT:    [[SCEVGEP4:%.*]] = getelementptr float, float* [[A]], i64 [[OFFSET2:%.*]]
-; UNROLL-NEXT:    [[TMP1:%.*]] = add i64 [[N]], [[OFFSET2]]
-; UNROLL-NEXT:    [[SCEVGEP6:%.*]] = getelementptr float, float* [[A]], i64 [[TMP1]]
-; UNROLL-NEXT:    [[BOUND0:%.*]] = icmp ult float* [[SCEVGEP]], [[SCEVGEP6]]
-; UNROLL-NEXT:    [[BOUND1:%.*]] = icmp ult float* [[SCEVGEP4]], [[SCEVGEP2]]
+; UNROLL-NEXT:    [[TMP0:%.*]] = shl i64 [[OFFSET:%.*]], 2
+; UNROLL-NEXT:    [[UGLYGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 [[TMP0]]
+; UNROLL-NEXT:    [[TMP1:%.*]] = shl i64 [[N]], 2
+; UNROLL-NEXT:    [[TMP2:%.*]] = add i64 [[TMP1]], [[TMP0]]
+; UNROLL-NEXT:    [[UGLYGEP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]]
+; UNROLL-NEXT:    [[TMP3:%.*]] = shl i64 [[OFFSET2:%.*]], 2
+; UNROLL-NEXT:    [[UGLYGEP2:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP3]]
+; UNROLL-NEXT:    [[TMP4:%.*]] = add i64 [[TMP1]], [[TMP3]]
+; UNROLL-NEXT:    [[UGLYGEP3:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP4]]
+; UNROLL-NEXT:    [[BOUND0:%.*]] = icmp ult ptr [[UGLYGEP]], [[UGLYGEP3]]
+; UNROLL-NEXT:    [[BOUND1:%.*]] = icmp ult ptr [[UGLYGEP2]], [[UGLYGEP1]]
 ; UNROLL-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
 ; UNROLL-NEXT:    br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
 ; UNROLL:       vector.ph:
 ; UNROLL-NEXT:    [[N_VEC:%.*]] = and i64 [[N]], -4
 ; UNROLL-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x float> poison, float [[B:%.*]], i64 0
 ; UNROLL-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x float> [[BROADCAST_SPLATINSERT]], <2 x float> poison, <2 x i32> zeroinitializer
-; UNROLL-NEXT:    [[BROADCAST_SPLATINSERT11:%.*]] = insertelement <2 x float> poison, float [[B]], i64 0
-; UNROLL-NEXT:    [[BROADCAST_SPLAT12:%.*]] = shufflevector <2 x float> [[BROADCAST_SPLATINSERT11]], <2 x float> poison, <2 x i32> zeroinitializer
+; UNROLL-NEXT:    [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <2 x float> poison, float [[B]], i64 0
+; UNROLL-NEXT:    [[BROADCAST_SPLAT8:%.*]] = shufflevector <2 x float> [[BROADCAST_SPLATINSERT7]], <2 x float> poison, <2 x i32> zeroinitializer
 ; UNROLL-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; UNROLL:       vector.body:
 ; UNROLL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; UNROLL-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], [[OFFSET]]
-; UNROLL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP2]]
-; UNROLL-NEXT:    [[TMP4:%.*]] = bitcast float* [[TMP3]] to <2 x float>*
-; UNROLL-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x float>, <2 x float>* [[TMP4]], align 4, !alias.scope !4, !noalias !7
-; UNROLL-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP3]], i64 2
-; UNROLL-NEXT:    [[TMP6:%.*]] = bitcast float* [[TMP5]] to <2 x float>*
-; UNROLL-NEXT:    [[WIDE_LOAD8:%.*]] = load <2 x float>, <2 x float>* [[TMP6]], align 4, !alias.scope !4, !noalias !7
-; UNROLL-NEXT:    [[TMP7:%.*]] = add i64 [[INDEX]], [[OFFSET2]]
-; UNROLL-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP7]]
-; UNROLL-NEXT:    [[TMP9:%.*]] = bitcast float* [[TMP8]] to <2 x float>*
-; UNROLL-NEXT:    [[WIDE_LOAD9:%.*]] = load <2 x float>, <2 x float>* [[TMP9]], align 4, !alias.scope !7
-; UNROLL-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP8]], i64 2
-; UNROLL-NEXT:    [[TMP11:%.*]] = bitcast float* [[TMP10]] to <2 x float>*
-; UNROLL-NEXT:    [[WIDE_LOAD10:%.*]] = load <2 x float>, <2 x float>* [[TMP11]], align 4, !alias.scope !7
-; UNROLL-NEXT:    [[TMP12:%.*]] = fmul fast <2 x float> [[BROADCAST_SPLAT]], [[WIDE_LOAD9]]
-; UNROLL-NEXT:    [[TMP13:%.*]] = fmul fast <2 x float> [[BROADCAST_SPLAT12]], [[WIDE_LOAD10]]
-; UNROLL-NEXT:    [[TMP14:%.*]] = fadd fast <2 x float> [[WIDE_LOAD]], [[TMP12]]
-; UNROLL-NEXT:    [[TMP15:%.*]] = fadd fast <2 x float> [[WIDE_LOAD8]], [[TMP13]]
-; UNROLL-NEXT:    [[TMP16:%.*]] = bitcast float* [[TMP3]] to <2 x float>*
-; UNROLL-NEXT:    store <2 x float> [[TMP14]], <2 x float>* [[TMP16]], align 4, !alias.scope !4, !noalias !7
-; UNROLL-NEXT:    [[TMP17:%.*]] = bitcast float* [[TMP5]] to <2 x float>*
-; UNROLL-NEXT:    store <2 x float> [[TMP15]], <2 x float>* [[TMP17]], align 4, !alias.scope !4, !noalias !7
+; UNROLL-NEXT:    [[TMP5:%.*]] = add i64 [[INDEX]], [[OFFSET]]
+; UNROLL-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP5]]
+; UNROLL-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x float>, ptr [[TMP6]], align 4, !alias.scope !4, !noalias !7
+; UNROLL-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i64 2
+; UNROLL-NEXT:    [[WIDE_LOAD4:%.*]] = load <2 x float>, ptr [[TMP7]], align 4, !alias.scope !4, !noalias !7
+; UNROLL-NEXT:    [[TMP8:%.*]] = add i64 [[INDEX]], [[OFFSET2]]
+; UNROLL-NEXT:    [[TMP9:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP8]]
+; UNROLL-NEXT:    [[WIDE_LOAD5:%.*]] = load <2 x float>, ptr [[TMP9]], align 4, !alias.scope !7
+; UNROLL-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, ptr [[TMP9]], i64 2
+; UNROLL-NEXT:    [[WIDE_LOAD6:%.*]] = load <2 x float>, ptr [[TMP10]], align 4, !alias.scope !7
+; UNROLL-NEXT:    [[TMP11:%.*]] = fmul fast <2 x float> [[BROADCAST_SPLAT]], [[WIDE_LOAD5]]
+; UNROLL-NEXT:    [[TMP12:%.*]] = fmul fast <2 x float> [[BROADCAST_SPLAT8]], [[WIDE_LOAD6]]
+; UNROLL-NEXT:    [[TMP13:%.*]] = fadd fast <2 x float> [[WIDE_LOAD]], [[TMP11]]
+; UNROLL-NEXT:    [[TMP14:%.*]] = fadd fast <2 x float> [[WIDE_LOAD4]], [[TMP12]]
+; UNROLL-NEXT:    store <2 x float> [[TMP13]], ptr [[TMP6]], align 4, !alias.scope !4, !noalias !7
+; UNROLL-NEXT:    store <2 x float> [[TMP14]], ptr [[TMP7]], align 4, !alias.scope !4, !noalias !7
 ; UNROLL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; UNROLL-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; UNROLL-NEXT:    br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; UNROLL-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; UNROLL-NEXT:    br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
 ; UNROLL:       middle.block:
 ; UNROLL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[N]]
 ; UNROLL-NEXT:    br i1 [[CMP_N]], label [[LOOPEXIT:%.*]], label [[SCALAR_PH]]
@@ -453,14 +438,14 @@ define void @scalar_use(float* %a, float %b, i64 %offset, i64 %offset2, i64 %n)
 ; UNROLL:       for.body:
 ; UNROLL-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
 ; UNROLL-NEXT:    [[IND_SUM:%.*]] = add i64 [[IV]], [[OFFSET]]
-; UNROLL-NEXT:    [[ARR_IDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IND_SUM]]
-; UNROLL-NEXT:    [[L1:%.*]] = load float, float* [[ARR_IDX]], align 4
+; UNROLL-NEXT:    [[ARR_IDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IND_SUM]]
+; UNROLL-NEXT:    [[L1:%.*]] = load float, ptr [[ARR_IDX]], align 4
 ; UNROLL-NEXT:    [[IND_SUM2:%.*]] = add i64 [[IV]], [[OFFSET2]]
-; UNROLL-NEXT:    [[ARR_IDX2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IND_SUM2]]
-; UNROLL-NEXT:    [[L2:%.*]] = load float, float* [[ARR_IDX2]], align 4
+; UNROLL-NEXT:    [[ARR_IDX2:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IND_SUM2]]
+; UNROLL-NEXT:    [[L2:%.*]] = load float, ptr [[ARR_IDX2]], align 4
 ; UNROLL-NEXT:    [[M:%.*]] = fmul fast float [[L2]], [[B]]
 ; UNROLL-NEXT:    [[AD:%.*]] = fadd fast float [[L1]], [[M]]
-; UNROLL-NEXT:    store float [[AD]], float* [[ARR_IDX]], align 4
+; UNROLL-NEXT:    store float [[AD]], ptr [[ARR_IDX]], align 4
 ; UNROLL-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; UNROLL-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
 ; UNROLL-NEXT:    br i1 [[EXITCOND]], label [[LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
@@ -472,18 +457,17 @@ define void @scalar_use(float* %a, float %b, i64 %offset, i64 %offset2, i64 %n)
 ; UNROLL-NO-IC-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4
 ; UNROLL-NO-IC-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
 ; UNROLL-NO-IC:       vector.memcheck:
-; UNROLL-NO-IC-NEXT:    [[SCEVGEP:%.*]] = getelementptr float, float* [[A:%.*]], i64 [[OFFSET:%.*]]
-; UNROLL-NO-IC-NEXT:    [[SCEVGEP1:%.*]] = bitcast float* [[SCEVGEP]] to i8*
-; UNROLL-NO-IC-NEXT:    [[TMP0:%.*]] = add i64 [[N]], [[OFFSET]]
-; UNROLL-NO-IC-NEXT:    [[SCEVGEP2:%.*]] = getelementptr float, float* [[A]], i64 [[TMP0]]
-; UNROLL-NO-IC-NEXT:    [[SCEVGEP23:%.*]] = bitcast float* [[SCEVGEP2]] to i8*
-; UNROLL-NO-IC-NEXT:    [[SCEVGEP4:%.*]] = getelementptr float, float* [[A]], i64 [[OFFSET2:%.*]]
-; UNROLL-NO-IC-NEXT:    [[SCEVGEP45:%.*]] = bitcast float* [[SCEVGEP4]] to i8*
-; UNROLL-NO-IC-NEXT:    [[TMP1:%.*]] = add i64 [[N]], [[OFFSET2]]
-; UNROLL-NO-IC-NEXT:    [[SCEVGEP6:%.*]] = getelementptr float, float* [[A]], i64 [[TMP1]]
-; UNROLL-NO-IC-NEXT:    [[SCEVGEP67:%.*]] = bitcast float* [[SCEVGEP6]] to i8*
-; UNROLL-NO-IC-NEXT:    [[BOUND0:%.*]] = icmp ult i8* [[SCEVGEP1]], [[SCEVGEP67]]
-; UNROLL-NO-IC-NEXT:    [[BOUND1:%.*]] = icmp ult i8* [[SCEVGEP45]], [[SCEVGEP23]]
+; UNROLL-NO-IC-NEXT:    [[TMP0:%.*]] = shl i64 [[OFFSET:%.*]], 2
+; UNROLL-NO-IC-NEXT:    [[UGLYGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 [[TMP0]]
+; UNROLL-NO-IC-NEXT:    [[TMP1:%.*]] = shl i64 [[N]], 2
+; UNROLL-NO-IC-NEXT:    [[TMP2:%.*]] = add i64 [[TMP1]], [[TMP0]]
+; UNROLL-NO-IC-NEXT:    [[UGLYGEP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]]
+; UNROLL-NO-IC-NEXT:    [[TMP3:%.*]] = shl i64 [[OFFSET2:%.*]], 2
+; UNROLL-NO-IC-NEXT:    [[UGLYGEP2:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP3]]
+; UNROLL-NO-IC-NEXT:    [[TMP4:%.*]] = add i64 [[TMP1]], [[TMP3]]
+; UNROLL-NO-IC-NEXT:    [[UGLYGEP3:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP4]]
+; UNROLL-NO-IC-NEXT:    [[BOUND0:%.*]] = icmp ult ptr [[UGLYGEP]], [[UGLYGEP3]]
+; UNROLL-NO-IC-NEXT:    [[BOUND1:%.*]] = icmp ult ptr [[UGLYGEP2]], [[UGLYGEP1]]
 ; UNROLL-NO-IC-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
 ; UNROLL-NO-IC-NEXT:    br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
 ; UNROLL-NO-IC:       vector.ph:
@@ -491,44 +475,38 @@ define void @scalar_use(float* %a, float %b, i64 %offset, i64 %offset2, i64 %n)
 ; UNROLL-NO-IC-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
 ; UNROLL-NO-IC-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x float> poison, float [[B:%.*]], i32 0
 ; UNROLL-NO-IC-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x float> [[BROADCAST_SPLATINSERT]], <2 x float> poison, <2 x i32> zeroinitializer
-; UNROLL-NO-IC-NEXT:    [[BROADCAST_SPLATINSERT11:%.*]] = insertelement <2 x float> poison, float [[B]], i32 0
-; UNROLL-NO-IC-NEXT:    [[BROADCAST_SPLAT12:%.*]] = shufflevector <2 x float> [[BROADCAST_SPLATINSERT11]], <2 x float> poison, <2 x i32> zeroinitializer
+; UNROLL-NO-IC-NEXT:    [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <2 x float> poison, float [[B]], i32 0
+; UNROLL-NO-IC-NEXT:    [[BROADCAST_SPLAT8:%.*]] = shufflevector <2 x float> [[BROADCAST_SPLATINSERT7]], <2 x float> poison, <2 x i32> zeroinitializer
 ; UNROLL-NO-IC-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; UNROLL-NO-IC:       vector.body:
 ; UNROLL-NO-IC-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; UNROLL-NO-IC-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 0
-; UNROLL-NO-IC-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 2
-; UNROLL-NO-IC-NEXT:    [[TMP4:%.*]] = add i64 [[TMP2]], [[OFFSET]]
-; UNROLL-NO-IC-NEXT:    [[TMP5:%.*]] = add i64 [[TMP3]], [[OFFSET]]
-; UNROLL-NO-IC-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP4]]
-; UNROLL-NO-IC-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP5]]
-; UNROLL-NO-IC-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP6]], i32 0
-; UNROLL-NO-IC-NEXT:    [[TMP9:%.*]] = bitcast float* [[TMP8]] to <2 x float>*
-; UNROLL-NO-IC-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x float>, <2 x float>* [[TMP9]], align 4, !alias.scope !4, !noalias !7
-; UNROLL-NO-IC-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP6]], i32 2
-; UNROLL-NO-IC-NEXT:    [[TMP11:%.*]] = bitcast float* [[TMP10]] to <2 x float>*
-; UNROLL-NO-IC-NEXT:    [[WIDE_LOAD8:%.*]] = load <2 x float>, <2 x float>* [[TMP11]], align 4, !alias.scope !4, !noalias !7
-; UNROLL-NO-IC-NEXT:    [[TMP12:%.*]] = add i64 [[TMP2]], [[OFFSET2]]
-; UNROLL-NO-IC-NEXT:    [[TMP13:%.*]] = add i64 [[TMP3]], [[OFFSET2]]
-; UNROLL-NO-IC-NEXT:    [[TMP14:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP12]]
-; UNROLL-NO-IC-NEXT:    [[TMP15:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP13]]
-; UNROLL-NO-IC-NEXT:    [[TMP16:%.*]] = getelementptr inbounds float, float* [[TMP14]], i32 0
-; UNROLL-NO-IC-NEXT:    [[TMP17:%.*]] = bitcast float* [[TMP16]] to <2 x float>*
-; UNROLL-NO-IC-NEXT:    [[WIDE_LOAD9:%.*]] = load <2 x float>, <2 x float>* [[TMP17]], align 4, !alias.scope !7
-; UNROLL-NO-IC-NEXT:    [[TMP18:%.*]] = getelementptr inbounds float, float* [[TMP14]], i32 2
-; UNROLL-NO-IC-NEXT:    [[TMP19:%.*]] = bitcast float* [[TMP18]] to <2 x float>*
-; UNROLL-NO-IC-NEXT:    [[WIDE_LOAD10:%.*]] = load <2 x float>, <2 x float>* [[TMP19]], align 4, !alias.scope !7
-; UNROLL-NO-IC-NEXT:    [[TMP20:%.*]] = fmul fast <2 x float> [[BROADCAST_SPLAT]], [[WIDE_LOAD9]]
-; UNROLL-NO-IC-NEXT:    [[TMP21:%.*]] = fmul fast <2 x float> [[BROADCAST_SPLAT12]], [[WIDE_LOAD10]]
-; UNROLL-NO-IC-NEXT:    [[TMP22:%.*]] = fadd fast <2 x float> [[WIDE_LOAD]], [[TMP20]]
-; UNROLL-NO-IC-NEXT:    [[TMP23:%.*]] = fadd fast <2 x float> [[WIDE_LOAD8]], [[TMP21]]
-; UNROLL-NO-IC-NEXT:    [[TMP24:%.*]] = bitcast float* [[TMP8]] to <2 x float>*
-; UNROLL-NO-IC-NEXT:    store <2 x float> [[TMP22]], <2 x float>* [[TMP24]], align 4, !alias.scope !4, !noalias !7
-; UNROLL-NO-IC-NEXT:    [[TMP25:%.*]] = bitcast float* [[TMP10]] to <2 x float>*
-; UNROLL-NO-IC-NEXT:    store <2 x float> [[TMP23]], <2 x float>* [[TMP25]], align 4, !alias.scope !4, !noalias !7
+; UNROLL-NO-IC-NEXT:    [[TMP5:%.*]] = add i64 [[INDEX]], 0
+; UNROLL-NO-IC-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 2
+; UNROLL-NO-IC-NEXT:    [[TMP7:%.*]] = add i64 [[TMP5]], [[OFFSET]]
+; UNROLL-NO-IC-NEXT:    [[TMP8:%.*]] = add i64 [[TMP6]], [[OFFSET]]
+; UNROLL-NO-IC-NEXT:    [[TMP9:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP7]]
+; UNROLL-NO-IC-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP8]]
+; UNROLL-NO-IC-NEXT:    [[TMP11:%.*]] = getelementptr inbounds float, ptr [[TMP9]], i32 0
+; UNROLL-NO-IC-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x float>, ptr [[TMP11]], align 4, !alias.scope !4, !noalias !7
+; UNROLL-NO-IC-NEXT:    [[TMP12:%.*]] = getelementptr inbounds float, ptr [[TMP9]], i32 2
+; UNROLL-NO-IC-NEXT:    [[WIDE_LOAD4:%.*]] = load <2 x float>, ptr [[TMP12]], align 4, !alias.scope !4, !noalias !7
+; UNROLL-NO-IC-NEXT:    [[TMP13:%.*]] = add i64 [[TMP5]], [[OFFSET2]]
+; UNROLL-NO-IC-NEXT:    [[TMP14:%.*]] = add i64 [[TMP6]], [[OFFSET2]]
+; UNROLL-NO-IC-NEXT:    [[TMP15:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP13]]
+; UNROLL-NO-IC-NEXT:    [[TMP16:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP14]]
+; UNROLL-NO-IC-NEXT:    [[TMP17:%.*]] = getelementptr inbounds float, ptr [[TMP15]], i32 0
+; UNROLL-NO-IC-NEXT:    [[WIDE_LOAD5:%.*]] = load <2 x float>, ptr [[TMP17]], align 4, !alias.scope !7
+; UNROLL-NO-IC-NEXT:    [[TMP18:%.*]] = getelementptr inbounds float, ptr [[TMP15]], i32 2
+; UNROLL-NO-IC-NEXT:    [[WIDE_LOAD6:%.*]] = load <2 x float>, ptr [[TMP18]], align 4, !alias.scope !7
+; UNROLL-NO-IC-NEXT:    [[TMP19:%.*]] = fmul fast <2 x float> [[BROADCAST_SPLAT]], [[WIDE_LOAD5]]
+; UNROLL-NO-IC-NEXT:    [[TMP20:%.*]] = fmul fast <2 x float> [[BROADCAST_SPLAT8]], [[WIDE_LOAD6]]
+; UNROLL-NO-IC-NEXT:    [[TMP21:%.*]] = fadd fast <2 x float> [[WIDE_LOAD]], [[TMP19]]
+; UNROLL-NO-IC-NEXT:    [[TMP22:%.*]] = fadd fast <2 x float> [[WIDE_LOAD4]], [[TMP20]]
+; UNROLL-NO-IC-NEXT:    store <2 x float> [[TMP21]], ptr [[TMP11]], align 4, !alias.scope !4, !noalias !7
+; UNROLL-NO-IC-NEXT:    store <2 x float> [[TMP22]], ptr [[TMP12]], align 4, !alias.scope !4, !noalias !7
 ; UNROLL-NO-IC-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; UNROLL-NO-IC-NEXT:    [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; UNROLL-NO-IC-NEXT:    br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; UNROLL-NO-IC-NEXT:    [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; UNROLL-NO-IC-NEXT:    br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
 ; UNROLL-NO-IC:       middle.block:
 ; UNROLL-NO-IC-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; UNROLL-NO-IC-NEXT:    br i1 [[CMP_N]], label [[LOOPEXIT:%.*]], label [[SCALAR_PH]]
@@ -538,14 +516,14 @@ define void @scalar_use(float* %a, float %b, i64 %offset, i64 %offset2, i64 %n)
 ; UNROLL-NO-IC:       for.body:
 ; UNROLL-NO-IC-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
 ; UNROLL-NO-IC-NEXT:    [[IND_SUM:%.*]] = add i64 [[IV]], [[OFFSET]]
-; UNROLL-NO-IC-NEXT:    [[ARR_IDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IND_SUM]]
-; UNROLL-NO-IC-NEXT:    [[L1:%.*]] = load float, float* [[ARR_IDX]], align 4
+; UNROLL-NO-IC-NEXT:    [[ARR_IDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IND_SUM]]
+; UNROLL-NO-IC-NEXT:    [[L1:%.*]] = load float, ptr [[ARR_IDX]], align 4
 ; UNROLL-NO-IC-NEXT:    [[IND_SUM2:%.*]] = add i64 [[IV]], [[OFFSET2]]
-; UNROLL-NO-IC-NEXT:    [[ARR_IDX2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IND_SUM2]]
-; UNROLL-NO-IC-NEXT:    [[L2:%.*]] = load float, float* [[ARR_IDX2]], align 4
+; UNROLL-NO-IC-NEXT:    [[ARR_IDX2:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IND_SUM2]]
+; UNROLL-NO-IC-NEXT:    [[L2:%.*]] = load float, ptr [[ARR_IDX2]], align 4
 ; UNROLL-NO-IC-NEXT:    [[M:%.*]] = fmul fast float [[B]], [[L2]]
 ; UNROLL-NO-IC-NEXT:    [[AD:%.*]] = fadd fast float [[L1]], [[M]]
-; UNROLL-NO-IC-NEXT:    store float [[AD]], float* [[ARR_IDX]], align 4
+; UNROLL-NO-IC-NEXT:    store float [[AD]], ptr [[ARR_IDX]], align 4
 ; UNROLL-NO-IC-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; UNROLL-NO-IC-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
 ; UNROLL-NO-IC-NEXT:    br i1 [[EXITCOND]], label [[LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
@@ -557,50 +535,47 @@ define void @scalar_use(float* %a, float %b, i64 %offset, i64 %offset2, i64 %n)
 ; INTERLEAVE-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 8
 ; INTERLEAVE-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
 ; INTERLEAVE:       vector.memcheck:
-; INTERLEAVE-NEXT:    [[SCEVGEP:%.*]] = getelementptr float, float* [[A:%.*]], i64 [[OFFSET:%.*]]
-; INTERLEAVE-NEXT:    [[TMP0:%.*]] = add i64 [[N]], [[OFFSET]]
-; INTERLEAVE-NEXT:    [[SCEVGEP2:%.*]] = getelementptr float, float* [[A]], i64 [[TMP0]]
-; INTERLEAVE-NEXT:    [[SCEVGEP4:%.*]] = getelementptr float, float* [[A]], i64 [[OFFSET2:%.*]]
-; INTERLEAVE-NEXT:    [[TMP1:%.*]] = add i64 [[N]], [[OFFSET2]]
-; INTERLEAVE-NEXT:    [[SCEVGEP6:%.*]] = getelementptr float, float* [[A]], i64 [[TMP1]]
-; INTERLEAVE-NEXT:    [[BOUND0:%.*]] = icmp ult float* [[SCEVGEP]], [[SCEVGEP6]]
-; INTERLEAVE-NEXT:    [[BOUND1:%.*]] = icmp ult float* [[SCEVGEP4]], [[SCEVGEP2]]
+; INTERLEAVE-NEXT:    [[TMP0:%.*]] = shl i64 [[OFFSET:%.*]], 2
+; INTERLEAVE-NEXT:    [[UGLYGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 [[TMP0]]
+; INTERLEAVE-NEXT:    [[TMP1:%.*]] = shl i64 [[N]], 2
+; INTERLEAVE-NEXT:    [[TMP2:%.*]] = add i64 [[TMP1]], [[TMP0]]
+; INTERLEAVE-NEXT:    [[UGLYGEP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]]
+; INTERLEAVE-NEXT:    [[TMP3:%.*]] = shl i64 [[OFFSET2:%.*]], 2
+; INTERLEAVE-NEXT:    [[UGLYGEP2:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP3]]
+; INTERLEAVE-NEXT:    [[TMP4:%.*]] = add i64 [[TMP1]], [[TMP3]]
+; INTERLEAVE-NEXT:    [[UGLYGEP3:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP4]]
+; INTERLEAVE-NEXT:    [[BOUND0:%.*]] = icmp ult ptr [[UGLYGEP]], [[UGLYGEP3]]
+; INTERLEAVE-NEXT:    [[BOUND1:%.*]] = icmp ult ptr [[UGLYGEP2]], [[UGLYGEP1]]
 ; INTERLEAVE-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
 ; INTERLEAVE-NEXT:    br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
 ; INTERLEAVE:       vector.ph:
 ; INTERLEAVE-NEXT:    [[N_VEC:%.*]] = and i64 [[N]], -8
 ; INTERLEAVE-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[B:%.*]], i64 0
 ; INTERLEAVE-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
-; INTERLEAVE-NEXT:    [[BROADCAST_SPLATINSERT11:%.*]] = insertelement <4 x float> poison, float [[B]], i64 0
-; INTERLEAVE-NEXT:    [[BROADCAST_SPLAT12:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT11]], <4 x float> poison, <4 x i32> zeroinitializer
+; INTERLEAVE-NEXT:    [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <4 x float> poison, float [[B]], i64 0
+; INTERLEAVE-NEXT:    [[BROADCAST_SPLAT8:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT7]], <4 x float> poison, <4 x i32> zeroinitializer
 ; INTERLEAVE-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; INTERLEAVE:       vector.body:
 ; INTERLEAVE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; INTERLEAVE-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], [[OFFSET]]
-; INTERLEAVE-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP2]]
-; INTERLEAVE-NEXT:    [[TMP4:%.*]] = bitcast float* [[TMP3]] to <4 x float>*
-; INTERLEAVE-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP4]], align 4, !alias.scope !4, !noalias !7
-; INTERLEAVE-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP3]], i64 4
-; INTERLEAVE-NEXT:    [[TMP6:%.*]] = bitcast float* [[TMP5]] to <4 x float>*
-; INTERLEAVE-NEXT:    [[WIDE_LOAD8:%.*]] = load <4 x float>, <4 x float>* [[TMP6]], align 4, !alias.scope !4, !noalias !7
-; INTERLEAVE-NEXT:    [[TMP7:%.*]] = add i64 [[INDEX]], [[OFFSET2]]
-; INTERLEAVE-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP7]]
-; INTERLEAVE-NEXT:    [[TMP9:%.*]] = bitcast float* [[TMP8]] to <4 x float>*
-; INTERLEAVE-NEXT:    [[WIDE_LOAD9:%.*]] = load <4 x float>, <4 x float>* [[TMP9]], align 4, !alias.scope !7
-; INTERLEAVE-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP8]], i64 4
-; INTERLEAVE-NEXT:    [[TMP11:%.*]] = bitcast float* [[TMP10]] to <4 x float>*
-; INTERLEAVE-NEXT:    [[WIDE_LOAD10:%.*]] = load <4 x float>, <4 x float>* [[TMP11]], align 4, !alias.scope !7
-; INTERLEAVE-NEXT:    [[TMP12:%.*]] = fmul fast <4 x float> [[BROADCAST_SPLAT]], [[WIDE_LOAD9]]
-; INTERLEAVE-NEXT:    [[TMP13:%.*]] = fmul fast <4 x float> [[BROADCAST_SPLAT12]], [[WIDE_LOAD10]]
-; INTERLEAVE-NEXT:    [[TMP14:%.*]] = fadd fast <4 x float> [[WIDE_LOAD]], [[TMP12]]
-; INTERLEAVE-NEXT:    [[TMP15:%.*]] = fadd fast <4 x float> [[WIDE_LOAD8]], [[TMP13]]
-; INTERLEAVE-NEXT:    [[TMP16:%.*]] = bitcast float* [[TMP3]] to <4 x float>*
-; INTERLEAVE-NEXT:    store <4 x float> [[TMP14]], <4 x float>* [[TMP16]], align 4, !alias.scope !4, !noalias !7
-; INTERLEAVE-NEXT:    [[TMP17:%.*]] = bitcast float* [[TMP5]] to <4 x float>*
-; INTERLEAVE-NEXT:    store <4 x float> [[TMP15]], <4 x float>* [[TMP17]], align 4, !alias.scope !4, !noalias !7
+; INTERLEAVE-NEXT:    [[TMP5:%.*]] = add i64 [[INDEX]], [[OFFSET]]
+; INTERLEAVE-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP5]]
+; INTERLEAVE-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP6]], align 4, !alias.scope !4, !noalias !7
+; INTERLEAVE-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i64 4
+; INTERLEAVE-NEXT:    [[WIDE_LOAD4:%.*]] = load <4 x float>, ptr [[TMP7]], align 4, !alias.scope !4, !noalias !7
+; INTERLEAVE-NEXT:    [[TMP8:%.*]] = add i64 [[INDEX]], [[OFFSET2]]
+; INTERLEAVE-NEXT:    [[TMP9:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP8]]
+; INTERLEAVE-NEXT:    [[WIDE_LOAD5:%.*]] = load <4 x float>, ptr [[TMP9]], align 4, !alias.scope !7
+; INTERLEAVE-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, ptr [[TMP9]], i64 4
+; INTERLEAVE-NEXT:    [[WIDE_LOAD6:%.*]] = load <4 x float>, ptr [[TMP10]], align 4, !alias.scope !7
+; INTERLEAVE-NEXT:    [[TMP11:%.*]] = fmul fast <4 x float> [[BROADCAST_SPLAT]], [[WIDE_LOAD5]]
+; INTERLEAVE-NEXT:    [[TMP12:%.*]] = fmul fast <4 x float> [[BROADCAST_SPLAT8]], [[WIDE_LOAD6]]
+; INTERLEAVE-NEXT:    [[TMP13:%.*]] = fadd fast <4 x float> [[WIDE_LOAD]], [[TMP11]]
+; INTERLEAVE-NEXT:    [[TMP14:%.*]] = fadd fast <4 x float> [[WIDE_LOAD4]], [[TMP12]]
+; INTERLEAVE-NEXT:    store <4 x float> [[TMP13]], ptr [[TMP6]], align 4, !alias.scope !4, !noalias !7
+; INTERLEAVE-NEXT:    store <4 x float> [[TMP14]], ptr [[TMP7]], align 4, !alias.scope !4, !noalias !7
 ; INTERLEAVE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; INTERLEAVE-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; INTERLEAVE-NEXT:    br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; INTERLEAVE-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; INTERLEAVE-NEXT:    br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
 ; INTERLEAVE:       middle.block:
 ; INTERLEAVE-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[N]]
 ; INTERLEAVE-NEXT:    br i1 [[CMP_N]], label [[LOOPEXIT:%.*]], label [[SCALAR_PH]]
@@ -610,14 +585,14 @@ define void @scalar_use(float* %a, float %b, i64 %offset, i64 %offset2, i64 %n)
 ; INTERLEAVE:       for.body:
 ; INTERLEAVE-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
 ; INTERLEAVE-NEXT:    [[IND_SUM:%.*]] = add i64 [[IV]], [[OFFSET]]
-; INTERLEAVE-NEXT:    [[ARR_IDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IND_SUM]]
-; INTERLEAVE-NEXT:    [[L1:%.*]] = load float, float* [[ARR_IDX]], align 4
+; INTERLEAVE-NEXT:    [[ARR_IDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IND_SUM]]
+; INTERLEAVE-NEXT:    [[L1:%.*]] = load float, ptr [[ARR_IDX]], align 4
 ; INTERLEAVE-NEXT:    [[IND_SUM2:%.*]] = add i64 [[IV]], [[OFFSET2]]
-; INTERLEAVE-NEXT:    [[ARR_IDX2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IND_SUM2]]
-; INTERLEAVE-NEXT:    [[L2:%.*]] = load float, float* [[ARR_IDX2]], align 4
+; INTERLEAVE-NEXT:    [[ARR_IDX2:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IND_SUM2]]
+; INTERLEAVE-NEXT:    [[L2:%.*]] = load float, ptr [[ARR_IDX2]], align 4
 ; INTERLEAVE-NEXT:    [[M:%.*]] = fmul fast float [[L2]], [[B]]
 ; INTERLEAVE-NEXT:    [[AD:%.*]] = fadd fast float [[L1]], [[M]]
-; INTERLEAVE-NEXT:    store float [[AD]], float* [[ARR_IDX]], align 4
+; INTERLEAVE-NEXT:    store float [[AD]], ptr [[ARR_IDX]], align 4
 ; INTERLEAVE-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; INTERLEAVE-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
 ; INTERLEAVE-NEXT:    br i1 [[EXITCOND]], label [[LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
@@ -630,14 +605,14 @@ entry:
 for.body:
   %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
   %ind.sum = add i64 %iv, %offset
-  %arr.idx = getelementptr inbounds float, float* %a, i64 %ind.sum
-  %l1 = load float, float* %arr.idx, align 4
+  %arr.idx = getelementptr inbounds float, ptr %a, i64 %ind.sum
+  %l1 = load float, ptr %arr.idx, align 4
   %ind.sum2 = add i64 %iv, %offset2
-  %arr.idx2 = getelementptr inbounds float, float* %a, i64 %ind.sum2
-  %l2 = load float, float* %arr.idx2, align 4
+  %arr.idx2 = getelementptr inbounds float, ptr %a, i64 %ind.sum2
+  %l2 = load float, ptr %arr.idx2, align 4
   %m = fmul fast float %b, %l2
   %ad = fadd fast float %l1, %m
-  store float %ad, float* %arr.idx, align 4
+  store float %ad, ptr %arr.idx, align 4
   %iv.next = add nuw nsw i64 %iv, 1
   %exitcond = icmp eq i64 %iv.next, %n
   br i1 %exitcond, label %loopexit, label %for.body
@@ -656,7 +631,7 @@ loopexit:
 ;
 ;
 
-define i64 @scalarize_induction_variable_01(i64 *%a, i64 %n) {
+define i64 @scalarize_induction_variable_01(ptr %a, i64 %n) {
 ; CHECK-LABEL: @scalarize_induction_variable_01(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1)
@@ -668,36 +643,35 @@ define i64 @scalarize_induction_variable_01(i64 *%a, i64 %n) {
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i64, i64* [[A:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, i64* [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i64* [[TMP2]] to <2 x i64>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i64>, <2 x i64>* [[TMP3]], align 8
-; CHECK-NEXT:    [[TMP4]] = add <2 x i64> [[WIDE_LOAD]], [[VEC_PHI]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP2]], align 8
+; CHECK-NEXT:    [[TMP3]] = add <2 x i64> [[WIDE_LOAD]], [[VEC_PHI]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[TMP4]])
+; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[TMP3]])
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[SMAX]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[TMP5]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[SUM:%.*]] = phi i64 [ [[TMP9:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[I]]
-; CHECK-NEXT:    [[TMP8:%.*]] = load i64, i64* [[TMP7]], align 8
-; CHECK-NEXT:    [[TMP9]] = add i64 [[TMP8]], [[SUM]]
+; CHECK-NEXT:    [[SUM:%.*]] = phi i64 [ [[TMP8:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[I]]
+; CHECK-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP6]], align 8
+; CHECK-NEXT:    [[TMP8]] = add i64 [[TMP7]], [[SUM]]
 ; CHECK-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 1
 ; CHECK-NEXT:    [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
 ; CHECK-NEXT:    br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP12:![0-9]+]]
 ; CHECK:       for.end:
-; CHECK-NEXT:    [[TMP10:%.*]] = phi i64 [ [[TMP9]], [[FOR_BODY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    ret i64 [[TMP10]]
+; CHECK-NEXT:    [[TMP9:%.*]] = phi i64 [ [[TMP8]], [[FOR_BODY]] ], [ [[TMP5]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    ret i64 [[TMP9]]
 ;
 ; IND-LABEL: @scalarize_induction_variable_01(
 ; IND-NEXT:  entry:
@@ -709,34 +683,33 @@ define i64 @scalarize_induction_variable_01(i64 *%a, i64 %n) {
 ; IND-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; IND:       vector.body:
 ; IND-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; IND-NEXT:    [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
-; IND-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i64, i64* [[A:%.*]], i64 [[INDEX]]
-; IND-NEXT:    [[TMP1:%.*]] = bitcast i64* [[TMP0]] to <2 x i64>*
-; IND-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]], align 8
-; IND-NEXT:    [[TMP2]] = add <2 x i64> [[WIDE_LOAD]], [[VEC_PHI]]
+; IND-NEXT:    [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP1:%.*]], [[VECTOR_BODY]] ]
+; IND-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
+; IND-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP0]], align 8
+; IND-NEXT:    [[TMP1]] = add <2 x i64> [[WIDE_LOAD]], [[VEC_PHI]]
 ; IND-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; IND-NEXT:    [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; IND-NEXT:    br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
+; IND-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IND-NEXT:    br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
 ; IND:       middle.block:
-; IND-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[TMP2]])
+; IND-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[TMP1]])
 ; IND-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[SMAX]], [[N_VEC]]
 ; IND-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; IND:       scalar.ph:
 ; IND-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; IND-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP4]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; IND-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP3]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
 ; IND-NEXT:    br label [[FOR_BODY:%.*]]
 ; IND:       for.body:
 ; IND-NEXT:    [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; IND-NEXT:    [[SUM:%.*]] = phi i64 [ [[TMP7:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; IND-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[I]]
-; IND-NEXT:    [[TMP6:%.*]] = load i64, i64* [[TMP5]], align 8
-; IND-NEXT:    [[TMP7]] = add i64 [[TMP6]], [[SUM]]
+; IND-NEXT:    [[SUM:%.*]] = phi i64 [ [[TMP6:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
+; IND-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[I]]
+; IND-NEXT:    [[TMP5:%.*]] = load i64, ptr [[TMP4]], align 8
+; IND-NEXT:    [[TMP6]] = add i64 [[TMP5]], [[SUM]]
 ; IND-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 1
 ; IND-NEXT:    [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
 ; IND-NEXT:    br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP12:![0-9]+]]
 ; IND:       for.end:
-; IND-NEXT:    [[TMP8:%.*]] = phi i64 [ [[TMP7]], [[FOR_BODY]] ], [ [[TMP4]], [[MIDDLE_BLOCK]] ]
-; IND-NEXT:    ret i64 [[TMP8]]
+; IND-NEXT:    [[TMP7:%.*]] = phi i64 [ [[TMP6]], [[FOR_BODY]] ], [ [[TMP3]], [[MIDDLE_BLOCK]] ]
+; IND-NEXT:    ret i64 [[TMP7]]
 ;
 ; UNROLL-LABEL: @scalarize_induction_variable_01(
 ; UNROLL-NEXT:  entry:
@@ -748,40 +721,38 @@ define i64 @scalarize_induction_variable_01(i64 *%a, i64 %n) {
 ; UNROLL-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; UNROLL:       vector.body:
 ; UNROLL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; UNROLL-NEXT:    [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
-; UNROLL-NEXT:    [[VEC_PHI1:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
-; UNROLL-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i64, i64* [[A:%.*]], i64 [[INDEX]]
-; UNROLL-NEXT:    [[TMP1:%.*]] = bitcast i64* [[TMP0]] to <2 x i64>*
-; UNROLL-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]], align 8
-; UNROLL-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, i64* [[TMP0]], i64 2
-; UNROLL-NEXT:    [[TMP3:%.*]] = bitcast i64* [[TMP2]] to <2 x i64>*
-; UNROLL-NEXT:    [[WIDE_LOAD2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP3]], align 8
-; UNROLL-NEXT:    [[TMP4]] = add <2 x i64> [[WIDE_LOAD]], [[VEC_PHI]]
-; UNROLL-NEXT:    [[TMP5]] = add <2 x i64> [[WIDE_LOAD2]], [[VEC_PHI1]]
+; UNROLL-NEXT:    [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
+; UNROLL-NEXT:    [[VEC_PHI1:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
+; UNROLL-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
+; UNROLL-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP0]], align 8
+; UNROLL-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[TMP0]], i64 2
+; UNROLL-NEXT:    [[WIDE_LOAD2:%.*]] = load <2 x i64>, ptr [[TMP1]], align 8
+; UNROLL-NEXT:    [[TMP2]] = add <2 x i64> [[WIDE_LOAD]], [[VEC_PHI]]
+; UNROLL-NEXT:    [[TMP3]] = add <2 x i64> [[WIDE_LOAD2]], [[VEC_PHI1]]
 ; UNROLL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; UNROLL-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; UNROLL-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
+; UNROLL-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; UNROLL-NEXT:    br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
 ; UNROLL:       middle.block:
-; UNROLL-NEXT:    [[BIN_RDX:%.*]] = add <2 x i64> [[TMP5]], [[TMP4]]
-; UNROLL-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[BIN_RDX]])
+; UNROLL-NEXT:    [[BIN_RDX:%.*]] = add <2 x i64> [[TMP3]], [[TMP2]]
+; UNROLL-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[BIN_RDX]])
 ; UNROLL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[SMAX]], [[N_VEC]]
 ; UNROLL-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; UNROLL:       scalar.ph:
 ; UNROLL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; UNROLL-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP7]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; UNROLL-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP5]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
 ; UNROLL-NEXT:    br label [[FOR_BODY:%.*]]
 ; UNROLL:       for.body:
 ; UNROLL-NEXT:    [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; UNROLL-NEXT:    [[SUM:%.*]] = phi i64 [ [[TMP10:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; UNROLL-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[I]]
-; UNROLL-NEXT:    [[TMP9:%.*]] = load i64, i64* [[TMP8]], align 8
-; UNROLL-NEXT:    [[TMP10]] = add i64 [[TMP9]], [[SUM]]
+; UNROLL-NEXT:    [[SUM:%.*]] = phi i64 [ [[TMP8:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
+; UNROLL-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[I]]
+; UNROLL-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP6]], align 8
+; UNROLL-NEXT:    [[TMP8]] = add i64 [[TMP7]], [[SUM]]
 ; UNROLL-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 1
 ; UNROLL-NEXT:    [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
 ; UNROLL-NEXT:    br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP12:![0-9]+]]
 ; UNROLL:       for.end:
-; UNROLL-NEXT:    [[TMP11:%.*]] = phi i64 [ [[TMP10]], [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
-; UNROLL-NEXT:    ret i64 [[TMP11]]
+; UNROLL-NEXT:    [[TMP9:%.*]] = phi i64 [ [[TMP8]], [[FOR_BODY]] ], [ [[TMP5]], [[MIDDLE_BLOCK]] ]
+; UNROLL-NEXT:    ret i64 [[TMP9]]
 ;
 ; UNROLL-NO-IC-LABEL: @scalarize_induction_variable_01(
 ; UNROLL-NO-IC-NEXT:  entry:
@@ -794,44 +765,42 @@ define i64 @scalarize_induction_variable_01(i64 *%a, i64 %n) {
 ; UNROLL-NO-IC-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; UNROLL-NO-IC:       vector.body:
 ; UNROLL-NO-IC-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; UNROLL-NO-IC-NEXT:    [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
-; UNROLL-NO-IC-NEXT:    [[VEC_PHI1:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
+; UNROLL-NO-IC-NEXT:    [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
+; UNROLL-NO-IC-NEXT:    [[VEC_PHI1:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
 ; UNROLL-NO-IC-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
 ; UNROLL-NO-IC-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 2
-; UNROLL-NO-IC-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, i64* [[A:%.*]], i64 [[TMP0]]
-; UNROLL-NO-IC-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[TMP1]]
-; UNROLL-NO-IC-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, i64* [[TMP2]], i32 0
-; UNROLL-NO-IC-NEXT:    [[TMP5:%.*]] = bitcast i64* [[TMP4]] to <2 x i64>*
-; UNROLL-NO-IC-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i64>, <2 x i64>* [[TMP5]], align 8
-; UNROLL-NO-IC-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, i64* [[TMP2]], i32 2
-; UNROLL-NO-IC-NEXT:    [[TMP7:%.*]] = bitcast i64* [[TMP6]] to <2 x i64>*
-; UNROLL-NO-IC-NEXT:    [[WIDE_LOAD2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP7]], align 8
-; UNROLL-NO-IC-NEXT:    [[TMP8]] = add <2 x i64> [[WIDE_LOAD]], [[VEC_PHI]]
-; UNROLL-NO-IC-NEXT:    [[TMP9]] = add <2 x i64> [[WIDE_LOAD2]], [[VEC_PHI1]]
+; UNROLL-NO-IC-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]]
+; UNROLL-NO-IC-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]]
+; UNROLL-NO-IC-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0
+; UNROLL-NO-IC-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP4]], align 8
+; UNROLL-NO-IC-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 2
+; UNROLL-NO-IC-NEXT:    [[WIDE_LOAD2:%.*]] = load <2 x i64>, ptr [[TMP5]], align 8
+; UNROLL-NO-IC-NEXT:    [[TMP6]] = add <2 x i64> [[WIDE_LOAD]], [[VEC_PHI]]
+; UNROLL-NO-IC-NEXT:    [[TMP7]] = add <2 x i64> [[WIDE_LOAD2]], [[VEC_PHI1]]
 ; UNROLL-NO-IC-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; UNROLL-NO-IC-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; UNROLL-NO-IC-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
+; UNROLL-NO-IC-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; UNROLL-NO-IC-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
 ; UNROLL-NO-IC:       middle.block:
-; UNROLL-NO-IC-NEXT:    [[BIN_RDX:%.*]] = add <2 x i64> [[TMP9]], [[TMP8]]
-; UNROLL-NO-IC-NEXT:    [[TMP11:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[BIN_RDX]])
+; UNROLL-NO-IC-NEXT:    [[BIN_RDX:%.*]] = add <2 x i64> [[TMP7]], [[TMP6]]
+; UNROLL-NO-IC-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[BIN_RDX]])
 ; UNROLL-NO-IC-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[SMAX]], [[N_VEC]]
 ; UNROLL-NO-IC-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; UNROLL-NO-IC:       scalar.ph:
 ; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; UNROLL-NO-IC-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ]
+; UNROLL-NO-IC-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ]
 ; UNROLL-NO-IC-NEXT:    br label [[FOR_BODY:%.*]]
 ; UNROLL-NO-IC:       for.body:
 ; UNROLL-NO-IC-NEXT:    [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; UNROLL-NO-IC-NEXT:    [[SUM:%.*]] = phi i64 [ [[TMP14:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; UNROLL-NO-IC-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[I]]
-; UNROLL-NO-IC-NEXT:    [[TMP13:%.*]] = load i64, i64* [[TMP12]], align 8
-; UNROLL-NO-IC-NEXT:    [[TMP14]] = add i64 [[TMP13]], [[SUM]]
+; UNROLL-NO-IC-NEXT:    [[SUM:%.*]] = phi i64 [ [[TMP12:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
+; UNROLL-NO-IC-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[I]]
+; UNROLL-NO-IC-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP10]], align 8
+; UNROLL-NO-IC-NEXT:    [[TMP12]] = add i64 [[TMP11]], [[SUM]]
 ; UNROLL-NO-IC-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 1
 ; UNROLL-NO-IC-NEXT:    [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
 ; UNROLL-NO-IC-NEXT:    br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP12:![0-9]+]]
 ; UNROLL-NO-IC:       for.end:
-; UNROLL-NO-IC-NEXT:    [[TMP15:%.*]] = phi i64 [ [[TMP14]], [[FOR_BODY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ]
-; UNROLL-NO-IC-NEXT:    ret i64 [[TMP15]]
+; UNROLL-NO-IC-NEXT:    [[TMP13:%.*]] = phi i64 [ [[TMP12]], [[FOR_BODY]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ]
+; UNROLL-NO-IC-NEXT:    ret i64 [[TMP13]]
 ;
 ; INTERLEAVE-LABEL: @scalarize_induction_variable_01(
 ; INTERLEAVE-NEXT:  entry:
@@ -843,40 +812,38 @@ define i64 @scalarize_induction_variable_01(i64 *%a, i64 %n) {
 ; INTERLEAVE-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; INTERLEAVE:       vector.body:
 ; INTERLEAVE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; INTERLEAVE-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
-; INTERLEAVE-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
-; INTERLEAVE-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i64, i64* [[A:%.*]], i64 [[INDEX]]
-; INTERLEAVE-NEXT:    [[TMP1:%.*]] = bitcast i64* [[TMP0]] to <4 x i64>*
-; INTERLEAVE-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i64>, <4 x i64>* [[TMP1]], align 8
-; INTERLEAVE-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, i64* [[TMP0]], i64 4
-; INTERLEAVE-NEXT:    [[TMP3:%.*]] = bitcast i64* [[TMP2]] to <4 x i64>*
-; INTERLEAVE-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x i64>, <4 x i64>* [[TMP3]], align 8
-; INTERLEAVE-NEXT:    [[TMP4]] = add <4 x i64> [[WIDE_LOAD]], [[VEC_PHI]]
-; INTERLEAVE-NEXT:    [[TMP5]] = add <4 x i64> [[WIDE_LOAD2]], [[VEC_PHI1]]
+; INTERLEAVE-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
+; INTERLEAVE-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
+; INTERLEAVE-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
+; INTERLEAVE-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP0]], align 8
+; INTERLEAVE-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[TMP0]], i64 4
+; INTERLEAVE-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 8
+; INTERLEAVE-NEXT:    [[TMP2]] = add <4 x i64> [[WIDE_LOAD]], [[VEC_PHI]]
+; INTERLEAVE-NEXT:    [[TMP3]] = add <4 x i64> [[WIDE_LOAD2]], [[VEC_PHI1]]
 ; INTERLEAVE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; INTERLEAVE-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; INTERLEAVE-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
+; INTERLEAVE-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; INTERLEAVE-NEXT:    br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
 ; INTERLEAVE:       middle.block:
-; INTERLEAVE-NEXT:    [[BIN_RDX:%.*]] = add <4 x i64> [[TMP5]], [[TMP4]]
-; INTERLEAVE-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[BIN_RDX]])
+; INTERLEAVE-NEXT:    [[BIN_RDX:%.*]] = add <4 x i64> [[TMP3]], [[TMP2]]
+; INTERLEAVE-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[BIN_RDX]])
 ; INTERLEAVE-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[SMAX]], [[N_VEC]]
 ; INTERLEAVE-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; INTERLEAVE:       scalar.ph:
 ; INTERLEAVE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; INTERLEAVE-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP7]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; INTERLEAVE-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP5]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
 ; INTERLEAVE-NEXT:    br label [[FOR_BODY:%.*]]
 ; INTERLEAVE:       for.body:
 ; INTERLEAVE-NEXT:    [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; INTERLEAVE-NEXT:    [[SUM:%.*]] = phi i64 [ [[TMP10:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; INTERLEAVE-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[I]]
-; INTERLEAVE-NEXT:    [[TMP9:%.*]] = load i64, i64* [[TMP8]], align 8
-; INTERLEAVE-NEXT:    [[TMP10]] = add i64 [[TMP9]], [[SUM]]
+; INTERLEAVE-NEXT:    [[SUM:%.*]] = phi i64 [ [[TMP8:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
+; INTERLEAVE-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[I]]
+; INTERLEAVE-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP6]], align 8
+; INTERLEAVE-NEXT:    [[TMP8]] = add i64 [[TMP7]], [[SUM]]
 ; INTERLEAVE-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 1
 ; INTERLEAVE-NEXT:    [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
 ; INTERLEAVE-NEXT:    br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP12:![0-9]+]]
 ; INTERLEAVE:       for.end:
-; INTERLEAVE-NEXT:    [[TMP11:%.*]] = phi i64 [ [[TMP10]], [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
-; INTERLEAVE-NEXT:    ret i64 [[TMP11]]
+; INTERLEAVE-NEXT:    [[TMP9:%.*]] = phi i64 [ [[TMP8]], [[FOR_BODY]] ], [ [[TMP5]], [[MIDDLE_BLOCK]] ]
+; INTERLEAVE-NEXT:    ret i64 [[TMP9]]
 ;
 entry:
   br label %for.body
@@ -884,8 +851,8 @@ entry:
 for.body:
   %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
   %sum = phi i64 [ %2, %for.body ], [ 0, %entry ]
-  %0 = getelementptr inbounds i64, i64* %a, i64 %i
-  %1 = load i64, i64* %0, align 8
+  %0 = getelementptr inbounds i64, ptr %a, i64 %i
+  %1 = load i64, ptr %0, align 8
   %2 = add i64 %1, %sum
   %i.next = add nuw nsw i64 %i, 1
   %cond = icmp slt i64 %i.next, %n
@@ -907,7 +874,7 @@ for.end:
 ;
 ;
 
-define float @scalarize_induction_variable_02(float* %a, float* %b, i64 %n) {
+define float @scalarize_induction_variable_02(ptr %a, ptr %b, i64 %n) {
 ; CHECK-LABEL: @scalarize_induction_variable_02(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 8)
@@ -927,16 +894,16 @@ define float @scalarize_induction_variable_02(float* %a, float* %b, i64 %n) {
 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], 0
 ; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[OFFSET_IDX]], 8
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP3]]
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP7:%.*]] = load float, float* [[TMP5]], align 4
-; CHECK-NEXT:    [[TMP8:%.*]] = load float, float* [[TMP6]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP7:%.*]] = load float, ptr [[TMP5]], align 4
+; CHECK-NEXT:    [[TMP8:%.*]] = load float, ptr [[TMP6]], align 4
 ; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <2 x float> poison, float [[TMP7]], i32 0
 ; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <2 x float> [[TMP9]], float [[TMP8]], i32 1
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP3]]
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP13:%.*]] = load float, float* [[TMP11]], align 4
-; CHECK-NEXT:    [[TMP14:%.*]] = load float, float* [[TMP12]], align 4
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP13:%.*]] = load float, ptr [[TMP11]], align 4
+; CHECK-NEXT:    [[TMP14:%.*]] = load float, ptr [[TMP12]], align 4
 ; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <2 x float> poison, float [[TMP13]], i32 0
 ; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <2 x float> [[TMP15]], float [[TMP14]], i32 1
 ; CHECK-NEXT:    [[TMP17:%.*]] = fadd fast <2 x float> [[VEC_PHI]], <float 1.000000e+00, float 1.000000e+00>
@@ -956,10 +923,10 @@ define float @scalarize_induction_variable_02(float* %a, float* %b, i64 %n) {
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[FOR_BODY]] ]
 ; CHECK-NEXT:    [[S:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[TMP28:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[I]]
-; CHECK-NEXT:    [[TMP23:%.*]] = load float, float* [[TMP22]], align 4
-; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[I]]
-; CHECK-NEXT:    [[TMP25:%.*]] = load float, float* [[TMP24]], align 4
+; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[I]]
+; CHECK-NEXT:    [[TMP23:%.*]] = load float, ptr [[TMP22]], align 4
+; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[I]]
+; CHECK-NEXT:    [[TMP25:%.*]] = load float, ptr [[TMP24]], align 4
 ; CHECK-NEXT:    [[TMP26:%.*]] = fadd fast float [[S]], 1.000000e+00
 ; CHECK-NEXT:    [[TMP27:%.*]] = fadd fast float [[TMP26]], [[TMP23]]
 ; CHECK-NEXT:    [[TMP28]] = fadd fast float [[TMP27]], [[TMP25]]
@@ -987,16 +954,16 @@ define float @scalarize_induction_variable_02(float* %a, float* %b, i64 %n) {
 ; IND-NEXT:    [[VEC_PHI:%.*]] = phi <2 x float> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ]
 ; IND-NEXT:    [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 3
 ; IND-NEXT:    [[TMP3:%.*]] = or i64 [[OFFSET_IDX]], 8
-; IND-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[OFFSET_IDX]]
-; IND-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP3]]
-; IND-NEXT:    [[TMP6:%.*]] = load float, float* [[TMP4]], align 4
-; IND-NEXT:    [[TMP7:%.*]] = load float, float* [[TMP5]], align 4
+; IND-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[OFFSET_IDX]]
+; IND-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP3]]
+; IND-NEXT:    [[TMP6:%.*]] = load float, ptr [[TMP4]], align 4
+; IND-NEXT:    [[TMP7:%.*]] = load float, ptr [[TMP5]], align 4
 ; IND-NEXT:    [[TMP8:%.*]] = insertelement <2 x float> poison, float [[TMP6]], i64 0
 ; IND-NEXT:    [[TMP9:%.*]] = insertelement <2 x float> [[TMP8]], float [[TMP7]], i64 1
-; IND-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[OFFSET_IDX]]
-; IND-NEXT:    [[TMP11:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP3]]
-; IND-NEXT:    [[TMP12:%.*]] = load float, float* [[TMP10]], align 4
-; IND-NEXT:    [[TMP13:%.*]] = load float, float* [[TMP11]], align 4
+; IND-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 [[OFFSET_IDX]]
+; IND-NEXT:    [[TMP11:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP3]]
+; IND-NEXT:    [[TMP12:%.*]] = load float, ptr [[TMP10]], align 4
+; IND-NEXT:    [[TMP13:%.*]] = load float, ptr [[TMP11]], align 4
 ; IND-NEXT:    [[TMP14:%.*]] = insertelement <2 x float> poison, float [[TMP12]], i64 0
 ; IND-NEXT:    [[TMP15:%.*]] = insertelement <2 x float> [[TMP14]], float [[TMP13]], i64 1
 ; IND-NEXT:    [[TMP16:%.*]] = fadd fast <2 x float> [[VEC_PHI]], <float 1.000000e+00, float 1.000000e+00>
@@ -1016,10 +983,10 @@ define float @scalarize_induction_variable_02(float* %a, float* %b, i64 %n) {
 ; IND:       for.body:
 ; IND-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[FOR_BODY]] ]
 ; IND-NEXT:    [[S:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[TMP27:%.*]], [[FOR_BODY]] ]
-; IND-NEXT:    [[TMP21:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[I]]
-; IND-NEXT:    [[TMP22:%.*]] = load float, float* [[TMP21]], align 4
-; IND-NEXT:    [[TMP23:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[I]]
-; IND-NEXT:    [[TMP24:%.*]] = load float, float* [[TMP23]], align 4
+; IND-NEXT:    [[TMP21:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[I]]
+; IND-NEXT:    [[TMP22:%.*]] = load float, ptr [[TMP21]], align 4
+; IND-NEXT:    [[TMP23:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[I]]
+; IND-NEXT:    [[TMP24:%.*]] = load float, ptr [[TMP23]], align 4
 ; IND-NEXT:    [[TMP25:%.*]] = fadd fast float [[S]], 1.000000e+00
 ; IND-NEXT:    [[TMP26:%.*]] = fadd fast float [[TMP25]], [[TMP22]]
 ; IND-NEXT:    [[TMP27]] = fadd fast float [[TMP26]], [[TMP24]]
@@ -1050,28 +1017,28 @@ define float @scalarize_induction_variable_02(float* %a, float* %b, i64 %n) {
 ; UNROLL-NEXT:    [[TMP3:%.*]] = or i64 [[OFFSET_IDX]], 8
 ; UNROLL-NEXT:    [[TMP4:%.*]] = or i64 [[OFFSET_IDX]], 16
 ; UNROLL-NEXT:    [[TMP5:%.*]] = or i64 [[OFFSET_IDX]], 24
-; UNROLL-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[OFFSET_IDX]]
-; UNROLL-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP3]]
-; UNROLL-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP4]]
-; UNROLL-NEXT:    [[TMP9:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP5]]
-; UNROLL-NEXT:    [[TMP10:%.*]] = load float, float* [[TMP6]], align 4
-; UNROLL-NEXT:    [[TMP11:%.*]] = load float, float* [[TMP7]], align 4
+; UNROLL-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[OFFSET_IDX]]
+; UNROLL-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP3]]
+; UNROLL-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP4]]
+; UNROLL-NEXT:    [[TMP9:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP5]]
+; UNROLL-NEXT:    [[TMP10:%.*]] = load float, ptr [[TMP6]], align 4
+; UNROLL-NEXT:    [[TMP11:%.*]] = load float, ptr [[TMP7]], align 4
 ; UNROLL-NEXT:    [[TMP12:%.*]] = insertelement <2 x float> poison, float [[TMP10]], i64 0
 ; UNROLL-NEXT:    [[TMP13:%.*]] = insertelement <2 x float> [[TMP12]], float [[TMP11]], i64 1
-; UNROLL-NEXT:    [[TMP14:%.*]] = load float, float* [[TMP8]], align 4
-; UNROLL-NEXT:    [[TMP15:%.*]] = load float, float* [[TMP9]], align 4
+; UNROLL-NEXT:    [[TMP14:%.*]] = load float, ptr [[TMP8]], align 4
+; UNROLL-NEXT:    [[TMP15:%.*]] = load float, ptr [[TMP9]], align 4
 ; UNROLL-NEXT:    [[TMP16:%.*]] = insertelement <2 x float> poison, float [[TMP14]], i64 0
 ; UNROLL-NEXT:    [[TMP17:%.*]] = insertelement <2 x float> [[TMP16]], float [[TMP15]], i64 1
-; UNROLL-NEXT:    [[TMP18:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[OFFSET_IDX]]
-; UNROLL-NEXT:    [[TMP19:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP3]]
-; UNROLL-NEXT:    [[TMP20:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP4]]
-; UNROLL-NEXT:    [[TMP21:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP5]]
-; UNROLL-NEXT:    [[TMP22:%.*]] = load float, float* [[TMP18]], align 4
-; UNROLL-NEXT:    [[TMP23:%.*]] = load float, float* [[TMP19]], align 4
+; UNROLL-NEXT:    [[TMP18:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 [[OFFSET_IDX]]
+; UNROLL-NEXT:    [[TMP19:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP3]]
+; UNROLL-NEXT:    [[TMP20:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP4]]
+; UNROLL-NEXT:    [[TMP21:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP5]]
+; UNROLL-NEXT:    [[TMP22:%.*]] = load float, ptr [[TMP18]], align 4
+; UNROLL-NEXT:    [[TMP23:%.*]] = load float, ptr [[TMP19]], align 4
 ; UNROLL-NEXT:    [[TMP24:%.*]] = insertelement <2 x float> poison, float [[TMP22]], i64 0
 ; UNROLL-NEXT:    [[TMP25:%.*]] = insertelement <2 x float> [[TMP24]], float [[TMP23]], i64 1
-; UNROLL-NEXT:    [[TMP26:%.*]] = load float, float* [[TMP20]], align 4
-; UNROLL-NEXT:    [[TMP27:%.*]] = load float, float* [[TMP21]], align 4
+; UNROLL-NEXT:    [[TMP26:%.*]] = load float, ptr [[TMP20]], align 4
+; UNROLL-NEXT:    [[TMP27:%.*]] = load float, ptr [[TMP21]], align 4
 ; UNROLL-NEXT:    [[TMP28:%.*]] = insertelement <2 x float> poison, float [[TMP26]], i64 0
 ; UNROLL-NEXT:    [[TMP29:%.*]] = insertelement <2 x float> [[TMP28]], float [[TMP27]], i64 1
 ; UNROLL-NEXT:    [[TMP30:%.*]] = fadd fast <2 x float> [[VEC_PHI]], <float 1.000000e+00, float 1.000000e+00>
@@ -1095,10 +1062,10 @@ define float @scalarize_induction_variable_02(float* %a, float* %b, i64 %n) {
 ; UNROLL:       for.body:
 ; UNROLL-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[FOR_BODY]] ]
 ; UNROLL-NEXT:    [[S:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[TMP44:%.*]], [[FOR_BODY]] ]
-; UNROLL-NEXT:    [[TMP38:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[I]]
-; UNROLL-NEXT:    [[TMP39:%.*]] = load float, float* [[TMP38]], align 4
-; UNROLL-NEXT:    [[TMP40:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[I]]
-; UNROLL-NEXT:    [[TMP41:%.*]] = load float, float* [[TMP40]], align 4
+; UNROLL-NEXT:    [[TMP38:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[I]]
+; UNROLL-NEXT:    [[TMP39:%.*]] = load float, ptr [[TMP38]], align 4
+; UNROLL-NEXT:    [[TMP40:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[I]]
+; UNROLL-NEXT:    [[TMP41:%.*]] = load float, ptr [[TMP40]], align 4
 ; UNROLL-NEXT:    [[TMP42:%.*]] = fadd fast float [[S]], 1.000000e+00
 ; UNROLL-NEXT:    [[TMP43:%.*]] = fadd fast float [[TMP42]], [[TMP39]]
 ; UNROLL-NEXT:    [[TMP44]] = fadd fast float [[TMP43]], [[TMP41]]
@@ -1131,28 +1098,28 @@ define float @scalarize_induction_variable_02(float* %a, float* %b, i64 %n) {
 ; UNROLL-NO-IC-NEXT:    [[TMP4:%.*]] = add i64 [[OFFSET_IDX]], 8
 ; UNROLL-NO-IC-NEXT:    [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 16
 ; UNROLL-NO-IC-NEXT:    [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 24
-; UNROLL-NO-IC-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP3]]
-; UNROLL-NO-IC-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP4]]
-; UNROLL-NO-IC-NEXT:    [[TMP9:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP5]]
-; UNROLL-NO-IC-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP6]]
-; UNROLL-NO-IC-NEXT:    [[TMP11:%.*]] = load float, float* [[TMP7]], align 4
-; UNROLL-NO-IC-NEXT:    [[TMP12:%.*]] = load float, float* [[TMP8]], align 4
+; UNROLL-NO-IC-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[TMP3]]
+; UNROLL-NO-IC-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP4]]
+; UNROLL-NO-IC-NEXT:    [[TMP9:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP5]]
+; UNROLL-NO-IC-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP6]]
+; UNROLL-NO-IC-NEXT:    [[TMP11:%.*]] = load float, ptr [[TMP7]], align 4
+; UNROLL-NO-IC-NEXT:    [[TMP12:%.*]] = load float, ptr [[TMP8]], align 4
 ; UNROLL-NO-IC-NEXT:    [[TMP13:%.*]] = insertelement <2 x float> poison, float [[TMP11]], i32 0
 ; UNROLL-NO-IC-NEXT:    [[TMP14:%.*]] = insertelement <2 x float> [[TMP13]], float [[TMP12]], i32 1
-; UNROLL-NO-IC-NEXT:    [[TMP15:%.*]] = load float, float* [[TMP9]], align 4
-; UNROLL-NO-IC-NEXT:    [[TMP16:%.*]] = load float, float* [[TMP10]], align 4
+; UNROLL-NO-IC-NEXT:    [[TMP15:%.*]] = load float, ptr [[TMP9]], align 4
+; UNROLL-NO-IC-NEXT:    [[TMP16:%.*]] = load float, ptr [[TMP10]], align 4
 ; UNROLL-NO-IC-NEXT:    [[TMP17:%.*]] = insertelement <2 x float> poison, float [[TMP15]], i32 0
 ; UNROLL-NO-IC-NEXT:    [[TMP18:%.*]] = insertelement <2 x float> [[TMP17]], float [[TMP16]], i32 1
-; UNROLL-NO-IC-NEXT:    [[TMP19:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP3]]
-; UNROLL-NO-IC-NEXT:    [[TMP20:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP4]]
-; UNROLL-NO-IC-NEXT:    [[TMP21:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP5]]
-; UNROLL-NO-IC-NEXT:    [[TMP22:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP6]]
-; UNROLL-NO-IC-NEXT:    [[TMP23:%.*]] = load float, float* [[TMP19]], align 4
-; UNROLL-NO-IC-NEXT:    [[TMP24:%.*]] = load float, float* [[TMP20]], align 4
+; UNROLL-NO-IC-NEXT:    [[TMP19:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 [[TMP3]]
+; UNROLL-NO-IC-NEXT:    [[TMP20:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP4]]
+; UNROLL-NO-IC-NEXT:    [[TMP21:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP5]]
+; UNROLL-NO-IC-NEXT:    [[TMP22:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP6]]
+; UNROLL-NO-IC-NEXT:    [[TMP23:%.*]] = load float, ptr [[TMP19]], align 4
+; UNROLL-NO-IC-NEXT:    [[TMP24:%.*]] = load float, ptr [[TMP20]], align 4
 ; UNROLL-NO-IC-NEXT:    [[TMP25:%.*]] = insertelement <2 x float> poison, float [[TMP23]], i32 0
 ; UNROLL-NO-IC-NEXT:    [[TMP26:%.*]] = insertelement <2 x float> [[TMP25]], float [[TMP24]], i32 1
-; UNROLL-NO-IC-NEXT:    [[TMP27:%.*]] = load float, float* [[TMP21]], align 4
-; UNROLL-NO-IC-NEXT:    [[TMP28:%.*]] = load float, float* [[TMP22]], align 4
+; UNROLL-NO-IC-NEXT:    [[TMP27:%.*]] = load float, ptr [[TMP21]], align 4
+; UNROLL-NO-IC-NEXT:    [[TMP28:%.*]] = load float, ptr [[TMP22]], align 4
 ; UNROLL-NO-IC-NEXT:    [[TMP29:%.*]] = insertelement <2 x float> poison, float [[TMP27]], i32 0
 ; UNROLL-NO-IC-NEXT:    [[TMP30:%.*]] = insertelement <2 x float> [[TMP29]], float [[TMP28]], i32 1
 ; UNROLL-NO-IC-NEXT:    [[TMP31:%.*]] = fadd fast <2 x float> [[VEC_PHI]], <float 1.000000e+00, float 1.000000e+00>
@@ -1176,10 +1143,10 @@ define float @scalarize_induction_variable_02(float* %a, float* %b, i64 %n) {
 ; UNROLL-NO-IC:       for.body:
 ; UNROLL-NO-IC-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[FOR_BODY]] ]
 ; UNROLL-NO-IC-NEXT:    [[S:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[TMP45:%.*]], [[FOR_BODY]] ]
-; UNROLL-NO-IC-NEXT:    [[TMP39:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[I]]
-; UNROLL-NO-IC-NEXT:    [[TMP40:%.*]] = load float, float* [[TMP39]], align 4
-; UNROLL-NO-IC-NEXT:    [[TMP41:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[I]]
-; UNROLL-NO-IC-NEXT:    [[TMP42:%.*]] = load float, float* [[TMP41]], align 4
+; UNROLL-NO-IC-NEXT:    [[TMP39:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[I]]
+; UNROLL-NO-IC-NEXT:    [[TMP40:%.*]] = load float, ptr [[TMP39]], align 4
+; UNROLL-NO-IC-NEXT:    [[TMP41:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[I]]
+; UNROLL-NO-IC-NEXT:    [[TMP42:%.*]] = load float, ptr [[TMP41]], align 4
 ; UNROLL-NO-IC-NEXT:    [[TMP43:%.*]] = fadd fast float [[S]], 1.000000e+00
 ; UNROLL-NO-IC-NEXT:    [[TMP44:%.*]] = fadd fast float [[TMP43]], [[TMP40]]
 ; UNROLL-NO-IC-NEXT:    [[TMP45]] = fadd fast float [[TMP44]], [[TMP42]]
@@ -1207,58 +1174,54 @@ define float @scalarize_induction_variable_02(float* %a, float* %b, i64 %n) {
 ; INTERLEAVE-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; INTERLEAVE:       vector.body:
 ; INTERLEAVE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; INTERLEAVE-NEXT:    [[VEC_PHI:%.*]] = phi <4 x float> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ]
-; INTERLEAVE-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x float> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
+; INTERLEAVE-NEXT:    [[VEC_PHI:%.*]] = phi <4 x float> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ]
+; INTERLEAVE-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x float> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ]
 ; INTERLEAVE-NEXT:    [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 3
 ; INTERLEAVE-NEXT:    [[TMP5:%.*]] = or i64 [[OFFSET_IDX]], 32
-; INTERLEAVE-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[OFFSET_IDX]]
-; INTERLEAVE-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP5]]
-; INTERLEAVE-NEXT:    [[TMP8:%.*]] = bitcast float* [[TMP6]] to <32 x float>*
-; INTERLEAVE-NEXT:    [[TMP9:%.*]] = bitcast float* [[TMP7]] to <32 x float>*
-; INTERLEAVE-NEXT:    [[WIDE_VEC:%.*]] = load <32 x float>, <32 x float>* [[TMP8]], align 4
-; INTERLEAVE-NEXT:    [[WIDE_VEC2:%.*]] = load <32 x float>, <32 x float>* [[TMP9]], align 4
+; INTERLEAVE-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[OFFSET_IDX]]
+; INTERLEAVE-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP5]]
+; INTERLEAVE-NEXT:    [[WIDE_VEC:%.*]] = load <32 x float>, ptr [[TMP6]], align 4
+; INTERLEAVE-NEXT:    [[WIDE_VEC2:%.*]] = load <32 x float>, ptr [[TMP7]], align 4
 ; INTERLEAVE-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <32 x float> [[WIDE_VEC]], <32 x float> poison, <4 x i32> <i32 0, i32 8, i32 16, i32 24>
 ; INTERLEAVE-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <32 x float> [[WIDE_VEC2]], <32 x float> poison, <4 x i32> <i32 0, i32 8, i32 16, i32 24>
-; INTERLEAVE-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[OFFSET_IDX]]
-; INTERLEAVE-NEXT:    [[TMP11:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP5]]
-; INTERLEAVE-NEXT:    [[TMP12:%.*]] = bitcast float* [[TMP10]] to <32 x float>*
-; INTERLEAVE-NEXT:    [[TMP13:%.*]] = bitcast float* [[TMP11]] to <32 x float>*
-; INTERLEAVE-NEXT:    [[WIDE_VEC4:%.*]] = load <32 x float>, <32 x float>* [[TMP12]], align 4
-; INTERLEAVE-NEXT:    [[WIDE_VEC5:%.*]] = load <32 x float>, <32 x float>* [[TMP13]], align 4
+; INTERLEAVE-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 [[OFFSET_IDX]]
+; INTERLEAVE-NEXT:    [[TMP9:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP5]]
+; INTERLEAVE-NEXT:    [[WIDE_VEC4:%.*]] = load <32 x float>, ptr [[TMP8]], align 4
+; INTERLEAVE-NEXT:    [[WIDE_VEC5:%.*]] = load <32 x float>, ptr [[TMP9]], align 4
 ; INTERLEAVE-NEXT:    [[STRIDED_VEC6:%.*]] = shufflevector <32 x float> [[WIDE_VEC4]], <32 x float> poison, <4 x i32> <i32 0, i32 8, i32 16, i32 24>
 ; INTERLEAVE-NEXT:    [[STRIDED_VEC7:%.*]] = shufflevector <32 x float> [[WIDE_VEC5]], <32 x float> poison, <4 x i32> <i32 0, i32 8, i32 16, i32 24>
-; INTERLEAVE-NEXT:    [[TMP14:%.*]] = fadd fast <4 x float> [[VEC_PHI]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
-; INTERLEAVE-NEXT:    [[TMP15:%.*]] = fadd fast <4 x float> [[VEC_PHI1]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
-; INTERLEAVE-NEXT:    [[TMP16:%.*]] = fadd fast <4 x float> [[TMP14]], [[STRIDED_VEC]]
-; INTERLEAVE-NEXT:    [[TMP17:%.*]] = fadd fast <4 x float> [[TMP15]], [[STRIDED_VEC3]]
-; INTERLEAVE-NEXT:    [[TMP18]] = fadd fast <4 x float> [[TMP16]], [[STRIDED_VEC6]]
-; INTERLEAVE-NEXT:    [[TMP19]] = fadd fast <4 x float> [[TMP17]], [[STRIDED_VEC7]]
+; INTERLEAVE-NEXT:    [[TMP10:%.*]] = fadd fast <4 x float> [[VEC_PHI]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+; INTERLEAVE-NEXT:    [[TMP11:%.*]] = fadd fast <4 x float> [[VEC_PHI1]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+; INTERLEAVE-NEXT:    [[TMP12:%.*]] = fadd fast <4 x float> [[TMP10]], [[STRIDED_VEC]]
+; INTERLEAVE-NEXT:    [[TMP13:%.*]] = fadd fast <4 x float> [[TMP11]], [[STRIDED_VEC3]]
+; INTERLEAVE-NEXT:    [[TMP14]] = fadd fast <4 x float> [[TMP12]], [[STRIDED_VEC6]]
+; INTERLEAVE-NEXT:    [[TMP15]] = fadd fast <4 x float> [[TMP13]], [[STRIDED_VEC7]]
 ; INTERLEAVE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; INTERLEAVE-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; INTERLEAVE-NEXT:    br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
+; INTERLEAVE-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; INTERLEAVE-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
 ; INTERLEAVE:       middle.block:
-; INTERLEAVE-NEXT:    [[BIN_RDX:%.*]] = fadd fast <4 x float> [[TMP19]], [[TMP18]]
-; INTERLEAVE-NEXT:    [[TMP21:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[BIN_RDX]])
+; INTERLEAVE-NEXT:    [[BIN_RDX:%.*]] = fadd fast <4 x float> [[TMP15]], [[TMP14]]
+; INTERLEAVE-NEXT:    [[TMP17:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[BIN_RDX]])
 ; INTERLEAVE-NEXT:    br label [[SCALAR_PH]]
 ; INTERLEAVE:       scalar.ph:
 ; INTERLEAVE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; INTERLEAVE-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP21]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[ENTRY]] ]
+; INTERLEAVE-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP17]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[ENTRY]] ]
 ; INTERLEAVE-NEXT:    br label [[FOR_BODY:%.*]]
 ; INTERLEAVE:       for.body:
 ; INTERLEAVE-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[FOR_BODY]] ]
-; INTERLEAVE-NEXT:    [[S:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[TMP28:%.*]], [[FOR_BODY]] ]
-; INTERLEAVE-NEXT:    [[TMP22:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[I]]
-; INTERLEAVE-NEXT:    [[TMP23:%.*]] = load float, float* [[TMP22]], align 4
-; INTERLEAVE-NEXT:    [[TMP24:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[I]]
-; INTERLEAVE-NEXT:    [[TMP25:%.*]] = load float, float* [[TMP24]], align 4
-; INTERLEAVE-NEXT:    [[TMP26:%.*]] = fadd fast float [[S]], 1.000000e+00
-; INTERLEAVE-NEXT:    [[TMP27:%.*]] = fadd fast float [[TMP26]], [[TMP23]]
-; INTERLEAVE-NEXT:    [[TMP28]] = fadd fast float [[TMP27]], [[TMP25]]
+; INTERLEAVE-NEXT:    [[S:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[TMP24:%.*]], [[FOR_BODY]] ]
+; INTERLEAVE-NEXT:    [[TMP18:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[I]]
+; INTERLEAVE-NEXT:    [[TMP19:%.*]] = load float, ptr [[TMP18]], align 4
+; INTERLEAVE-NEXT:    [[TMP20:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[I]]
+; INTERLEAVE-NEXT:    [[TMP21:%.*]] = load float, ptr [[TMP20]], align 4
+; INTERLEAVE-NEXT:    [[TMP22:%.*]] = fadd fast float [[S]], 1.000000e+00
+; INTERLEAVE-NEXT:    [[TMP23:%.*]] = fadd fast float [[TMP22]], [[TMP19]]
+; INTERLEAVE-NEXT:    [[TMP24]] = fadd fast float [[TMP23]], [[TMP21]]
 ; INTERLEAVE-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 8
 ; INTERLEAVE-NEXT:    [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
 ; INTERLEAVE-NEXT:    br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END:%.*]], !llvm.loop [[LOOP14:![0-9]+]]
 ; INTERLEAVE:       for.end:
-; INTERLEAVE-NEXT:    ret float [[TMP28]]
+; INTERLEAVE-NEXT:    ret float [[TMP24]]
 ;
 entry:
   br label %for.body
@@ -1266,10 +1229,10 @@ entry:
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
   %s = phi float [ 0.0, %entry ], [ %6, %for.body ]
-  %0 = getelementptr inbounds float, float* %a, i64 %i
-  %1 = load float, float* %0, align 4
-  %2 = getelementptr inbounds float, float* %b, i64 %i
-  %3 = load float, float* %2, align 4
+  %0 = getelementptr inbounds float, ptr %a, i64 %i
+  %1 = load float, ptr %0, align 4
+  %2 = getelementptr inbounds float, ptr %b, i64 %i
+  %3 = load float, ptr %2, align 4
   %4 = fadd fast float %s, 1.0
   %5 = fadd fast float %4, %1
   %6 = fadd fast float %5, %3
@@ -1290,7 +1253,7 @@ for.end:
 ;
 
 %pair.i32 = type { i32, i32 }
-define void @scalarize_induction_variable_03(%pair.i32 *%p, i32 %y, i64 %n) {
+define void @scalarize_induction_variable_03(ptr %p, i32 %y, i64 %n) {
 ; CHECK-LABEL: @scalarize_induction_variable_03(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1)
@@ -1306,17 +1269,17 @@ define void @scalarize_induction_variable_03(%pair.i32 *%p, i32 %y, i64 %n) {
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 1
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], %pair.i32* [[P:%.*]], i64 [[TMP0]], i32 1
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP1]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = load i32, i32* [[TMP2]], align 8
-; CHECK-NEXT:    [[TMP5:%.*]] = load i32, i32* [[TMP3]], align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], ptr [[P:%.*]], i64 [[TMP0]], i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP1]], i32 1
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr [[TMP2]], align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = load i32, ptr [[TMP3]], align 8
 ; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[TMP4]], i32 0
 ; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x i32> [[TMP6]], i32 [[TMP5]], i32 1
 ; CHECK-NEXT:    [[TMP8:%.*]] = xor <2 x i32> [[TMP7]], [[BROADCAST_SPLAT]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x i32> [[TMP8]], i32 0
-; CHECK-NEXT:    store i32 [[TMP9]], i32* [[TMP2]], align 8
+; CHECK-NEXT:    store i32 [[TMP9]], ptr [[TMP2]], align 8
 ; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <2 x i32> [[TMP8]], i32 1
-; CHECK-NEXT:    store i32 [[TMP10]], i32* [[TMP3]], align 8
+; CHECK-NEXT:    store i32 [[TMP10]], ptr [[TMP3]], align 8
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
@@ -1328,10 +1291,10 @@ define void @scalarize_induction_variable_03(%pair.i32 *%p, i32 %y, i64 %n) {
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[F:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[I]], i32 1
-; CHECK-NEXT:    [[TMP12:%.*]] = load i32, i32* [[F]], align 8
+; CHECK-NEXT:    [[F:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[I]], i32 1
+; CHECK-NEXT:    [[TMP12:%.*]] = load i32, ptr [[F]], align 8
 ; CHECK-NEXT:    [[TMP13:%.*]] = xor i32 [[TMP12]], [[Y]]
-; CHECK-NEXT:    store i32 [[TMP13]], i32* [[F]], align 8
+; CHECK-NEXT:    store i32 [[TMP13]], ptr [[F]], align 8
 ; CHECK-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 1
 ; CHECK-NEXT:    [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
 ; CHECK-NEXT:    br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP16:![0-9]+]]
@@ -1351,17 +1314,17 @@ define void @scalarize_induction_variable_03(%pair.i32 *%p, i32 %y, i64 %n) {
 ; IND:       vector.body:
 ; IND-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; IND-NEXT:    [[TMP0:%.*]] = or i64 [[INDEX]], 1
-; IND-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], %pair.i32* [[P:%.*]], i64 [[INDEX]], i32 1
-; IND-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP0]], i32 1
-; IND-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP1]], align 8
-; IND-NEXT:    [[TMP4:%.*]] = load i32, i32* [[TMP2]], align 8
+; IND-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], ptr [[P:%.*]], i64 [[INDEX]], i32 1
+; IND-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP0]], i32 1
+; IND-NEXT:    [[TMP3:%.*]] = load i32, ptr [[TMP1]], align 8
+; IND-NEXT:    [[TMP4:%.*]] = load i32, ptr [[TMP2]], align 8
 ; IND-NEXT:    [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[TMP3]], i64 0
 ; IND-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> [[TMP5]], i32 [[TMP4]], i64 1
 ; IND-NEXT:    [[TMP7:%.*]] = xor <2 x i32> [[TMP6]], [[BROADCAST_SPLAT]]
 ; IND-NEXT:    [[TMP8:%.*]] = extractelement <2 x i32> [[TMP7]], i64 0
-; IND-NEXT:    store i32 [[TMP8]], i32* [[TMP1]], align 8
+; IND-NEXT:    store i32 [[TMP8]], ptr [[TMP1]], align 8
 ; IND-NEXT:    [[TMP9:%.*]] = extractelement <2 x i32> [[TMP7]], i64 1
-; IND-NEXT:    store i32 [[TMP9]], i32* [[TMP2]], align 8
+; IND-NEXT:    store i32 [[TMP9]], ptr [[TMP2]], align 8
 ; IND-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; IND-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; IND-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
@@ -1373,10 +1336,10 @@ define void @scalarize_induction_variable_03(%pair.i32 *%p, i32 %y, i64 %n) {
 ; IND-NEXT:    br label [[FOR_BODY:%.*]]
 ; IND:       for.body:
 ; IND-NEXT:    [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; IND-NEXT:    [[F:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[I]], i32 1
-; IND-NEXT:    [[TMP11:%.*]] = load i32, i32* [[F]], align 8
+; IND-NEXT:    [[F:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[I]], i32 1
+; IND-NEXT:    [[TMP11:%.*]] = load i32, ptr [[F]], align 8
 ; IND-NEXT:    [[TMP12:%.*]] = xor i32 [[TMP11]], [[Y]]
-; IND-NEXT:    store i32 [[TMP12]], i32* [[F]], align 8
+; IND-NEXT:    store i32 [[TMP12]], ptr [[F]], align 8
 ; IND-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 1
 ; IND-NEXT:    [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
 ; IND-NEXT:    br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP16:![0-9]+]]
@@ -1400,28 +1363,28 @@ define void @scalarize_induction_variable_03(%pair.i32 *%p, i32 %y, i64 %n) {
 ; UNROLL-NEXT:    [[TMP0:%.*]] = or i64 [[INDEX]], 1
 ; UNROLL-NEXT:    [[TMP1:%.*]] = or i64 [[INDEX]], 2
 ; UNROLL-NEXT:    [[TMP2:%.*]] = or i64 [[INDEX]], 3
-; UNROLL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], %pair.i32* [[P:%.*]], i64 [[INDEX]], i32 1
-; UNROLL-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP0]], i32 1
-; UNROLL-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP1]], i32 1
-; UNROLL-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP2]], i32 1
-; UNROLL-NEXT:    [[TMP7:%.*]] = load i32, i32* [[TMP3]], align 8
-; UNROLL-NEXT:    [[TMP8:%.*]] = load i32, i32* [[TMP4]], align 8
+; UNROLL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], ptr [[P:%.*]], i64 [[INDEX]], i32 1
+; UNROLL-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP0]], i32 1
+; UNROLL-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP1]], i32 1
+; UNROLL-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP2]], i32 1
+; UNROLL-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP3]], align 8
+; UNROLL-NEXT:    [[TMP8:%.*]] = load i32, ptr [[TMP4]], align 8
 ; UNROLL-NEXT:    [[TMP9:%.*]] = insertelement <2 x i32> poison, i32 [[TMP7]], i64 0
 ; UNROLL-NEXT:    [[TMP10:%.*]] = insertelement <2 x i32> [[TMP9]], i32 [[TMP8]], i64 1
-; UNROLL-NEXT:    [[TMP11:%.*]] = load i32, i32* [[TMP5]], align 8
-; UNROLL-NEXT:    [[TMP12:%.*]] = load i32, i32* [[TMP6]], align 8
+; UNROLL-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP5]], align 8
+; UNROLL-NEXT:    [[TMP12:%.*]] = load i32, ptr [[TMP6]], align 8
 ; UNROLL-NEXT:    [[TMP13:%.*]] = insertelement <2 x i32> poison, i32 [[TMP11]], i64 0
 ; UNROLL-NEXT:    [[TMP14:%.*]] = insertelement <2 x i32> [[TMP13]], i32 [[TMP12]], i64 1
 ; UNROLL-NEXT:    [[TMP15:%.*]] = xor <2 x i32> [[TMP10]], [[BROADCAST_SPLAT]]
 ; UNROLL-NEXT:    [[TMP16:%.*]] = xor <2 x i32> [[TMP14]], [[BROADCAST_SPLAT2]]
 ; UNROLL-NEXT:    [[TMP17:%.*]] = extractelement <2 x i32> [[TMP15]], i64 0
-; UNROLL-NEXT:    store i32 [[TMP17]], i32* [[TMP3]], align 8
+; UNROLL-NEXT:    store i32 [[TMP17]], ptr [[TMP3]], align 8
 ; UNROLL-NEXT:    [[TMP18:%.*]] = extractelement <2 x i32> [[TMP15]], i64 1
-; UNROLL-NEXT:    store i32 [[TMP18]], i32* [[TMP4]], align 8
+; UNROLL-NEXT:    store i32 [[TMP18]], ptr [[TMP4]], align 8
 ; UNROLL-NEXT:    [[TMP19:%.*]] = extractelement <2 x i32> [[TMP16]], i64 0
-; UNROLL-NEXT:    store i32 [[TMP19]], i32* [[TMP5]], align 8
+; UNROLL-NEXT:    store i32 [[TMP19]], ptr [[TMP5]], align 8
 ; UNROLL-NEXT:    [[TMP20:%.*]] = extractelement <2 x i32> [[TMP16]], i64 1
-; UNROLL-NEXT:    store i32 [[TMP20]], i32* [[TMP6]], align 8
+; UNROLL-NEXT:    store i32 [[TMP20]], ptr [[TMP6]], align 8
 ; UNROLL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; UNROLL-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; UNROLL-NEXT:    br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
@@ -1433,10 +1396,10 @@ define void @scalarize_induction_variable_03(%pair.i32 *%p, i32 %y, i64 %n) {
 ; UNROLL-NEXT:    br label [[FOR_BODY:%.*]]
 ; UNROLL:       for.body:
 ; UNROLL-NEXT:    [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; UNROLL-NEXT:    [[F:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[I]], i32 1
-; UNROLL-NEXT:    [[TMP22:%.*]] = load i32, i32* [[F]], align 8
+; UNROLL-NEXT:    [[F:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[I]], i32 1
+; UNROLL-NEXT:    [[TMP22:%.*]] = load i32, ptr [[F]], align 8
 ; UNROLL-NEXT:    [[TMP23:%.*]] = xor i32 [[TMP22]], [[Y]]
-; UNROLL-NEXT:    store i32 [[TMP23]], i32* [[F]], align 8
+; UNROLL-NEXT:    store i32 [[TMP23]], ptr [[F]], align 8
 ; UNROLL-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 1
 ; UNROLL-NEXT:    [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
 ; UNROLL-NEXT:    br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP16:![0-9]+]]
@@ -1462,28 +1425,28 @@ define void @scalarize_induction_variable_03(%pair.i32 *%p, i32 %y, i64 %n) {
 ; UNROLL-NO-IC-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 1
 ; UNROLL-NO-IC-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 2
 ; UNROLL-NO-IC-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 3
-; UNROLL-NO-IC-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], %pair.i32* [[P:%.*]], i64 [[TMP0]], i32 1
-; UNROLL-NO-IC-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP1]], i32 1
-; UNROLL-NO-IC-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP2]], i32 1
-; UNROLL-NO-IC-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP3]], i32 1
-; UNROLL-NO-IC-NEXT:    [[TMP8:%.*]] = load i32, i32* [[TMP4]], align 8
-; UNROLL-NO-IC-NEXT:    [[TMP9:%.*]] = load i32, i32* [[TMP5]], align 8
+; UNROLL-NO-IC-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], ptr [[P:%.*]], i64 [[TMP0]], i32 1
+; UNROLL-NO-IC-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP1]], i32 1
+; UNROLL-NO-IC-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP2]], i32 1
+; UNROLL-NO-IC-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP3]], i32 1
+; UNROLL-NO-IC-NEXT:    [[TMP8:%.*]] = load i32, ptr [[TMP4]], align 8
+; UNROLL-NO-IC-NEXT:    [[TMP9:%.*]] = load i32, ptr [[TMP5]], align 8
 ; UNROLL-NO-IC-NEXT:    [[TMP10:%.*]] = insertelement <2 x i32> poison, i32 [[TMP8]], i32 0
 ; UNROLL-NO-IC-NEXT:    [[TMP11:%.*]] = insertelement <2 x i32> [[TMP10]], i32 [[TMP9]], i32 1
-; UNROLL-NO-IC-NEXT:    [[TMP12:%.*]] = load i32, i32* [[TMP6]], align 8
-; UNROLL-NO-IC-NEXT:    [[TMP13:%.*]] = load i32, i32* [[TMP7]], align 8
+; UNROLL-NO-IC-NEXT:    [[TMP12:%.*]] = load i32, ptr [[TMP6]], align 8
+; UNROLL-NO-IC-NEXT:    [[TMP13:%.*]] = load i32, ptr [[TMP7]], align 8
 ; UNROLL-NO-IC-NEXT:    [[TMP14:%.*]] = insertelement <2 x i32> poison, i32 [[TMP12]], i32 0
 ; UNROLL-NO-IC-NEXT:    [[TMP15:%.*]] = insertelement <2 x i32> [[TMP14]], i32 [[TMP13]], i32 1
 ; UNROLL-NO-IC-NEXT:    [[TMP16:%.*]] = xor <2 x i32> [[TMP11]], [[BROADCAST_SPLAT]]
 ; UNROLL-NO-IC-NEXT:    [[TMP17:%.*]] = xor <2 x i32> [[TMP15]], [[BROADCAST_SPLAT2]]
 ; UNROLL-NO-IC-NEXT:    [[TMP18:%.*]] = extractelement <2 x i32> [[TMP16]], i32 0
-; UNROLL-NO-IC-NEXT:    store i32 [[TMP18]], i32* [[TMP4]], align 8
+; UNROLL-NO-IC-NEXT:    store i32 [[TMP18]], ptr [[TMP4]], align 8
 ; UNROLL-NO-IC-NEXT:    [[TMP19:%.*]] = extractelement <2 x i32> [[TMP16]], i32 1
-; UNROLL-NO-IC-NEXT:    store i32 [[TMP19]], i32* [[TMP5]], align 8
+; UNROLL-NO-IC-NEXT:    store i32 [[TMP19]], ptr [[TMP5]], align 8
 ; UNROLL-NO-IC-NEXT:    [[TMP20:%.*]] = extractelement <2 x i32> [[TMP17]], i32 0
-; UNROLL-NO-IC-NEXT:    store i32 [[TMP20]], i32* [[TMP6]], align 8
+; UNROLL-NO-IC-NEXT:    store i32 [[TMP20]], ptr [[TMP6]], align 8
 ; UNROLL-NO-IC-NEXT:    [[TMP21:%.*]] = extractelement <2 x i32> [[TMP17]], i32 1
-; UNROLL-NO-IC-NEXT:    store i32 [[TMP21]], i32* [[TMP7]], align 8
+; UNROLL-NO-IC-NEXT:    store i32 [[TMP21]], ptr [[TMP7]], align 8
 ; UNROLL-NO-IC-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; UNROLL-NO-IC-NEXT:    [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; UNROLL-NO-IC-NEXT:    br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
@@ -1495,10 +1458,10 @@ define void @scalarize_induction_variable_03(%pair.i32 *%p, i32 %y, i64 %n) {
 ; UNROLL-NO-IC-NEXT:    br label [[FOR_BODY:%.*]]
 ; UNROLL-NO-IC:       for.body:
 ; UNROLL-NO-IC-NEXT:    [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; UNROLL-NO-IC-NEXT:    [[F:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[I]], i32 1
-; UNROLL-NO-IC-NEXT:    [[TMP23:%.*]] = load i32, i32* [[F]], align 8
+; UNROLL-NO-IC-NEXT:    [[F:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[I]], i32 1
+; UNROLL-NO-IC-NEXT:    [[TMP23:%.*]] = load i32, ptr [[F]], align 8
 ; UNROLL-NO-IC-NEXT:    [[TMP24:%.*]] = xor i32 [[TMP23]], [[Y]]
-; UNROLL-NO-IC-NEXT:    store i32 [[TMP24]], i32* [[F]], align 8
+; UNROLL-NO-IC-NEXT:    store i32 [[TMP24]], ptr [[F]], align 8
 ; UNROLL-NO-IC-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 1
 ; UNROLL-NO-IC-NEXT:    [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
 ; UNROLL-NO-IC-NEXT:    br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP16:![0-9]+]]
@@ -1529,41 +1492,39 @@ define void @scalarize_induction_variable_03(%pair.i32 *%p, i32 %y, i64 %n) {
 ; INTERLEAVE-NEXT:    [[TMP6:%.*]] = or i64 [[INDEX]], 5
 ; INTERLEAVE-NEXT:    [[TMP7:%.*]] = or i64 [[INDEX]], 6
 ; INTERLEAVE-NEXT:    [[TMP8:%.*]] = or i64 [[INDEX]], 7
-; INTERLEAVE-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], %pair.i32* [[P:%.*]], i64 [[INDEX]], i32 1
-; INTERLEAVE-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP2]], i32 1
-; INTERLEAVE-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP3]], i32 1
-; INTERLEAVE-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP4]], i32 1
-; INTERLEAVE-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP5]], i32 1
-; INTERLEAVE-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP6]], i32 1
-; INTERLEAVE-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP7]], i32 1
-; INTERLEAVE-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP8]], i32 1
-; INTERLEAVE-NEXT:    [[TMP17:%.*]] = bitcast i32* [[TMP9]] to <8 x i32>*
-; INTERLEAVE-NEXT:    [[TMP18:%.*]] = bitcast i32* [[TMP13]] to <8 x i32>*
-; INTERLEAVE-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i32>, <8 x i32>* [[TMP17]], align 8
-; INTERLEAVE-NEXT:    [[WIDE_VEC1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP18]], align 8
+; INTERLEAVE-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], ptr [[P:%.*]], i64 [[INDEX]], i32 1
+; INTERLEAVE-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP2]], i32 1
+; INTERLEAVE-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP3]], i32 1
+; INTERLEAVE-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP4]], i32 1
+; INTERLEAVE-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP5]], i32 1
+; INTERLEAVE-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP6]], i32 1
+; INTERLEAVE-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP7]], i32 1
+; INTERLEAVE-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP8]], i32 1
+; INTERLEAVE-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i32>, ptr [[TMP9]], align 8
+; INTERLEAVE-NEXT:    [[WIDE_VEC1:%.*]] = load <8 x i32>, ptr [[TMP13]], align 8
 ; INTERLEAVE-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
 ; INTERLEAVE-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <8 x i32> [[WIDE_VEC1]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-; INTERLEAVE-NEXT:    [[TMP19:%.*]] = xor <4 x i32> [[STRIDED_VEC]], [[BROADCAST_SPLAT]]
-; INTERLEAVE-NEXT:    [[TMP20:%.*]] = xor <4 x i32> [[STRIDED_VEC2]], [[BROADCAST_SPLAT4]]
-; INTERLEAVE-NEXT:    [[TMP21:%.*]] = extractelement <4 x i32> [[TMP19]], i64 0
-; INTERLEAVE-NEXT:    store i32 [[TMP21]], i32* [[TMP9]], align 8
-; INTERLEAVE-NEXT:    [[TMP22:%.*]] = extractelement <4 x i32> [[TMP19]], i64 1
-; INTERLEAVE-NEXT:    store i32 [[TMP22]], i32* [[TMP10]], align 8
-; INTERLEAVE-NEXT:    [[TMP23:%.*]] = extractelement <4 x i32> [[TMP19]], i64 2
-; INTERLEAVE-NEXT:    store i32 [[TMP23]], i32* [[TMP11]], align 8
-; INTERLEAVE-NEXT:    [[TMP24:%.*]] = extractelement <4 x i32> [[TMP19]], i64 3
-; INTERLEAVE-NEXT:    store i32 [[TMP24]], i32* [[TMP12]], align 8
-; INTERLEAVE-NEXT:    [[TMP25:%.*]] = extractelement <4 x i32> [[TMP20]], i64 0
-; INTERLEAVE-NEXT:    store i32 [[TMP25]], i32* [[TMP13]], align 8
-; INTERLEAVE-NEXT:    [[TMP26:%.*]] = extractelement <4 x i32> [[TMP20]], i64 1
-; INTERLEAVE-NEXT:    store i32 [[TMP26]], i32* [[TMP14]], align 8
-; INTERLEAVE-NEXT:    [[TMP27:%.*]] = extractelement <4 x i32> [[TMP20]], i64 2
-; INTERLEAVE-NEXT:    store i32 [[TMP27]], i32* [[TMP15]], align 8
-; INTERLEAVE-NEXT:    [[TMP28:%.*]] = extractelement <4 x i32> [[TMP20]], i64 3
-; INTERLEAVE-NEXT:    store i32 [[TMP28]], i32* [[TMP16]], align 8
+; INTERLEAVE-NEXT:    [[TMP17:%.*]] = xor <4 x i32> [[STRIDED_VEC]], [[BROADCAST_SPLAT]]
+; INTERLEAVE-NEXT:    [[TMP18:%.*]] = xor <4 x i32> [[STRIDED_VEC2]], [[BROADCAST_SPLAT4]]
+; INTERLEAVE-NEXT:    [[TMP19:%.*]] = extractelement <4 x i32> [[TMP17]], i64 0
+; INTERLEAVE-NEXT:    store i32 [[TMP19]], ptr [[TMP9]], align 8
+; INTERLEAVE-NEXT:    [[TMP20:%.*]] = extractelement <4 x i32> [[TMP17]], i64 1
+; INTERLEAVE-NEXT:    store i32 [[TMP20]], ptr [[TMP10]], align 8
+; INTERLEAVE-NEXT:    [[TMP21:%.*]] = extractelement <4 x i32> [[TMP17]], i64 2
+; INTERLEAVE-NEXT:    store i32 [[TMP21]], ptr [[TMP11]], align 8
+; INTERLEAVE-NEXT:    [[TMP22:%.*]] = extractelement <4 x i32> [[TMP17]], i64 3
+; INTERLEAVE-NEXT:    store i32 [[TMP22]], ptr [[TMP12]], align 8
+; INTERLEAVE-NEXT:    [[TMP23:%.*]] = extractelement <4 x i32> [[TMP18]], i64 0
+; INTERLEAVE-NEXT:    store i32 [[TMP23]], ptr [[TMP13]], align 8
+; INTERLEAVE-NEXT:    [[TMP24:%.*]] = extractelement <4 x i32> [[TMP18]], i64 1
+; INTERLEAVE-NEXT:    store i32 [[TMP24]], ptr [[TMP14]], align 8
+; INTERLEAVE-NEXT:    [[TMP25:%.*]] = extractelement <4 x i32> [[TMP18]], i64 2
+; INTERLEAVE-NEXT:    store i32 [[TMP25]], ptr [[TMP15]], align 8
+; INTERLEAVE-NEXT:    [[TMP26:%.*]] = extractelement <4 x i32> [[TMP18]], i64 3
+; INTERLEAVE-NEXT:    store i32 [[TMP26]], ptr [[TMP16]], align 8
 ; INTERLEAVE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; INTERLEAVE-NEXT:    [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; INTERLEAVE-NEXT:    br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
+; INTERLEAVE-NEXT:    [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; INTERLEAVE-NEXT:    br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
 ; INTERLEAVE:       middle.block:
 ; INTERLEAVE-NEXT:    br label [[SCALAR_PH]]
 ; INTERLEAVE:       scalar.ph:
@@ -1571,10 +1532,10 @@ define void @scalarize_induction_variable_03(%pair.i32 *%p, i32 %y, i64 %n) {
 ; INTERLEAVE-NEXT:    br label [[FOR_BODY:%.*]]
 ; INTERLEAVE:       for.body:
 ; INTERLEAVE-NEXT:    [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; INTERLEAVE-NEXT:    [[F:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[I]], i32 1
-; INTERLEAVE-NEXT:    [[TMP30:%.*]] = load i32, i32* [[F]], align 8
-; INTERLEAVE-NEXT:    [[TMP31:%.*]] = xor i32 [[TMP30]], [[Y]]
-; INTERLEAVE-NEXT:    store i32 [[TMP31]], i32* [[F]], align 8
+; INTERLEAVE-NEXT:    [[F:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[I]], i32 1
+; INTERLEAVE-NEXT:    [[TMP28:%.*]] = load i32, ptr [[F]], align 8
+; INTERLEAVE-NEXT:    [[TMP29:%.*]] = xor i32 [[TMP28]], [[Y]]
+; INTERLEAVE-NEXT:    store i32 [[TMP29]], ptr [[F]], align 8
 ; INTERLEAVE-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 1
 ; INTERLEAVE-NEXT:    [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
 ; INTERLEAVE-NEXT:    br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END:%.*]], !llvm.loop [[LOOP16:![0-9]+]]
@@ -1586,10 +1547,10 @@ entry:
 
 for.body:
   %i  = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
-  %f = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 1
-  %0 = load i32, i32* %f, align 8
+  %f = getelementptr inbounds %pair.i32, ptr %p, i64 %i, i32 1
+  %0 = load i32, ptr %f, align 8
   %1 = xor i32 %0, %y
-  store i32 %1, i32* %f, align 8
+  store i32 %1, ptr %f, align 8
   %i.next = add nuw nsw i64 %i, 1
   %cond = icmp slt i64 %i.next, %n
   br i1 %cond, label %for.body, label %for.end
@@ -1605,29 +1566,26 @@ for.end:
 ;   p[i].f = a[i * 4]
 ;
 
-define void @scalarize_induction_variable_04(i32* %a, %pair.i32* %p, i32 %n) {
+define void @scalarize_induction_variable_04(ptr %a, ptr %p, i32 %n) {
 ; CHECK-LABEL: @scalarize_induction_variable_04(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[A4:%.*]] = bitcast i32* [[A:%.*]] to i8*
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[N:%.*]], -1
 ; CHECK-NEXT:    [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
 ; CHECK-NEXT:    [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 2
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
 ; CHECK:       vector.memcheck:
-; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr [[PAIR_I32:%.*]], %pair.i32* [[P:%.*]], i64 0, i32 1
-; CHECK-NEXT:    [[SCEVGEP1:%.*]] = bitcast i32* [[SCEVGEP]] to i8*
+; CHECK-NEXT:    [[UGLYGEP:%.*]] = getelementptr i8, ptr [[P:%.*]], i64 4
 ; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[N]], -1
 ; CHECK-NEXT:    [[TMP4:%.*]] = zext i32 [[TMP3]] to i64
-; CHECK-NEXT:    [[TMP5:%.*]] = add nuw nsw i64 [[TMP4]], 1
-; CHECK-NEXT:    [[SCEVGEP2:%.*]] = getelementptr [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP5]]
-; CHECK-NEXT:    [[SCEVGEP23:%.*]] = bitcast %pair.i32* [[SCEVGEP2]] to i8*
-; CHECK-NEXT:    [[TMP6:%.*]] = shl nuw nsw i64 [[TMP4]], 2
-; CHECK-NEXT:    [[TMP7:%.*]] = add nuw nsw i64 [[TMP6]], 1
-; CHECK-NEXT:    [[SCEVGEP5:%.*]] = getelementptr i32, i32* [[A]], i64 [[TMP7]]
-; CHECK-NEXT:    [[SCEVGEP56:%.*]] = bitcast i32* [[SCEVGEP5]] to i8*
-; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ult i8* [[SCEVGEP1]], [[SCEVGEP56]]
-; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ult i8* [[A4]], [[SCEVGEP23]]
+; CHECK-NEXT:    [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 3
+; CHECK-NEXT:    [[TMP6:%.*]] = add nuw nsw i64 [[TMP5]], 8
+; CHECK-NEXT:    [[UGLYGEP1:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP6]]
+; CHECK-NEXT:    [[TMP7:%.*]] = shl nuw nsw i64 [[TMP4]], 4
+; CHECK-NEXT:    [[TMP8:%.*]] = add nuw nsw i64 [[TMP7]], 4
+; CHECK-NEXT:    [[UGLYGEP2:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 [[TMP8]]
+; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ult ptr [[UGLYGEP]], [[UGLYGEP2]]
+; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ult ptr [[A]], [[UGLYGEP1]]
 ; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
 ; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
@@ -1637,23 +1595,23 @@ define void @scalarize_induction_variable_04(i32* %a, %pair.i32* %p, i32 %n) {
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP8:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX]], 1
-; CHECK-NEXT:    [[TMP10:%.*]] = shl nsw <2 x i64> [[VEC_IND]], <i64 2, i64 2>
-; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <2 x i64> [[TMP10]], i32 0
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP11]]
-; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <2 x i64> [[TMP10]], i32 1
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP13]]
-; CHECK-NEXT:    [[TMP15:%.*]] = load i32, i32* [[TMP12]], align 1, !alias.scope !17
-; CHECK-NEXT:    [[TMP16:%.*]] = load i32, i32* [[TMP14]], align 1, !alias.scope !17
-; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP8]], i32 1
-; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP9]], i32 1
-; CHECK-NEXT:    store i32 [[TMP15]], i32* [[TMP17]], align 1, !alias.scope !20, !noalias !17
-; CHECK-NEXT:    store i32 [[TMP16]], i32* [[TMP18]], align 1, !alias.scope !20, !noalias !17
+; CHECK-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP10:%.*]] = add i64 [[INDEX]], 1
+; CHECK-NEXT:    [[TMP11:%.*]] = shl nsw <2 x i64> [[VEC_IND]], <i64 2, i64 2>
+; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <2 x i64> [[TMP11]], i32 0
+; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP12]]
+; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <2 x i64> [[TMP11]], i32 1
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP14]]
+; CHECK-NEXT:    [[TMP16:%.*]] = load i32, ptr [[TMP13]], align 1, !alias.scope !17
+; CHECK-NEXT:    [[TMP17:%.*]] = load i32, ptr [[TMP15]], align 1, !alias.scope !17
+; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], ptr [[P]], i64 [[TMP9]], i32 1
+; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP10]], i32 1
+; CHECK-NEXT:    store i32 [[TMP16]], ptr [[TMP18]], align 1, !alias.scope !20, !noalias !17
+; CHECK-NEXT:    store i32 [[TMP17]], ptr [[TMP19]], align 1, !alias.scope !20, !noalias !17
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], <i64 2, i64 2>
-; CHECK-NEXT:    [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
+; CHECK-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -1662,14 +1620,14 @@ define void @scalarize_induction_variable_04(i32* %a, %pair.i32* %p, i32 %n) {
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[TMP20:%.*]] = shl nsw i64 [[I]], 2
-; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP20]]
-; CHECK-NEXT:    [[TMP22:%.*]] = load i32, i32* [[TMP21]], align 1
-; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[I]], i32 1
-; CHECK-NEXT:    store i32 [[TMP22]], i32* [[TMP23]], align 1
+; CHECK-NEXT:    [[TMP21:%.*]] = shl nsw i64 [[I]], 2
+; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP21]]
+; CHECK-NEXT:    [[TMP23:%.*]] = load i32, ptr [[TMP22]], align 1
+; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[I]], i32 1
+; CHECK-NEXT:    store i32 [[TMP23]], ptr [[TMP24]], align 1
 ; CHECK-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 1
-; CHECK-NEXT:    [[TMP24:%.*]] = trunc i64 [[I_NEXT]] to i32
-; CHECK-NEXT:    [[COND:%.*]] = icmp eq i32 [[TMP24]], [[N]]
+; CHECK-NEXT:    [[TMP25:%.*]] = trunc i64 [[I_NEXT]] to i32
+; CHECK-NEXT:    [[COND:%.*]] = icmp eq i32 [[TMP25]], [[N]]
 ; CHECK-NEXT:    br i1 [[COND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
 ; CHECK:       for.end:
 ; CHECK-NEXT:    ret void
@@ -1682,16 +1640,17 @@ define void @scalarize_induction_variable_04(i32* %a, %pair.i32* %p, i32 %n) {
 ; IND-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp eq i32 [[TMP0]], 0
 ; IND-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
 ; IND:       vector.memcheck:
-; IND-NEXT:    [[SCEVGEP:%.*]] = getelementptr [[PAIR_I32:%.*]], %pair.i32* [[P:%.*]], i64 0, i32 1
+; IND-NEXT:    [[UGLYGEP:%.*]] = getelementptr i8, ptr [[P:%.*]], i64 4
 ; IND-NEXT:    [[TMP3:%.*]] = add i32 [[N]], -1
 ; IND-NEXT:    [[TMP4:%.*]] = zext i32 [[TMP3]] to i64
-; IND-NEXT:    [[TMP5:%.*]] = add nuw nsw i64 [[TMP4]], 1
-; IND-NEXT:    [[TMP6:%.*]] = shl nuw nsw i64 [[TMP4]], 2
-; IND-NEXT:    [[TMP7:%.*]] = or i64 [[TMP6]], 1
-; IND-NEXT:    [[SCEVGEP5:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 [[TMP7]]
-; IND-NEXT:    [[BOUND0:%.*]] = icmp ult i32* [[SCEVGEP]], [[SCEVGEP5]]
-; IND-NEXT:    [[TMP8:%.*]] = getelementptr [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP5]], i32 0
-; IND-NEXT:    [[BOUND1:%.*]] = icmp ugt i32* [[TMP8]], [[A]]
+; IND-NEXT:    [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 3
+; IND-NEXT:    [[TMP6:%.*]] = add nuw nsw i64 [[TMP5]], 8
+; IND-NEXT:    [[UGLYGEP1:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP6]]
+; IND-NEXT:    [[TMP7:%.*]] = shl nuw nsw i64 [[TMP4]], 4
+; IND-NEXT:    [[TMP8:%.*]] = or i64 [[TMP7]], 4
+; IND-NEXT:    [[UGLYGEP2:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 [[TMP8]]
+; IND-NEXT:    [[BOUND0:%.*]] = icmp ult ptr [[UGLYGEP]], [[UGLYGEP2]]
+; IND-NEXT:    [[BOUND1:%.*]] = icmp ugt ptr [[UGLYGEP1]], [[A]]
 ; IND-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
 ; IND-NEXT:    br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
 ; IND:       vector.ph:
@@ -1703,15 +1662,15 @@ define void @scalarize_induction_variable_04(i32* %a, %pair.i32* %p, i32 %n) {
 ; IND-NEXT:    [[TMP9:%.*]] = or i64 [[INDEX]], 1
 ; IND-NEXT:    [[TMP10:%.*]] = shl nsw <2 x i64> [[VEC_IND]], <i64 2, i64 2>
 ; IND-NEXT:    [[TMP11:%.*]] = extractelement <2 x i64> [[TMP10]], i64 0
-; IND-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP11]]
+; IND-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP11]]
 ; IND-NEXT:    [[TMP13:%.*]] = extractelement <2 x i64> [[TMP10]], i64 1
-; IND-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP13]]
-; IND-NEXT:    [[TMP15:%.*]] = load i32, i32* [[TMP12]], align 1, !alias.scope !17
-; IND-NEXT:    [[TMP16:%.*]] = load i32, i32* [[TMP14]], align 1, !alias.scope !17
-; IND-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[INDEX]], i32 1
-; IND-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP9]], i32 1
-; IND-NEXT:    store i32 [[TMP15]], i32* [[TMP17]], align 1, !alias.scope !20, !noalias !17
-; IND-NEXT:    store i32 [[TMP16]], i32* [[TMP18]], align 1, !alias.scope !20, !noalias !17
+; IND-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP13]]
+; IND-NEXT:    [[TMP15:%.*]] = load i32, ptr [[TMP12]], align 1, !alias.scope !17
+; IND-NEXT:    [[TMP16:%.*]] = load i32, ptr [[TMP14]], align 1, !alias.scope !17
+; IND-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], ptr [[P]], i64 [[INDEX]], i32 1
+; IND-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP9]], i32 1
+; IND-NEXT:    store i32 [[TMP15]], ptr [[TMP17]], align 1, !alias.scope !20, !noalias !17
+; IND-NEXT:    store i32 [[TMP16]], ptr [[TMP18]], align 1, !alias.scope !20, !noalias !17
 ; IND-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; IND-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], <i64 2, i64 2>
 ; IND-NEXT:    [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -1725,10 +1684,10 @@ define void @scalarize_induction_variable_04(i32* %a, %pair.i32* %p, i32 %n) {
 ; IND:       for.body:
 ; IND-NEXT:    [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; IND-NEXT:    [[TMP20:%.*]] = shl nsw i64 [[I]], 2
-; IND-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP20]]
-; IND-NEXT:    [[TMP22:%.*]] = load i32, i32* [[TMP21]], align 1
-; IND-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[I]], i32 1
-; IND-NEXT:    store i32 [[TMP22]], i32* [[TMP23]], align 1
+; IND-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP20]]
+; IND-NEXT:    [[TMP22:%.*]] = load i32, ptr [[TMP21]], align 1
+; IND-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[I]], i32 1
+; IND-NEXT:    store i32 [[TMP22]], ptr [[TMP23]], align 1
 ; IND-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 1
 ; IND-NEXT:    [[TMP24:%.*]] = trunc i64 [[I_NEXT]] to i32
 ; IND-NEXT:    [[COND:%.*]] = icmp eq i32 [[TMP24]], [[N]]
@@ -1744,16 +1703,17 @@ define void @scalarize_induction_variable_04(i32* %a, %pair.i32* %p, i32 %n) {
 ; UNROLL-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP0]], 3
 ; UNROLL-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
 ; UNROLL:       vector.memcheck:
-; UNROLL-NEXT:    [[SCEVGEP:%.*]] = getelementptr [[PAIR_I32:%.*]], %pair.i32* [[P:%.*]], i64 0, i32 1
+; UNROLL-NEXT:    [[UGLYGEP:%.*]] = getelementptr i8, ptr [[P:%.*]], i64 4
 ; UNROLL-NEXT:    [[TMP3:%.*]] = add i32 [[N]], -1
 ; UNROLL-NEXT:    [[TMP4:%.*]] = zext i32 [[TMP3]] to i64
-; UNROLL-NEXT:    [[TMP5:%.*]] = add nuw nsw i64 [[TMP4]], 1
-; UNROLL-NEXT:    [[TMP6:%.*]] = shl nuw nsw i64 [[TMP4]], 2
-; UNROLL-NEXT:    [[TMP7:%.*]] = or i64 [[TMP6]], 1
-; UNROLL-NEXT:    [[SCEVGEP5:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 [[TMP7]]
-; UNROLL-NEXT:    [[BOUND0:%.*]] = icmp ult i32* [[SCEVGEP]], [[SCEVGEP5]]
-; UNROLL-NEXT:    [[TMP8:%.*]] = getelementptr [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP5]], i32 0
-; UNROLL-NEXT:    [[BOUND1:%.*]] = icmp ugt i32* [[TMP8]], [[A]]
+; UNROLL-NEXT:    [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 3
+; UNROLL-NEXT:    [[TMP6:%.*]] = add nuw nsw i64 [[TMP5]], 8
+; UNROLL-NEXT:    [[UGLYGEP1:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP6]]
+; UNROLL-NEXT:    [[TMP7:%.*]] = shl nuw nsw i64 [[TMP4]], 4
+; UNROLL-NEXT:    [[TMP8:%.*]] = or i64 [[TMP7]], 4
+; UNROLL-NEXT:    [[UGLYGEP2:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 [[TMP8]]
+; UNROLL-NEXT:    [[BOUND0:%.*]] = icmp ult ptr [[UGLYGEP]], [[UGLYGEP2]]
+; UNROLL-NEXT:    [[BOUND1:%.*]] = icmp ugt ptr [[UGLYGEP1]], [[A]]
 ; UNROLL-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
 ; UNROLL-NEXT:    br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
 ; UNROLL:       vector.ph:
@@ -1769,25 +1729,25 @@ define void @scalarize_induction_variable_04(i32* %a, %pair.i32* %p, i32 %n) {
 ; UNROLL-NEXT:    [[STEP_ADD:%.*]] = shl <2 x i64> [[VEC_IND]], <i64 2, i64 2>
 ; UNROLL-NEXT:    [[TMP13:%.*]] = add <2 x i64> [[STEP_ADD]], <i64 8, i64 8>
 ; UNROLL-NEXT:    [[TMP14:%.*]] = extractelement <2 x i64> [[TMP12]], i64 0
-; UNROLL-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP14]]
+; UNROLL-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP14]]
 ; UNROLL-NEXT:    [[TMP16:%.*]] = extractelement <2 x i64> [[TMP12]], i64 1
-; UNROLL-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP16]]
+; UNROLL-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP16]]
 ; UNROLL-NEXT:    [[TMP18:%.*]] = extractelement <2 x i64> [[TMP13]], i64 0
-; UNROLL-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP18]]
+; UNROLL-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP18]]
 ; UNROLL-NEXT:    [[TMP20:%.*]] = extractelement <2 x i64> [[TMP13]], i64 1
-; UNROLL-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP20]]
-; UNROLL-NEXT:    [[TMP22:%.*]] = load i32, i32* [[TMP15]], align 1, !alias.scope !17
-; UNROLL-NEXT:    [[TMP23:%.*]] = load i32, i32* [[TMP17]], align 1, !alias.scope !17
-; UNROLL-NEXT:    [[TMP24:%.*]] = load i32, i32* [[TMP19]], align 1, !alias.scope !17
-; UNROLL-NEXT:    [[TMP25:%.*]] = load i32, i32* [[TMP21]], align 1, !alias.scope !17
-; UNROLL-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[INDEX]], i32 1
-; UNROLL-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP9]], i32 1
-; UNROLL-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP10]], i32 1
-; UNROLL-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP11]], i32 1
-; UNROLL-NEXT:    store i32 [[TMP22]], i32* [[TMP26]], align 1, !alias.scope !20, !noalias !17
-; UNROLL-NEXT:    store i32 [[TMP23]], i32* [[TMP27]], align 1, !alias.scope !20, !noalias !17
-; UNROLL-NEXT:    store i32 [[TMP24]], i32* [[TMP28]], align 1, !alias.scope !20, !noalias !17
-; UNROLL-NEXT:    store i32 [[TMP25]], i32* [[TMP29]], align 1, !alias.scope !20, !noalias !17
+; UNROLL-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP20]]
+; UNROLL-NEXT:    [[TMP22:%.*]] = load i32, ptr [[TMP15]], align 1, !alias.scope !17
+; UNROLL-NEXT:    [[TMP23:%.*]] = load i32, ptr [[TMP17]], align 1, !alias.scope !17
+; UNROLL-NEXT:    [[TMP24:%.*]] = load i32, ptr [[TMP19]], align 1, !alias.scope !17
+; UNROLL-NEXT:    [[TMP25:%.*]] = load i32, ptr [[TMP21]], align 1, !alias.scope !17
+; UNROLL-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], ptr [[P]], i64 [[INDEX]], i32 1
+; UNROLL-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP9]], i32 1
+; UNROLL-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP10]], i32 1
+; UNROLL-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP11]], i32 1
+; UNROLL-NEXT:    store i32 [[TMP22]], ptr [[TMP26]], align 1, !alias.scope !20, !noalias !17
+; UNROLL-NEXT:    store i32 [[TMP23]], ptr [[TMP27]], align 1, !alias.scope !20, !noalias !17
+; UNROLL-NEXT:    store i32 [[TMP24]], ptr [[TMP28]], align 1, !alias.scope !20, !noalias !17
+; UNROLL-NEXT:    store i32 [[TMP25]], ptr [[TMP29]], align 1, !alias.scope !20, !noalias !17
 ; UNROLL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; UNROLL-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], <i64 4, i64 4>
 ; UNROLL-NEXT:    [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -1801,10 +1761,10 @@ define void @scalarize_induction_variable_04(i32* %a, %pair.i32* %p, i32 %n) {
 ; UNROLL:       for.body:
 ; UNROLL-NEXT:    [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; UNROLL-NEXT:    [[TMP31:%.*]] = shl nsw i64 [[I]], 2
-; UNROLL-NEXT:    [[TMP32:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP31]]
-; UNROLL-NEXT:    [[TMP33:%.*]] = load i32, i32* [[TMP32]], align 1
-; UNROLL-NEXT:    [[TMP34:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[I]], i32 1
-; UNROLL-NEXT:    store i32 [[TMP33]], i32* [[TMP34]], align 1
+; UNROLL-NEXT:    [[TMP32:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP31]]
+; UNROLL-NEXT:    [[TMP33:%.*]] = load i32, ptr [[TMP32]], align 1
+; UNROLL-NEXT:    [[TMP34:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[I]], i32 1
+; UNROLL-NEXT:    store i32 [[TMP33]], ptr [[TMP34]], align 1
 ; UNROLL-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 1
 ; UNROLL-NEXT:    [[TMP35:%.*]] = trunc i64 [[I_NEXT]] to i32
 ; UNROLL-NEXT:    [[COND:%.*]] = icmp eq i32 [[TMP35]], [[N]]
@@ -1814,26 +1774,23 @@ define void @scalarize_induction_variable_04(i32* %a, %pair.i32* %p, i32 %n) {
 ;
 ; UNROLL-NO-IC-LABEL: @scalarize_induction_variable_04(
 ; UNROLL-NO-IC-NEXT:  entry:
-; UNROLL-NO-IC-NEXT:    [[A4:%.*]] = bitcast i32* [[A:%.*]] to i8*
 ; UNROLL-NO-IC-NEXT:    [[TMP0:%.*]] = add i32 [[N:%.*]], -1
 ; UNROLL-NO-IC-NEXT:    [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
 ; UNROLL-NO-IC-NEXT:    [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
 ; UNROLL-NO-IC-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 4
 ; UNROLL-NO-IC-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
 ; UNROLL-NO-IC:       vector.memcheck:
-; UNROLL-NO-IC-NEXT:    [[SCEVGEP:%.*]] = getelementptr [[PAIR_I32:%.*]], %pair.i32* [[P:%.*]], i64 0, i32 1
-; UNROLL-NO-IC-NEXT:    [[SCEVGEP1:%.*]] = bitcast i32* [[SCEVGEP]] to i8*
+; UNROLL-NO-IC-NEXT:    [[UGLYGEP:%.*]] = getelementptr i8, ptr [[P:%.*]], i64 4
 ; UNROLL-NO-IC-NEXT:    [[TMP3:%.*]] = add i32 [[N]], -1
 ; UNROLL-NO-IC-NEXT:    [[TMP4:%.*]] = zext i32 [[TMP3]] to i64
-; UNROLL-NO-IC-NEXT:    [[TMP5:%.*]] = add nuw nsw i64 [[TMP4]], 1
-; UNROLL-NO-IC-NEXT:    [[SCEVGEP2:%.*]] = getelementptr [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP5]]
-; UNROLL-NO-IC-NEXT:    [[SCEVGEP23:%.*]] = bitcast %pair.i32* [[SCEVGEP2]] to i8*
-; UNROLL-NO-IC-NEXT:    [[TMP6:%.*]] = shl nuw nsw i64 [[TMP4]], 2
-; UNROLL-NO-IC-NEXT:    [[TMP7:%.*]] = add nuw nsw i64 [[TMP6]], 1
-; UNROLL-NO-IC-NEXT:    [[SCEVGEP5:%.*]] = getelementptr i32, i32* [[A]], i64 [[TMP7]]
-; UNROLL-NO-IC-NEXT:    [[SCEVGEP56:%.*]] = bitcast i32* [[SCEVGEP5]] to i8*
-; UNROLL-NO-IC-NEXT:    [[BOUND0:%.*]] = icmp ult i8* [[SCEVGEP1]], [[SCEVGEP56]]
-; UNROLL-NO-IC-NEXT:    [[BOUND1:%.*]] = icmp ult i8* [[A4]], [[SCEVGEP23]]
+; UNROLL-NO-IC-NEXT:    [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 3
+; UNROLL-NO-IC-NEXT:    [[TMP6:%.*]] = add nuw nsw i64 [[TMP5]], 8
+; UNROLL-NO-IC-NEXT:    [[UGLYGEP1:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP6]]
+; UNROLL-NO-IC-NEXT:    [[TMP7:%.*]] = shl nuw nsw i64 [[TMP4]], 4
+; UNROLL-NO-IC-NEXT:    [[TMP8:%.*]] = add nuw nsw i64 [[TMP7]], 4
+; UNROLL-NO-IC-NEXT:    [[UGLYGEP2:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 [[TMP8]]
+; UNROLL-NO-IC-NEXT:    [[BOUND0:%.*]] = icmp ult ptr [[UGLYGEP]], [[UGLYGEP2]]
+; UNROLL-NO-IC-NEXT:    [[BOUND1:%.*]] = icmp ult ptr [[A]], [[UGLYGEP1]]
 ; UNROLL-NO-IC-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
 ; UNROLL-NO-IC-NEXT:    br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
 ; UNROLL-NO-IC:       vector.ph:
@@ -1844,36 +1801,36 @@ define void @scalarize_induction_variable_04(i32* %a, %pair.i32* %p, i32 %n) {
 ; UNROLL-NO-IC-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; UNROLL-NO-IC-NEXT:    [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; UNROLL-NO-IC-NEXT:    [[STEP_ADD:%.*]] = add <2 x i64> [[VEC_IND]], <i64 2, i64 2>
-; UNROLL-NO-IC-NEXT:    [[TMP8:%.*]] = add i64 [[INDEX]], 0
-; UNROLL-NO-IC-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX]], 1
-; UNROLL-NO-IC-NEXT:    [[TMP10:%.*]] = add i64 [[INDEX]], 2
-; UNROLL-NO-IC-NEXT:    [[TMP11:%.*]] = add i64 [[INDEX]], 3
-; UNROLL-NO-IC-NEXT:    [[TMP12:%.*]] = shl nsw <2 x i64> [[VEC_IND]], <i64 2, i64 2>
-; UNROLL-NO-IC-NEXT:    [[TMP13:%.*]] = shl nsw <2 x i64> [[STEP_ADD]], <i64 2, i64 2>
-; UNROLL-NO-IC-NEXT:    [[TMP14:%.*]] = extractelement <2 x i64> [[TMP12]], i32 0
-; UNROLL-NO-IC-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP14]]
-; UNROLL-NO-IC-NEXT:    [[TMP16:%.*]] = extractelement <2 x i64> [[TMP12]], i32 1
-; UNROLL-NO-IC-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP16]]
-; UNROLL-NO-IC-NEXT:    [[TMP18:%.*]] = extractelement <2 x i64> [[TMP13]], i32 0
-; UNROLL-NO-IC-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP18]]
-; UNROLL-NO-IC-NEXT:    [[TMP20:%.*]] = extractelement <2 x i64> [[TMP13]], i32 1
-; UNROLL-NO-IC-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP20]]
-; UNROLL-NO-IC-NEXT:    [[TMP22:%.*]] = load i32, i32* [[TMP15]], align 1, !alias.scope !17
-; UNROLL-NO-IC-NEXT:    [[TMP23:%.*]] = load i32, i32* [[TMP17]], align 1, !alias.scope !17
-; UNROLL-NO-IC-NEXT:    [[TMP24:%.*]] = load i32, i32* [[TMP19]], align 1, !alias.scope !17
-; UNROLL-NO-IC-NEXT:    [[TMP25:%.*]] = load i32, i32* [[TMP21]], align 1, !alias.scope !17
-; UNROLL-NO-IC-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP8]], i32 1
-; UNROLL-NO-IC-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP9]], i32 1
-; UNROLL-NO-IC-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP10]], i32 1
-; UNROLL-NO-IC-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP11]], i32 1
-; UNROLL-NO-IC-NEXT:    store i32 [[TMP22]], i32* [[TMP26]], align 1, !alias.scope !20, !noalias !17
-; UNROLL-NO-IC-NEXT:    store i32 [[TMP23]], i32* [[TMP27]], align 1, !alias.scope !20, !noalias !17
-; UNROLL-NO-IC-NEXT:    store i32 [[TMP24]], i32* [[TMP28]], align 1, !alias.scope !20, !noalias !17
-; UNROLL-NO-IC-NEXT:    store i32 [[TMP25]], i32* [[TMP29]], align 1, !alias.scope !20, !noalias !17
+; UNROLL-NO-IC-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX]], 0
+; UNROLL-NO-IC-NEXT:    [[TMP10:%.*]] = add i64 [[INDEX]], 1
+; UNROLL-NO-IC-NEXT:    [[TMP11:%.*]] = add i64 [[INDEX]], 2
+; UNROLL-NO-IC-NEXT:    [[TMP12:%.*]] = add i64 [[INDEX]], 3
+; UNROLL-NO-IC-NEXT:    [[TMP13:%.*]] = shl nsw <2 x i64> [[VEC_IND]], <i64 2, i64 2>
+; UNROLL-NO-IC-NEXT:    [[TMP14:%.*]] = shl nsw <2 x i64> [[STEP_ADD]], <i64 2, i64 2>
+; UNROLL-NO-IC-NEXT:    [[TMP15:%.*]] = extractelement <2 x i64> [[TMP13]], i32 0
+; UNROLL-NO-IC-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP15]]
+; UNROLL-NO-IC-NEXT:    [[TMP17:%.*]] = extractelement <2 x i64> [[TMP13]], i32 1
+; UNROLL-NO-IC-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP17]]
+; UNROLL-NO-IC-NEXT:    [[TMP19:%.*]] = extractelement <2 x i64> [[TMP14]], i32 0
+; UNROLL-NO-IC-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP19]]
+; UNROLL-NO-IC-NEXT:    [[TMP21:%.*]] = extractelement <2 x i64> [[TMP14]], i32 1
+; UNROLL-NO-IC-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP21]]
+; UNROLL-NO-IC-NEXT:    [[TMP23:%.*]] = load i32, ptr [[TMP16]], align 1, !alias.scope !17
+; UNROLL-NO-IC-NEXT:    [[TMP24:%.*]] = load i32, ptr [[TMP18]], align 1, !alias.scope !17
+; UNROLL-NO-IC-NEXT:    [[TMP25:%.*]] = load i32, ptr [[TMP20]], align 1, !alias.scope !17
+; UNROLL-NO-IC-NEXT:    [[TMP26:%.*]] = load i32, ptr [[TMP22]], align 1, !alias.scope !17
+; UNROLL-NO-IC-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], ptr [[P]], i64 [[TMP9]], i32 1
+; UNROLL-NO-IC-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP10]], i32 1
+; UNROLL-NO-IC-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP11]], i32 1
+; UNROLL-NO-IC-NEXT:    [[TMP30:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP12]], i32 1
+; UNROLL-NO-IC-NEXT:    store i32 [[TMP23]], ptr [[TMP27]], align 1, !alias.scope !20, !noalias !17
+; UNROLL-NO-IC-NEXT:    store i32 [[TMP24]], ptr [[TMP28]], align 1, !alias.scope !20, !noalias !17
+; UNROLL-NO-IC-NEXT:    store i32 [[TMP25]], ptr [[TMP29]], align 1, !alias.scope !20, !noalias !17
+; UNROLL-NO-IC-NEXT:    store i32 [[TMP26]], ptr [[TMP30]], align 1, !alias.scope !20, !noalias !17
 ; UNROLL-NO-IC-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; UNROLL-NO-IC-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[STEP_ADD]], <i64 2, i64 2>
-; UNROLL-NO-IC-NEXT:    [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; UNROLL-NO-IC-NEXT:    br i1 [[TMP30]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
+; UNROLL-NO-IC-NEXT:    [[TMP31:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; UNROLL-NO-IC-NEXT:    br i1 [[TMP31]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
 ; UNROLL-NO-IC:       middle.block:
 ; UNROLL-NO-IC-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
 ; UNROLL-NO-IC-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -1882,14 +1839,14 @@ define void @scalarize_induction_variable_04(i32* %a, %pair.i32* %p, i32 %n) {
 ; UNROLL-NO-IC-NEXT:    br label [[FOR_BODY:%.*]]
 ; UNROLL-NO-IC:       for.body:
 ; UNROLL-NO-IC-NEXT:    [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; UNROLL-NO-IC-NEXT:    [[TMP31:%.*]] = shl nsw i64 [[I]], 2
-; UNROLL-NO-IC-NEXT:    [[TMP32:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP31]]
-; UNROLL-NO-IC-NEXT:    [[TMP33:%.*]] = load i32, i32* [[TMP32]], align 1
-; UNROLL-NO-IC-NEXT:    [[TMP34:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[I]], i32 1
-; UNROLL-NO-IC-NEXT:    store i32 [[TMP33]], i32* [[TMP34]], align 1
+; UNROLL-NO-IC-NEXT:    [[TMP32:%.*]] = shl nsw i64 [[I]], 2
+; UNROLL-NO-IC-NEXT:    [[TMP33:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP32]]
+; UNROLL-NO-IC-NEXT:    [[TMP34:%.*]] = load i32, ptr [[TMP33]], align 1
+; UNROLL-NO-IC-NEXT:    [[TMP35:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[I]], i32 1
+; UNROLL-NO-IC-NEXT:    store i32 [[TMP34]], ptr [[TMP35]], align 1
 ; UNROLL-NO-IC-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 1
-; UNROLL-NO-IC-NEXT:    [[TMP35:%.*]] = trunc i64 [[I_NEXT]] to i32
-; UNROLL-NO-IC-NEXT:    [[COND:%.*]] = icmp eq i32 [[TMP35]], [[N]]
+; UNROLL-NO-IC-NEXT:    [[TMP36:%.*]] = trunc i64 [[I_NEXT]] to i32
+; UNROLL-NO-IC-NEXT:    [[COND:%.*]] = icmp eq i32 [[TMP36]], [[N]]
 ; UNROLL-NO-IC-NEXT:    br i1 [[COND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
 ; UNROLL-NO-IC:       for.end:
 ; UNROLL-NO-IC-NEXT:    ret void
@@ -1902,16 +1859,17 @@ define void @scalarize_induction_variable_04(i32* %a, %pair.i32* %p, i32 %n) {
 ; INTERLEAVE-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP0]], 8
 ; INTERLEAVE-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
 ; INTERLEAVE:       vector.memcheck:
-; INTERLEAVE-NEXT:    [[SCEVGEP:%.*]] = getelementptr [[PAIR_I32:%.*]], %pair.i32* [[P:%.*]], i64 0, i32 1
+; INTERLEAVE-NEXT:    [[UGLYGEP:%.*]] = getelementptr i8, ptr [[P:%.*]], i64 4
 ; INTERLEAVE-NEXT:    [[TMP3:%.*]] = add i32 [[N]], -1
 ; INTERLEAVE-NEXT:    [[TMP4:%.*]] = zext i32 [[TMP3]] to i64
-; INTERLEAVE-NEXT:    [[TMP5:%.*]] = add nuw nsw i64 [[TMP4]], 1
-; INTERLEAVE-NEXT:    [[TMP6:%.*]] = shl nuw nsw i64 [[TMP4]], 2
-; INTERLEAVE-NEXT:    [[TMP7:%.*]] = or i64 [[TMP6]], 1
-; INTERLEAVE-NEXT:    [[SCEVGEP5:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 [[TMP7]]
-; INTERLEAVE-NEXT:    [[BOUND0:%.*]] = icmp ult i32* [[SCEVGEP]], [[SCEVGEP5]]
-; INTERLEAVE-NEXT:    [[TMP8:%.*]] = getelementptr [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP5]], i32 0
-; INTERLEAVE-NEXT:    [[BOUND1:%.*]] = icmp ugt i32* [[TMP8]], [[A]]
+; INTERLEAVE-NEXT:    [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 3
+; INTERLEAVE-NEXT:    [[TMP6:%.*]] = add nuw nsw i64 [[TMP5]], 8
+; INTERLEAVE-NEXT:    [[UGLYGEP1:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP6]]
+; INTERLEAVE-NEXT:    [[TMP7:%.*]] = shl nuw nsw i64 [[TMP4]], 4
+; INTERLEAVE-NEXT:    [[TMP8:%.*]] = or i64 [[TMP7]], 4
+; INTERLEAVE-NEXT:    [[UGLYGEP2:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 [[TMP8]]
+; INTERLEAVE-NEXT:    [[BOUND0:%.*]] = icmp ult ptr [[UGLYGEP]], [[UGLYGEP2]]
+; INTERLEAVE-NEXT:    [[BOUND1:%.*]] = icmp ugt ptr [[UGLYGEP1]], [[A]]
 ; INTERLEAVE-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
 ; INTERLEAVE-NEXT:    br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
 ; INTERLEAVE:       vector.ph:
@@ -1931,39 +1889,37 @@ define void @scalarize_induction_variable_04(i32* %a, %pair.i32* %p, i32 %n) {
 ; INTERLEAVE-NEXT:    [[TMP17:%.*]] = or i64 [[INDEX]], 7
 ; INTERLEAVE-NEXT:    [[TMP18:%.*]] = shl nsw i64 [[INDEX]], 2
 ; INTERLEAVE-NEXT:    [[TMP19:%.*]] = shl nsw i64 [[TMP14]], 2
-; INTERLEAVE-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP18]]
-; INTERLEAVE-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP19]]
-; INTERLEAVE-NEXT:    [[TMP22:%.*]] = bitcast i32* [[TMP20]] to <16 x i32>*
-; INTERLEAVE-NEXT:    [[TMP23:%.*]] = bitcast i32* [[TMP21]] to <16 x i32>*
-; INTERLEAVE-NEXT:    [[WIDE_VEC:%.*]] = load <16 x i32>, <16 x i32>* [[TMP22]], align 1
-; INTERLEAVE-NEXT:    [[WIDE_VEC7:%.*]] = load <16 x i32>, <16 x i32>* [[TMP23]], align 1
-; INTERLEAVE-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[INDEX]], i32 1
-; INTERLEAVE-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP11]], i32 1
-; INTERLEAVE-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP12]], i32 1
-; INTERLEAVE-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP13]], i32 1
-; INTERLEAVE-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP14]], i32 1
-; INTERLEAVE-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP15]], i32 1
-; INTERLEAVE-NEXT:    [[TMP30:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP16]], i32 1
-; INTERLEAVE-NEXT:    [[TMP31:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP17]], i32 1
-; INTERLEAVE-NEXT:    [[TMP32:%.*]] = extractelement <16 x i32> [[WIDE_VEC]], i64 0
-; INTERLEAVE-NEXT:    store i32 [[TMP32]], i32* [[TMP24]], align 1, !alias.scope !17, !noalias !20
-; INTERLEAVE-NEXT:    [[TMP33:%.*]] = extractelement <16 x i32> [[WIDE_VEC]], i64 4
-; INTERLEAVE-NEXT:    store i32 [[TMP33]], i32* [[TMP25]], align 1, !alias.scope !17, !noalias !20
-; INTERLEAVE-NEXT:    [[TMP34:%.*]] = extractelement <16 x i32> [[WIDE_VEC]], i64 8
-; INTERLEAVE-NEXT:    store i32 [[TMP34]], i32* [[TMP26]], align 1, !alias.scope !17, !noalias !20
-; INTERLEAVE-NEXT:    [[TMP35:%.*]] = extractelement <16 x i32> [[WIDE_VEC]], i64 12
-; INTERLEAVE-NEXT:    store i32 [[TMP35]], i32* [[TMP27]], align 1, !alias.scope !17, !noalias !20
-; INTERLEAVE-NEXT:    [[TMP36:%.*]] = extractelement <16 x i32> [[WIDE_VEC7]], i64 0
-; INTERLEAVE-NEXT:    store i32 [[TMP36]], i32* [[TMP28]], align 1, !alias.scope !17, !noalias !20
-; INTERLEAVE-NEXT:    [[TMP37:%.*]] = extractelement <16 x i32> [[WIDE_VEC7]], i64 4
-; INTERLEAVE-NEXT:    store i32 [[TMP37]], i32* [[TMP29]], align 1, !alias.scope !17, !noalias !20
-; INTERLEAVE-NEXT:    [[TMP38:%.*]] = extractelement <16 x i32> [[WIDE_VEC7]], i64 8
-; INTERLEAVE-NEXT:    store i32 [[TMP38]], i32* [[TMP30]], align 1, !alias.scope !17, !noalias !20
-; INTERLEAVE-NEXT:    [[TMP39:%.*]] = extractelement <16 x i32> [[WIDE_VEC7]], i64 12
-; INTERLEAVE-NEXT:    store i32 [[TMP39]], i32* [[TMP31]], align 1, !alias.scope !17, !noalias !20
+; INTERLEAVE-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP18]]
+; INTERLEAVE-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP19]]
+; INTERLEAVE-NEXT:    [[WIDE_VEC:%.*]] = load <16 x i32>, ptr [[TMP20]], align 1
+; INTERLEAVE-NEXT:    [[WIDE_VEC3:%.*]] = load <16 x i32>, ptr [[TMP21]], align 1
+; INTERLEAVE-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], ptr [[P]], i64 [[INDEX]], i32 1
+; INTERLEAVE-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP11]], i32 1
+; INTERLEAVE-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP12]], i32 1
+; INTERLEAVE-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP13]], i32 1
+; INTERLEAVE-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP14]], i32 1
+; INTERLEAVE-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP15]], i32 1
+; INTERLEAVE-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP16]], i32 1
+; INTERLEAVE-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP17]], i32 1
+; INTERLEAVE-NEXT:    [[TMP30:%.*]] = extractelement <16 x i32> [[WIDE_VEC]], i64 0
+; INTERLEAVE-NEXT:    store i32 [[TMP30]], ptr [[TMP22]], align 1, !alias.scope !17, !noalias !20
+; INTERLEAVE-NEXT:    [[TMP31:%.*]] = extractelement <16 x i32> [[WIDE_VEC]], i64 4
+; INTERLEAVE-NEXT:    store i32 [[TMP31]], ptr [[TMP23]], align 1, !alias.scope !17, !noalias !20
+; INTERLEAVE-NEXT:    [[TMP32:%.*]] = extractelement <16 x i32> [[WIDE_VEC]], i64 8
+; INTERLEAVE-NEXT:    store i32 [[TMP32]], ptr [[TMP24]], align 1, !alias.scope !17, !noalias !20
+; INTERLEAVE-NEXT:    [[TMP33:%.*]] = extractelement <16 x i32> [[WIDE_VEC]], i64 12
+; INTERLEAVE-NEXT:    store i32 [[TMP33]], ptr [[TMP25]], align 1, !alias.scope !17, !noalias !20
+; INTERLEAVE-NEXT:    [[TMP34:%.*]] = extractelement <16 x i32> [[WIDE_VEC3]], i64 0
+; INTERLEAVE-NEXT:    store i32 [[TMP34]], ptr [[TMP26]], align 1, !alias.scope !17, !noalias !20
+; INTERLEAVE-NEXT:    [[TMP35:%.*]] = extractelement <16 x i32> [[WIDE_VEC3]], i64 4
+; INTERLEAVE-NEXT:    store i32 [[TMP35]], ptr [[TMP27]], align 1, !alias.scope !17, !noalias !20
+; INTERLEAVE-NEXT:    [[TMP36:%.*]] = extractelement <16 x i32> [[WIDE_VEC3]], i64 8
+; INTERLEAVE-NEXT:    store i32 [[TMP36]], ptr [[TMP28]], align 1, !alias.scope !17, !noalias !20
+; INTERLEAVE-NEXT:    [[TMP37:%.*]] = extractelement <16 x i32> [[WIDE_VEC3]], i64 12
+; INTERLEAVE-NEXT:    store i32 [[TMP37]], ptr [[TMP29]], align 1, !alias.scope !17, !noalias !20
 ; INTERLEAVE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; INTERLEAVE-NEXT:    [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; INTERLEAVE-NEXT:    br i1 [[TMP40]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
+; INTERLEAVE-NEXT:    [[TMP38:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; INTERLEAVE-NEXT:    br i1 [[TMP38]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
 ; INTERLEAVE:       middle.block:
 ; INTERLEAVE-NEXT:    br label [[SCALAR_PH]]
 ; INTERLEAVE:       scalar.ph:
@@ -1971,14 +1927,14 @@ define void @scalarize_induction_variable_04(i32* %a, %pair.i32* %p, i32 %n) {
 ; INTERLEAVE-NEXT:    br label [[FOR_BODY:%.*]]
 ; INTERLEAVE:       for.body:
 ; INTERLEAVE-NEXT:    [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; INTERLEAVE-NEXT:    [[TMP41:%.*]] = shl nsw i64 [[I]], 2
-; INTERLEAVE-NEXT:    [[TMP42:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP41]]
-; INTERLEAVE-NEXT:    [[TMP43:%.*]] = load i32, i32* [[TMP42]], align 1
-; INTERLEAVE-NEXT:    [[TMP44:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[I]], i32 1
-; INTERLEAVE-NEXT:    store i32 [[TMP43]], i32* [[TMP44]], align 1
+; INTERLEAVE-NEXT:    [[TMP39:%.*]] = shl nsw i64 [[I]], 2
+; INTERLEAVE-NEXT:    [[TMP40:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP39]]
+; INTERLEAVE-NEXT:    [[TMP41:%.*]] = load i32, ptr [[TMP40]], align 1
+; INTERLEAVE-NEXT:    [[TMP42:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[I]], i32 1
+; INTERLEAVE-NEXT:    store i32 [[TMP41]], ptr [[TMP42]], align 1
 ; INTERLEAVE-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 1
-; INTERLEAVE-NEXT:    [[TMP45:%.*]] = trunc i64 [[I_NEXT]] to i32
-; INTERLEAVE-NEXT:    [[COND:%.*]] = icmp eq i32 [[TMP45]], [[N]]
+; INTERLEAVE-NEXT:    [[TMP43:%.*]] = trunc i64 [[I_NEXT]] to i32
+; INTERLEAVE-NEXT:    [[COND:%.*]] = icmp eq i32 [[TMP43]], [[N]]
 ; INTERLEAVE-NEXT:    br i1 [[COND]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
 ; INTERLEAVE:       for.end:
 ; INTERLEAVE-NEXT:    ret void
@@ -1989,10 +1945,10 @@ entry:
 for.body:
   %i = phi i64 [ %i.next, %for.body ], [ 0, %entry]
   %0 = shl nsw i64 %i, 2
-  %1 = getelementptr inbounds i32, i32* %a, i64 %0
-  %2 = load i32, i32* %1, align 1
-  %3 = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 1
-  store i32 %2, i32* %3, align 1
+  %1 = getelementptr inbounds i32, ptr %a, i64 %0
+  %2 = load i32, ptr %1, align 1
+  %3 = getelementptr inbounds %pair.i32, ptr %p, i64 %i, i32 1
+  store i32 %2, ptr %3, align 1
   %i.next = add nuw nsw i64 %i, 1
   %4 = trunc i64 %i.next to i32
   %cond = icmp eq i32 %4, %n
@@ -2018,7 +1974,7 @@ for.end:
 ;
 ;
 
-define i32 @scalarize_induction_variable_05(i32* %a, i32 %x, i1 %c, i32 %n) {
+define i32 @scalarize_induction_variable_05(ptr %a, i32 %x, i1 %c, i32 %n) {
 ; CHECK-LABEL: @scalarize_induction_variable_05(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[SMAX:%.*]] = call i32 @llvm.smax.i32(i32 [[N:%.*]], i32 1)
@@ -2032,50 +1988,49 @@ define i32 @scalarize_induction_variable_05(i32* %a, i32 %x, i1 %c, i32 %n) {
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_UDIV_CONTINUE2:%.*]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[PRED_UDIV_CONTINUE2]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[PRED_UDIV_CONTINUE2]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <2 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i32>, <2 x i32>* [[TMP3]], align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT]], i32 0
-; CHECK-NEXT:    br i1 [[TMP4]], label [[PRED_UDIV_IF:%.*]], label [[PRED_UDIV_CONTINUE:%.*]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP2]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT]], i32 0
+; CHECK-NEXT:    br i1 [[TMP3]], label [[PRED_UDIV_IF:%.*]], label [[PRED_UDIV_CONTINUE:%.*]]
 ; CHECK:       pred.udiv.if:
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = udiv i32 [[TMP5]], [[TMP0]]
-; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x i32> poison, i32 [[TMP6]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = udiv i32 [[TMP4]], [[TMP0]]
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[TMP5]], i32 0
 ; CHECK-NEXT:    br label [[PRED_UDIV_CONTINUE]]
 ; CHECK:       pred.udiv.continue:
-; CHECK-NEXT:    [[TMP8:%.*]] = phi <2 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP7]], [[PRED_UDIV_IF]] ]
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT]], i32 1
-; CHECK-NEXT:    br i1 [[TMP9]], label [[PRED_UDIV_IF1:%.*]], label [[PRED_UDIV_CONTINUE2]]
+; CHECK-NEXT:    [[TMP7:%.*]] = phi <2 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP6]], [[PRED_UDIV_IF]] ]
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT]], i32 1
+; CHECK-NEXT:    br i1 [[TMP8]], label [[PRED_UDIV_IF1:%.*]], label [[PRED_UDIV_CONTINUE2]]
 ; CHECK:       pred.udiv.if1:
-; CHECK-NEXT:    [[TMP10:%.*]] = add i32 [[INDEX]], 1
-; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 1
-; CHECK-NEXT:    [[TMP12:%.*]] = udiv i32 [[TMP11]], [[TMP10]]
-; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <2 x i32> [[TMP8]], i32 [[TMP12]], i32 1
+; CHECK-NEXT:    [[TMP9:%.*]] = add i32 [[INDEX]], 1
+; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 1
+; CHECK-NEXT:    [[TMP11:%.*]] = udiv i32 [[TMP10]], [[TMP9]]
+; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <2 x i32> [[TMP7]], i32 [[TMP11]], i32 1
 ; CHECK-NEXT:    br label [[PRED_UDIV_CONTINUE2]]
 ; CHECK:       pred.udiv.continue2:
-; CHECK-NEXT:    [[TMP14:%.*]] = phi <2 x i32> [ [[TMP8]], [[PRED_UDIV_CONTINUE]] ], [ [[TMP13]], [[PRED_UDIV_IF1]] ]
-; CHECK-NEXT:    [[TMP15:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT]], <i1 true, i1 true>
-; CHECK-NEXT:    [[PREDPHI:%.*]] = select <2 x i1> [[TMP15]], <2 x i32> [[WIDE_LOAD]], <2 x i32> [[TMP14]]
-; CHECK-NEXT:    [[TMP16]] = add <2 x i32> [[PREDPHI]], [[VEC_PHI]]
+; CHECK-NEXT:    [[TMP13:%.*]] = phi <2 x i32> [ [[TMP7]], [[PRED_UDIV_CONTINUE]] ], [ [[TMP12]], [[PRED_UDIV_IF1]] ]
+; CHECK-NEXT:    [[TMP14:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT]], <i1 true, i1 true>
+; CHECK-NEXT:    [[PREDPHI:%.*]] = select <2 x i1> [[TMP14]], <2 x i32> [[WIDE_LOAD]], <2 x i32> [[TMP13]]
+; CHECK-NEXT:    [[TMP15]] = add <2 x i32> [[PREDPHI]], [[VEC_PHI]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP17:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
+; CHECK-NEXT:    [[TMP16:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP18:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[TMP16]])
+; CHECK-NEXT:    [[TMP17:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[TMP15]])
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[SMAX]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP18]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[I:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[IF_END:%.*]] ]
 ; CHECK-NEXT:    [[SUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[VAR4:%.*]], [[IF_END]] ]
-; CHECK-NEXT:    [[VAR0:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[I]]
-; CHECK-NEXT:    [[VAR1:%.*]] = load i32, i32* [[VAR0]], align 4
+; CHECK-NEXT:    [[VAR0:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[I]]
+; CHECK-NEXT:    [[VAR1:%.*]] = load i32, ptr [[VAR0]], align 4
 ; CHECK-NEXT:    br i1 [[C]], label [[IF_THEN:%.*]], label [[IF_END]]
 ; CHECK:       if.then:
 ; CHECK-NEXT:    [[VAR2:%.*]] = udiv i32 [[VAR1]], [[I]]
@@ -2087,7 +2042,7 @@ define i32 @scalarize_induction_variable_05(i32* %a, i32 %x, i1 %c, i32 %n) {
 ; CHECK-NEXT:    [[COND:%.*]] = icmp slt i32 [[I_NEXT]], [[N]]
 ; CHECK-NEXT:    br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP25:![0-9]+]]
 ; CHECK:       for.end:
-; CHECK-NEXT:    [[VAR5:%.*]] = phi i32 [ [[VAR4]], [[IF_END]] ], [ [[TMP18]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[VAR5:%.*]] = phi i32 [ [[VAR4]], [[IF_END]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    ret i32 [[VAR5]]
 ;
 ; IND-LABEL: @scalarize_induction_variable_05(
@@ -2101,49 +2056,48 @@ define i32 @scalarize_induction_variable_05(i32* %a, i32 %x, i1 %c, i32 %n) {
 ; IND-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; IND:       vector.body:
 ; IND-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_UDIV_CONTINUE2:%.*]] ]
-; IND-NEXT:    [[VEC_PHI:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[PRED_UDIV_CONTINUE2]] ]
+; IND-NEXT:    [[VEC_PHI:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[PRED_UDIV_CONTINUE2]] ]
 ; IND-NEXT:    [[TMP0:%.*]] = sext i32 [[INDEX]] to i64
-; IND-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP0]]
-; IND-NEXT:    [[TMP2:%.*]] = bitcast i32* [[TMP1]] to <2 x i32>*
-; IND-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i32>, <2 x i32>* [[TMP2]], align 4
+; IND-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP0]]
+; IND-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP1]], align 4
 ; IND-NEXT:    br i1 [[C]], label [[PRED_UDIV_IF:%.*]], label [[PRED_UDIV_CONTINUE:%.*]]
 ; IND:       pred.udiv.if:
-; IND-NEXT:    [[TMP3:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i64 0
-; IND-NEXT:    [[TMP4:%.*]] = udiv i32 [[TMP3]], [[INDEX]]
-; IND-NEXT:    [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[TMP4]], i64 0
+; IND-NEXT:    [[TMP2:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i64 0
+; IND-NEXT:    [[TMP3:%.*]] = udiv i32 [[TMP2]], [[INDEX]]
+; IND-NEXT:    [[TMP4:%.*]] = insertelement <2 x i32> poison, i32 [[TMP3]], i64 0
 ; IND-NEXT:    br label [[PRED_UDIV_CONTINUE]]
 ; IND:       pred.udiv.continue:
-; IND-NEXT:    [[TMP6:%.*]] = phi <2 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP5]], [[PRED_UDIV_IF]] ]
+; IND-NEXT:    [[TMP5:%.*]] = phi <2 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP4]], [[PRED_UDIV_IF]] ]
 ; IND-NEXT:    br i1 [[C]], label [[PRED_UDIV_IF1:%.*]], label [[PRED_UDIV_CONTINUE2]]
 ; IND:       pred.udiv.if1:
-; IND-NEXT:    [[TMP7:%.*]] = or i32 [[INDEX]], 1
-; IND-NEXT:    [[TMP8:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i64 1
-; IND-NEXT:    [[TMP9:%.*]] = udiv i32 [[TMP8]], [[TMP7]]
-; IND-NEXT:    [[TMP10:%.*]] = insertelement <2 x i32> [[TMP6]], i32 [[TMP9]], i64 1
+; IND-NEXT:    [[TMP6:%.*]] = or i32 [[INDEX]], 1
+; IND-NEXT:    [[TMP7:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i64 1
+; IND-NEXT:    [[TMP8:%.*]] = udiv i32 [[TMP7]], [[TMP6]]
+; IND-NEXT:    [[TMP9:%.*]] = insertelement <2 x i32> [[TMP5]], i32 [[TMP8]], i64 1
 ; IND-NEXT:    br label [[PRED_UDIV_CONTINUE2]]
 ; IND:       pred.udiv.continue2:
-; IND-NEXT:    [[TMP11:%.*]] = phi <2 x i32> [ [[TMP6]], [[PRED_UDIV_CONTINUE]] ], [ [[TMP10]], [[PRED_UDIV_IF1]] ]
-; IND-NEXT:    [[TMP12:%.*]] = xor <2 x i1> [[BROADCAST_SPLATINSERT]], <i1 true, i1 poison>
-; IND-NEXT:    [[TMP13:%.*]] = shufflevector <2 x i1> [[TMP12]], <2 x i1> poison, <2 x i32> zeroinitializer
-; IND-NEXT:    [[PREDPHI:%.*]] = select <2 x i1> [[TMP13]], <2 x i32> [[WIDE_LOAD]], <2 x i32> [[TMP11]]
-; IND-NEXT:    [[TMP14]] = add <2 x i32> [[PREDPHI]], [[VEC_PHI]]
+; IND-NEXT:    [[TMP10:%.*]] = phi <2 x i32> [ [[TMP5]], [[PRED_UDIV_CONTINUE]] ], [ [[TMP9]], [[PRED_UDIV_IF1]] ]
+; IND-NEXT:    [[TMP11:%.*]] = xor <2 x i1> [[BROADCAST_SPLATINSERT]], <i1 true, i1 poison>
+; IND-NEXT:    [[TMP12:%.*]] = shufflevector <2 x i1> [[TMP11]], <2 x i1> poison, <2 x i32> zeroinitializer
+; IND-NEXT:    [[PREDPHI:%.*]] = select <2 x i1> [[TMP12]], <2 x i32> [[WIDE_LOAD]], <2 x i32> [[TMP10]]
+; IND-NEXT:    [[TMP13]] = add <2 x i32> [[PREDPHI]], [[VEC_PHI]]
 ; IND-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
-; IND-NEXT:    [[TMP15:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; IND-NEXT:    br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
+; IND-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; IND-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
 ; IND:       middle.block:
-; IND-NEXT:    [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[TMP14]])
+; IND-NEXT:    [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[TMP13]])
 ; IND-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[SMAX]], [[N_VEC]]
 ; IND-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; IND:       scalar.ph:
 ; IND-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; IND-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP16]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; IND-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP15]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
 ; IND-NEXT:    br label [[FOR_BODY:%.*]]
 ; IND:       for.body:
 ; IND-NEXT:    [[I:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[IF_END:%.*]] ]
 ; IND-NEXT:    [[SUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[VAR4:%.*]], [[IF_END]] ]
-; IND-NEXT:    [[TMP17:%.*]] = zext i32 [[I]] to i64
-; IND-NEXT:    [[VAR0:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP17]]
-; IND-NEXT:    [[VAR1:%.*]] = load i32, i32* [[VAR0]], align 4
+; IND-NEXT:    [[TMP16:%.*]] = zext i32 [[I]] to i64
+; IND-NEXT:    [[VAR0:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP16]]
+; IND-NEXT:    [[VAR1:%.*]] = load i32, ptr [[VAR0]], align 4
 ; IND-NEXT:    br i1 [[C]], label [[IF_THEN:%.*]], label [[IF_END]]
 ; IND:       if.then:
 ; IND-NEXT:    [[VAR2:%.*]] = udiv i32 [[VAR1]], [[I]]
@@ -2155,7 +2109,7 @@ define i32 @scalarize_induction_variable_05(i32* %a, i32 %x, i1 %c, i32 %n) {
 ; IND-NEXT:    [[COND:%.*]] = icmp slt i32 [[I_NEXT]], [[N]]
 ; IND-NEXT:    br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP25:![0-9]+]]
 ; IND:       for.end:
-; IND-NEXT:    [[VAR5:%.*]] = phi i32 [ [[VAR4]], [[IF_END]] ], [ [[TMP16]], [[MIDDLE_BLOCK]] ]
+; IND-NEXT:    [[VAR5:%.*]] = phi i32 [ [[VAR4]], [[IF_END]] ], [ [[TMP15]], [[MIDDLE_BLOCK]] ]
 ; IND-NEXT:    ret i32 [[VAR5]]
 ;
 ; UNROLL-LABEL: @scalarize_induction_variable_05(
@@ -2170,76 +2124,74 @@ define i32 @scalarize_induction_variable_05(i32* %a, i32 %x, i1 %c, i32 %n) {
 ; UNROLL-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; UNROLL:       vector.body:
 ; UNROLL-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_UDIV_CONTINUE10:%.*]] ]
-; UNROLL-NEXT:    [[VEC_PHI:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP28:%.*]], [[PRED_UDIV_CONTINUE10]] ]
-; UNROLL-NEXT:    [[VEC_PHI1:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP29:%.*]], [[PRED_UDIV_CONTINUE10]] ]
+; UNROLL-NEXT:    [[VEC_PHI:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP26:%.*]], [[PRED_UDIV_CONTINUE10]] ]
+; UNROLL-NEXT:    [[VEC_PHI1:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP27:%.*]], [[PRED_UDIV_CONTINUE10]] ]
 ; UNROLL-NEXT:    [[TMP0:%.*]] = or i32 [[INDEX]], 2
 ; UNROLL-NEXT:    [[TMP1:%.*]] = sext i32 [[INDEX]] to i64
-; UNROLL-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP1]]
-; UNROLL-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <2 x i32>*
-; UNROLL-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i32>, <2 x i32>* [[TMP3]], align 4
-; UNROLL-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 2
-; UNROLL-NEXT:    [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <2 x i32>*
-; UNROLL-NEXT:    [[WIDE_LOAD2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP5]], align 4
+; UNROLL-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP1]]
+; UNROLL-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP2]], align 4
+; UNROLL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i64 2
+; UNROLL-NEXT:    [[WIDE_LOAD2:%.*]] = load <2 x i32>, ptr [[TMP3]], align 4
 ; UNROLL-NEXT:    br i1 [[C]], label [[PRED_UDIV_IF:%.*]], label [[PRED_UDIV_CONTINUE:%.*]]
 ; UNROLL:       pred.udiv.if:
-; UNROLL-NEXT:    [[TMP6:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i64 0
-; UNROLL-NEXT:    [[TMP7:%.*]] = udiv i32 [[TMP6]], [[INDEX]]
-; UNROLL-NEXT:    [[TMP8:%.*]] = insertelement <2 x i32> poison, i32 [[TMP7]], i64 0
+; UNROLL-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i64 0
+; UNROLL-NEXT:    [[TMP5:%.*]] = udiv i32 [[TMP4]], [[INDEX]]
+; UNROLL-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[TMP5]], i64 0
 ; UNROLL-NEXT:    br label [[PRED_UDIV_CONTINUE]]
 ; UNROLL:       pred.udiv.continue:
-; UNROLL-NEXT:    [[TMP9:%.*]] = phi <2 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP8]], [[PRED_UDIV_IF]] ]
+; UNROLL-NEXT:    [[TMP7:%.*]] = phi <2 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP6]], [[PRED_UDIV_IF]] ]
 ; UNROLL-NEXT:    br i1 [[C]], label [[PRED_UDIV_IF3:%.*]], label [[PRED_UDIV_CONTINUE4:%.*]]
 ; UNROLL:       pred.udiv.if3:
-; UNROLL-NEXT:    [[TMP10:%.*]] = or i32 [[INDEX]], 1
-; UNROLL-NEXT:    [[TMP11:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i64 1
-; UNROLL-NEXT:    [[TMP12:%.*]] = udiv i32 [[TMP11]], [[TMP10]]
-; UNROLL-NEXT:    [[TMP13:%.*]] = insertelement <2 x i32> [[TMP9]], i32 [[TMP12]], i64 1
+; UNROLL-NEXT:    [[TMP8:%.*]] = or i32 [[INDEX]], 1
+; UNROLL-NEXT:    [[TMP9:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i64 1
+; UNROLL-NEXT:    [[TMP10:%.*]] = udiv i32 [[TMP9]], [[TMP8]]
+; UNROLL-NEXT:    [[TMP11:%.*]] = insertelement <2 x i32> [[TMP7]], i32 [[TMP10]], i64 1
 ; UNROLL-NEXT:    br label [[PRED_UDIV_CONTINUE4]]
 ; UNROLL:       pred.udiv.continue4:
-; UNROLL-NEXT:    [[TMP14:%.*]] = phi <2 x i32> [ [[TMP9]], [[PRED_UDIV_CONTINUE]] ], [ [[TMP13]], [[PRED_UDIV_IF3]] ]
+; UNROLL-NEXT:    [[TMP12:%.*]] = phi <2 x i32> [ [[TMP7]], [[PRED_UDIV_CONTINUE]] ], [ [[TMP11]], [[PRED_UDIV_IF3]] ]
 ; UNROLL-NEXT:    br i1 [[C]], label [[PRED_UDIV_IF7:%.*]], label [[PRED_UDIV_CONTINUE8:%.*]]
 ; UNROLL:       pred.udiv.if7:
-; UNROLL-NEXT:    [[TMP15:%.*]] = extractelement <2 x i32> [[WIDE_LOAD2]], i64 0
-; UNROLL-NEXT:    [[TMP16:%.*]] = udiv i32 [[TMP15]], [[TMP0]]
-; UNROLL-NEXT:    [[TMP17:%.*]] = insertelement <2 x i32> poison, i32 [[TMP16]], i64 0
+; UNROLL-NEXT:    [[TMP13:%.*]] = extractelement <2 x i32> [[WIDE_LOAD2]], i64 0
+; UNROLL-NEXT:    [[TMP14:%.*]] = udiv i32 [[TMP13]], [[TMP0]]
+; UNROLL-NEXT:    [[TMP15:%.*]] = insertelement <2 x i32> poison, i32 [[TMP14]], i64 0
 ; UNROLL-NEXT:    br label [[PRED_UDIV_CONTINUE8]]
 ; UNROLL:       pred.udiv.continue8:
-; UNROLL-NEXT:    [[TMP18:%.*]] = phi <2 x i32> [ poison, [[PRED_UDIV_CONTINUE4]] ], [ [[TMP17]], [[PRED_UDIV_IF7]] ]
+; UNROLL-NEXT:    [[TMP16:%.*]] = phi <2 x i32> [ poison, [[PRED_UDIV_CONTINUE4]] ], [ [[TMP15]], [[PRED_UDIV_IF7]] ]
 ; UNROLL-NEXT:    br i1 [[C]], label [[PRED_UDIV_IF9:%.*]], label [[PRED_UDIV_CONTINUE10]]
 ; UNROLL:       pred.udiv.if9:
-; UNROLL-NEXT:    [[TMP19:%.*]] = or i32 [[INDEX]], 3
-; UNROLL-NEXT:    [[TMP20:%.*]] = extractelement <2 x i32> [[WIDE_LOAD2]], i64 1
-; UNROLL-NEXT:    [[TMP21:%.*]] = udiv i32 [[TMP20]], [[TMP19]]
-; UNROLL-NEXT:    [[TMP22:%.*]] = insertelement <2 x i32> [[TMP18]], i32 [[TMP21]], i64 1
+; UNROLL-NEXT:    [[TMP17:%.*]] = or i32 [[INDEX]], 3
+; UNROLL-NEXT:    [[TMP18:%.*]] = extractelement <2 x i32> [[WIDE_LOAD2]], i64 1
+; UNROLL-NEXT:    [[TMP19:%.*]] = udiv i32 [[TMP18]], [[TMP17]]
+; UNROLL-NEXT:    [[TMP20:%.*]] = insertelement <2 x i32> [[TMP16]], i32 [[TMP19]], i64 1
 ; UNROLL-NEXT:    br label [[PRED_UDIV_CONTINUE10]]
 ; UNROLL:       pred.udiv.continue10:
-; UNROLL-NEXT:    [[TMP23:%.*]] = phi <2 x i32> [ [[TMP18]], [[PRED_UDIV_CONTINUE8]] ], [ [[TMP22]], [[PRED_UDIV_IF9]] ]
-; UNROLL-NEXT:    [[TMP24:%.*]] = xor <2 x i1> [[BROADCAST_SPLATINSERT]], <i1 true, i1 poison>
+; UNROLL-NEXT:    [[TMP21:%.*]] = phi <2 x i32> [ [[TMP16]], [[PRED_UDIV_CONTINUE8]] ], [ [[TMP20]], [[PRED_UDIV_IF9]] ]
+; UNROLL-NEXT:    [[TMP22:%.*]] = xor <2 x i1> [[BROADCAST_SPLATINSERT]], <i1 true, i1 poison>
+; UNROLL-NEXT:    [[TMP23:%.*]] = shufflevector <2 x i1> [[TMP22]], <2 x i1> poison, <2 x i32> zeroinitializer
+; UNROLL-NEXT:    [[TMP24:%.*]] = xor <2 x i1> [[BROADCAST_SPLATINSERT5]], <i1 true, i1 poison>
 ; UNROLL-NEXT:    [[TMP25:%.*]] = shufflevector <2 x i1> [[TMP24]], <2 x i1> poison, <2 x i32> zeroinitializer
-; UNROLL-NEXT:    [[TMP26:%.*]] = xor <2 x i1> [[BROADCAST_SPLATINSERT5]], <i1 true, i1 poison>
-; UNROLL-NEXT:    [[TMP27:%.*]] = shufflevector <2 x i1> [[TMP26]], <2 x i1> poison, <2 x i32> zeroinitializer
-; UNROLL-NEXT:    [[PREDPHI:%.*]] = select <2 x i1> [[TMP25]], <2 x i32> [[WIDE_LOAD]], <2 x i32> [[TMP14]]
-; UNROLL-NEXT:    [[PREDPHI11:%.*]] = select <2 x i1> [[TMP27]], <2 x i32> [[WIDE_LOAD2]], <2 x i32> [[TMP23]]
-; UNROLL-NEXT:    [[TMP28]] = add <2 x i32> [[PREDPHI]], [[VEC_PHI]]
-; UNROLL-NEXT:    [[TMP29]] = add <2 x i32> [[PREDPHI11]], [[VEC_PHI1]]
+; UNROLL-NEXT:    [[PREDPHI:%.*]] = select <2 x i1> [[TMP23]], <2 x i32> [[WIDE_LOAD]], <2 x i32> [[TMP12]]
+; UNROLL-NEXT:    [[PREDPHI11:%.*]] = select <2 x i1> [[TMP25]], <2 x i32> [[WIDE_LOAD2]], <2 x i32> [[TMP21]]
+; UNROLL-NEXT:    [[TMP26]] = add <2 x i32> [[PREDPHI]], [[VEC_PHI]]
+; UNROLL-NEXT:    [[TMP27]] = add <2 x i32> [[PREDPHI11]], [[VEC_PHI1]]
 ; UNROLL-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; UNROLL-NEXT:    [[TMP30:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; UNROLL-NEXT:    br i1 [[TMP30]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
+; UNROLL-NEXT:    [[TMP28:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; UNROLL-NEXT:    br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
 ; UNROLL:       middle.block:
-; UNROLL-NEXT:    [[BIN_RDX:%.*]] = add <2 x i32> [[TMP29]], [[TMP28]]
-; UNROLL-NEXT:    [[TMP31:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[BIN_RDX]])
+; UNROLL-NEXT:    [[BIN_RDX:%.*]] = add <2 x i32> [[TMP27]], [[TMP26]]
+; UNROLL-NEXT:    [[TMP29:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[BIN_RDX]])
 ; UNROLL-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[SMAX]], [[N_VEC]]
 ; UNROLL-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; UNROLL:       scalar.ph:
 ; UNROLL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; UNROLL-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP31]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; UNROLL-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP29]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
 ; UNROLL-NEXT:    br label [[FOR_BODY:%.*]]
 ; UNROLL:       for.body:
 ; UNROLL-NEXT:    [[I:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[IF_END:%.*]] ]
 ; UNROLL-NEXT:    [[SUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[VAR4:%.*]], [[IF_END]] ]
-; UNROLL-NEXT:    [[TMP32:%.*]] = zext i32 [[I]] to i64
-; UNROLL-NEXT:    [[VAR0:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP32]]
-; UNROLL-NEXT:    [[VAR1:%.*]] = load i32, i32* [[VAR0]], align 4
+; UNROLL-NEXT:    [[TMP30:%.*]] = zext i32 [[I]] to i64
+; UNROLL-NEXT:    [[VAR0:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP30]]
+; UNROLL-NEXT:    [[VAR1:%.*]] = load i32, ptr [[VAR0]], align 4
 ; UNROLL-NEXT:    br i1 [[C]], label [[IF_THEN:%.*]], label [[IF_END]]
 ; UNROLL:       if.then:
 ; UNROLL-NEXT:    [[VAR2:%.*]] = udiv i32 [[VAR1]], [[I]]
@@ -2251,7 +2203,7 @@ define i32 @scalarize_induction_variable_05(i32* %a, i32 %x, i1 %c, i32 %n) {
 ; UNROLL-NEXT:    [[COND:%.*]] = icmp slt i32 [[I_NEXT]], [[N]]
 ; UNROLL-NEXT:    br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP25:![0-9]+]]
 ; UNROLL:       for.end:
-; UNROLL-NEXT:    [[VAR5:%.*]] = phi i32 [ [[VAR4]], [[IF_END]] ], [ [[TMP31]], [[MIDDLE_BLOCK]] ]
+; UNROLL-NEXT:    [[VAR5:%.*]] = phi i32 [ [[VAR4]], [[IF_END]] ], [ [[TMP29]], [[MIDDLE_BLOCK]] ]
 ; UNROLL-NEXT:    ret i32 [[VAR5]]
 ;
 ; UNROLL-NO-IC-LABEL: @scalarize_induction_variable_05(
@@ -2269,79 +2221,77 @@ define i32 @scalarize_induction_variable_05(i32* %a, i32 %x, i1 %c, i32 %n) {
 ; UNROLL-NO-IC-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; UNROLL-NO-IC:       vector.body:
 ; UNROLL-NO-IC-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_UDIV_CONTINUE10:%.*]] ]
-; UNROLL-NO-IC-NEXT:    [[VEC_PHI:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP32:%.*]], [[PRED_UDIV_CONTINUE10]] ]
-; UNROLL-NO-IC-NEXT:    [[VEC_PHI1:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP33:%.*]], [[PRED_UDIV_CONTINUE10]] ]
+; UNROLL-NO-IC-NEXT:    [[VEC_PHI:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP30:%.*]], [[PRED_UDIV_CONTINUE10]] ]
+; UNROLL-NO-IC-NEXT:    [[VEC_PHI1:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP31:%.*]], [[PRED_UDIV_CONTINUE10]] ]
 ; UNROLL-NO-IC-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
 ; UNROLL-NO-IC-NEXT:    [[TMP1:%.*]] = add i32 [[INDEX]], 2
-; UNROLL-NO-IC-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[TMP0]]
-; UNROLL-NO-IC-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[TMP1]]
-; UNROLL-NO-IC-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i32 0
-; UNROLL-NO-IC-NEXT:    [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <2 x i32>*
-; UNROLL-NO-IC-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i32>, <2 x i32>* [[TMP5]], align 4
-; UNROLL-NO-IC-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i32 2
-; UNROLL-NO-IC-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <2 x i32>*
-; UNROLL-NO-IC-NEXT:    [[WIDE_LOAD2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP7]], align 4
-; UNROLL-NO-IC-NEXT:    [[TMP8:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT]], i32 0
-; UNROLL-NO-IC-NEXT:    br i1 [[TMP8]], label [[PRED_UDIV_IF:%.*]], label [[PRED_UDIV_CONTINUE:%.*]]
+; UNROLL-NO-IC-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[TMP0]]
+; UNROLL-NO-IC-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[TMP1]]
+; UNROLL-NO-IC-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 0
+; UNROLL-NO-IC-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP4]], align 4
+; UNROLL-NO-IC-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 2
+; UNROLL-NO-IC-NEXT:    [[WIDE_LOAD2:%.*]] = load <2 x i32>, ptr [[TMP5]], align 4
+; UNROLL-NO-IC-NEXT:    [[TMP6:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT]], i32 0
+; UNROLL-NO-IC-NEXT:    br i1 [[TMP6]], label [[PRED_UDIV_IF:%.*]], label [[PRED_UDIV_CONTINUE:%.*]]
 ; UNROLL-NO-IC:       pred.udiv.if:
-; UNROLL-NO-IC-NEXT:    [[TMP9:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 0
-; UNROLL-NO-IC-NEXT:    [[TMP10:%.*]] = udiv i32 [[TMP9]], [[TMP0]]
-; UNROLL-NO-IC-NEXT:    [[TMP11:%.*]] = insertelement <2 x i32> poison, i32 [[TMP10]], i32 0
+; UNROLL-NO-IC-NEXT:    [[TMP7:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 0
+; UNROLL-NO-IC-NEXT:    [[TMP8:%.*]] = udiv i32 [[TMP7]], [[TMP0]]
+; UNROLL-NO-IC-NEXT:    [[TMP9:%.*]] = insertelement <2 x i32> poison, i32 [[TMP8]], i32 0
 ; UNROLL-NO-IC-NEXT:    br label [[PRED_UDIV_CONTINUE]]
 ; UNROLL-NO-IC:       pred.udiv.continue:
-; UNROLL-NO-IC-NEXT:    [[TMP12:%.*]] = phi <2 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP11]], [[PRED_UDIV_IF]] ]
-; UNROLL-NO-IC-NEXT:    [[TMP13:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT]], i32 1
-; UNROLL-NO-IC-NEXT:    br i1 [[TMP13]], label [[PRED_UDIV_IF3:%.*]], label [[PRED_UDIV_CONTINUE4:%.*]]
+; UNROLL-NO-IC-NEXT:    [[TMP10:%.*]] = phi <2 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP9]], [[PRED_UDIV_IF]] ]
+; UNROLL-NO-IC-NEXT:    [[TMP11:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT]], i32 1
+; UNROLL-NO-IC-NEXT:    br i1 [[TMP11]], label [[PRED_UDIV_IF3:%.*]], label [[PRED_UDIV_CONTINUE4:%.*]]
 ; UNROLL-NO-IC:       pred.udiv.if3:
-; UNROLL-NO-IC-NEXT:    [[TMP14:%.*]] = add i32 [[INDEX]], 1
-; UNROLL-NO-IC-NEXT:    [[TMP15:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 1
-; UNROLL-NO-IC-NEXT:    [[TMP16:%.*]] = udiv i32 [[TMP15]], [[TMP14]]
-; UNROLL-NO-IC-NEXT:    [[TMP17:%.*]] = insertelement <2 x i32> [[TMP12]], i32 [[TMP16]], i32 1
+; UNROLL-NO-IC-NEXT:    [[TMP12:%.*]] = add i32 [[INDEX]], 1
+; UNROLL-NO-IC-NEXT:    [[TMP13:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 1
+; UNROLL-NO-IC-NEXT:    [[TMP14:%.*]] = udiv i32 [[TMP13]], [[TMP12]]
+; UNROLL-NO-IC-NEXT:    [[TMP15:%.*]] = insertelement <2 x i32> [[TMP10]], i32 [[TMP14]], i32 1
 ; UNROLL-NO-IC-NEXT:    br label [[PRED_UDIV_CONTINUE4]]
 ; UNROLL-NO-IC:       pred.udiv.continue4:
-; UNROLL-NO-IC-NEXT:    [[TMP18:%.*]] = phi <2 x i32> [ [[TMP12]], [[PRED_UDIV_CONTINUE]] ], [ [[TMP17]], [[PRED_UDIV_IF3]] ]
-; UNROLL-NO-IC-NEXT:    [[TMP19:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT6]], i32 0
-; UNROLL-NO-IC-NEXT:    br i1 [[TMP19]], label [[PRED_UDIV_IF7:%.*]], label [[PRED_UDIV_CONTINUE8:%.*]]
+; UNROLL-NO-IC-NEXT:    [[TMP16:%.*]] = phi <2 x i32> [ [[TMP10]], [[PRED_UDIV_CONTINUE]] ], [ [[TMP15]], [[PRED_UDIV_IF3]] ]
+; UNROLL-NO-IC-NEXT:    [[TMP17:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT6]], i32 0
+; UNROLL-NO-IC-NEXT:    br i1 [[TMP17]], label [[PRED_UDIV_IF7:%.*]], label [[PRED_UDIV_CONTINUE8:%.*]]
 ; UNROLL-NO-IC:       pred.udiv.if7:
-; UNROLL-NO-IC-NEXT:    [[TMP20:%.*]] = extractelement <2 x i32> [[WIDE_LOAD2]], i32 0
-; UNROLL-NO-IC-NEXT:    [[TMP21:%.*]] = udiv i32 [[TMP20]], [[TMP1]]
-; UNROLL-NO-IC-NEXT:    [[TMP22:%.*]] = insertelement <2 x i32> poison, i32 [[TMP21]], i32 0
+; UNROLL-NO-IC-NEXT:    [[TMP18:%.*]] = extractelement <2 x i32> [[WIDE_LOAD2]], i32 0
+; UNROLL-NO-IC-NEXT:    [[TMP19:%.*]] = udiv i32 [[TMP18]], [[TMP1]]
+; UNROLL-NO-IC-NEXT:    [[TMP20:%.*]] = insertelement <2 x i32> poison, i32 [[TMP19]], i32 0
 ; UNROLL-NO-IC-NEXT:    br label [[PRED_UDIV_CONTINUE8]]
 ; UNROLL-NO-IC:       pred.udiv.continue8:
-; UNROLL-NO-IC-NEXT:    [[TMP23:%.*]] = phi <2 x i32> [ poison, [[PRED_UDIV_CONTINUE4]] ], [ [[TMP22]], [[PRED_UDIV_IF7]] ]
-; UNROLL-NO-IC-NEXT:    [[TMP24:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT6]], i32 1
-; UNROLL-NO-IC-NEXT:    br i1 [[TMP24]], label [[PRED_UDIV_IF9:%.*]], label [[PRED_UDIV_CONTINUE10]]
+; UNROLL-NO-IC-NEXT:    [[TMP21:%.*]] = phi <2 x i32> [ poison, [[PRED_UDIV_CONTINUE4]] ], [ [[TMP20]], [[PRED_UDIV_IF7]] ]
+; UNROLL-NO-IC-NEXT:    [[TMP22:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT6]], i32 1
+; UNROLL-NO-IC-NEXT:    br i1 [[TMP22]], label [[PRED_UDIV_IF9:%.*]], label [[PRED_UDIV_CONTINUE10]]
 ; UNROLL-NO-IC:       pred.udiv.if9:
-; UNROLL-NO-IC-NEXT:    [[TMP25:%.*]] = add i32 [[INDEX]], 3
-; UNROLL-NO-IC-NEXT:    [[TMP26:%.*]] = extractelement <2 x i32> [[WIDE_LOAD2]], i32 1
-; UNROLL-NO-IC-NEXT:    [[TMP27:%.*]] = udiv i32 [[TMP26]], [[TMP25]]
-; UNROLL-NO-IC-NEXT:    [[TMP28:%.*]] = insertelement <2 x i32> [[TMP23]], i32 [[TMP27]], i32 1
+; UNROLL-NO-IC-NEXT:    [[TMP23:%.*]] = add i32 [[INDEX]], 3
+; UNROLL-NO-IC-NEXT:    [[TMP24:%.*]] = extractelement <2 x i32> [[WIDE_LOAD2]], i32 1
+; UNROLL-NO-IC-NEXT:    [[TMP25:%.*]] = udiv i32 [[TMP24]], [[TMP23]]
+; UNROLL-NO-IC-NEXT:    [[TMP26:%.*]] = insertelement <2 x i32> [[TMP21]], i32 [[TMP25]], i32 1
 ; UNROLL-NO-IC-NEXT:    br label [[PRED_UDIV_CONTINUE10]]
 ; UNROLL-NO-IC:       pred.udiv.continue10:
-; UNROLL-NO-IC-NEXT:    [[TMP29:%.*]] = phi <2 x i32> [ [[TMP23]], [[PRED_UDIV_CONTINUE8]] ], [ [[TMP28]], [[PRED_UDIV_IF9]] ]
-; UNROLL-NO-IC-NEXT:    [[TMP30:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT]], <i1 true, i1 true>
-; UNROLL-NO-IC-NEXT:    [[TMP31:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT6]], <i1 true, i1 true>
-; UNROLL-NO-IC-NEXT:    [[PREDPHI:%.*]] = select <2 x i1> [[TMP30]], <2 x i32> [[WIDE_LOAD]], <2 x i32> [[TMP18]]
-; UNROLL-NO-IC-NEXT:    [[PREDPHI11:%.*]] = select <2 x i1> [[TMP31]], <2 x i32> [[WIDE_LOAD2]], <2 x i32> [[TMP29]]
-; UNROLL-NO-IC-NEXT:    [[TMP32]] = add <2 x i32> [[PREDPHI]], [[VEC_PHI]]
-; UNROLL-NO-IC-NEXT:    [[TMP33]] = add <2 x i32> [[PREDPHI11]], [[VEC_PHI1]]
+; UNROLL-NO-IC-NEXT:    [[TMP27:%.*]] = phi <2 x i32> [ [[TMP21]], [[PRED_UDIV_CONTINUE8]] ], [ [[TMP26]], [[PRED_UDIV_IF9]] ]
+; UNROLL-NO-IC-NEXT:    [[TMP28:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT]], <i1 true, i1 true>
+; UNROLL-NO-IC-NEXT:    [[TMP29:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT6]], <i1 true, i1 true>
+; UNROLL-NO-IC-NEXT:    [[PREDPHI:%.*]] = select <2 x i1> [[TMP28]], <2 x i32> [[WIDE_LOAD]], <2 x i32> [[TMP16]]
+; UNROLL-NO-IC-NEXT:    [[PREDPHI11:%.*]] = select <2 x i1> [[TMP29]], <2 x i32> [[WIDE_LOAD2]], <2 x i32> [[TMP27]]
+; UNROLL-NO-IC-NEXT:    [[TMP30]] = add <2 x i32> [[PREDPHI]], [[VEC_PHI]]
+; UNROLL-NO-IC-NEXT:    [[TMP31]] = add <2 x i32> [[PREDPHI11]], [[VEC_PHI1]]
 ; UNROLL-NO-IC-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; UNROLL-NO-IC-NEXT:    [[TMP34:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; UNROLL-NO-IC-NEXT:    br i1 [[TMP34]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
+; UNROLL-NO-IC-NEXT:    [[TMP32:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; UNROLL-NO-IC-NEXT:    br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
 ; UNROLL-NO-IC:       middle.block:
-; UNROLL-NO-IC-NEXT:    [[BIN_RDX:%.*]] = add <2 x i32> [[TMP33]], [[TMP32]]
-; UNROLL-NO-IC-NEXT:    [[TMP35:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[BIN_RDX]])
+; UNROLL-NO-IC-NEXT:    [[BIN_RDX:%.*]] = add <2 x i32> [[TMP31]], [[TMP30]]
+; UNROLL-NO-IC-NEXT:    [[TMP33:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[BIN_RDX]])
 ; UNROLL-NO-IC-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[SMAX]], [[N_VEC]]
 ; UNROLL-NO-IC-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; UNROLL-NO-IC:       scalar.ph:
 ; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; UNROLL-NO-IC-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP35]], [[MIDDLE_BLOCK]] ]
+; UNROLL-NO-IC-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP33]], [[MIDDLE_BLOCK]] ]
 ; UNROLL-NO-IC-NEXT:    br label [[FOR_BODY:%.*]]
 ; UNROLL-NO-IC:       for.body:
 ; UNROLL-NO-IC-NEXT:    [[I:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[IF_END:%.*]] ]
 ; UNROLL-NO-IC-NEXT:    [[SUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[VAR4:%.*]], [[IF_END]] ]
-; UNROLL-NO-IC-NEXT:    [[VAR0:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[I]]
-; UNROLL-NO-IC-NEXT:    [[VAR1:%.*]] = load i32, i32* [[VAR0]], align 4
+; UNROLL-NO-IC-NEXT:    [[VAR0:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[I]]
+; UNROLL-NO-IC-NEXT:    [[VAR1:%.*]] = load i32, ptr [[VAR0]], align 4
 ; UNROLL-NO-IC-NEXT:    br i1 [[C]], label [[IF_THEN:%.*]], label [[IF_END]]
 ; UNROLL-NO-IC:       if.then:
 ; UNROLL-NO-IC-NEXT:    [[VAR2:%.*]] = udiv i32 [[VAR1]], [[I]]
@@ -2353,7 +2303,7 @@ define i32 @scalarize_induction_variable_05(i32* %a, i32 %x, i1 %c, i32 %n) {
 ; UNROLL-NO-IC-NEXT:    [[COND:%.*]] = icmp slt i32 [[I_NEXT]], [[N]]
 ; UNROLL-NO-IC-NEXT:    br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP25:![0-9]+]]
 ; UNROLL-NO-IC:       for.end:
-; UNROLL-NO-IC-NEXT:    [[VAR5:%.*]] = phi i32 [ [[VAR4]], [[IF_END]] ], [ [[TMP35]], [[MIDDLE_BLOCK]] ]
+; UNROLL-NO-IC-NEXT:    [[VAR5:%.*]] = phi i32 [ [[VAR4]], [[IF_END]] ], [ [[TMP33]], [[MIDDLE_BLOCK]] ]
 ; UNROLL-NO-IC-NEXT:    ret i32 [[VAR5]]
 ;
 ; INTERLEAVE-LABEL: @scalarize_induction_variable_05(
@@ -2368,112 +2318,110 @@ define i32 @scalarize_induction_variable_05(i32* %a, i32 %x, i1 %c, i32 %n) {
 ; INTERLEAVE-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; INTERLEAVE:       vector.body:
 ; INTERLEAVE-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_UDIV_CONTINUE18:%.*]] ]
-; INTERLEAVE-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP48:%.*]], [[PRED_UDIV_CONTINUE18]] ]
-; INTERLEAVE-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP49:%.*]], [[PRED_UDIV_CONTINUE18]] ]
+; INTERLEAVE-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP46:%.*]], [[PRED_UDIV_CONTINUE18]] ]
+; INTERLEAVE-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP47:%.*]], [[PRED_UDIV_CONTINUE18]] ]
 ; INTERLEAVE-NEXT:    [[TMP0:%.*]] = or i32 [[INDEX]], 4
 ; INTERLEAVE-NEXT:    [[TMP1:%.*]] = sext i32 [[INDEX]] to i64
-; INTERLEAVE-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP1]]
-; INTERLEAVE-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
-; INTERLEAVE-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
-; INTERLEAVE-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 4
-; INTERLEAVE-NEXT:    [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <4 x i32>*
-; INTERLEAVE-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP5]], align 4
+; INTERLEAVE-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP1]]
+; INTERLEAVE-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4
+; INTERLEAVE-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i64 4
+; INTERLEAVE-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x i32>, ptr [[TMP3]], align 4
 ; INTERLEAVE-NEXT:    br i1 [[C]], label [[PRED_UDIV_IF:%.*]], label [[PRED_UDIV_CONTINUE:%.*]]
 ; INTERLEAVE:       pred.udiv.if:
-; INTERLEAVE-NEXT:    [[TMP6:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i64 0
-; INTERLEAVE-NEXT:    [[TMP7:%.*]] = udiv i32 [[TMP6]], [[INDEX]]
-; INTERLEAVE-NEXT:    [[TMP8:%.*]] = insertelement <4 x i32> poison, i32 [[TMP7]], i64 0
+; INTERLEAVE-NEXT:    [[TMP4:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i64 0
+; INTERLEAVE-NEXT:    [[TMP5:%.*]] = udiv i32 [[TMP4]], [[INDEX]]
+; INTERLEAVE-NEXT:    [[TMP6:%.*]] = insertelement <4 x i32> poison, i32 [[TMP5]], i64 0
 ; INTERLEAVE-NEXT:    br label [[PRED_UDIV_CONTINUE]]
 ; INTERLEAVE:       pred.udiv.continue:
-; INTERLEAVE-NEXT:    [[TMP9:%.*]] = phi <4 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP8]], [[PRED_UDIV_IF]] ]
+; INTERLEAVE-NEXT:    [[TMP7:%.*]] = phi <4 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP6]], [[PRED_UDIV_IF]] ]
 ; INTERLEAVE-NEXT:    br i1 [[C]], label [[PRED_UDIV_IF3:%.*]], label [[PRED_UDIV_CONTINUE4:%.*]]
 ; INTERLEAVE:       pred.udiv.if3:
-; INTERLEAVE-NEXT:    [[TMP10:%.*]] = or i32 [[INDEX]], 1
-; INTERLEAVE-NEXT:    [[TMP11:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i64 1
-; INTERLEAVE-NEXT:    [[TMP12:%.*]] = udiv i32 [[TMP11]], [[TMP10]]
-; INTERLEAVE-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP9]], i32 [[TMP12]], i64 1
+; INTERLEAVE-NEXT:    [[TMP8:%.*]] = or i32 [[INDEX]], 1
+; INTERLEAVE-NEXT:    [[TMP9:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i64 1
+; INTERLEAVE-NEXT:    [[TMP10:%.*]] = udiv i32 [[TMP9]], [[TMP8]]
+; INTERLEAVE-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[TMP10]], i64 1
 ; INTERLEAVE-NEXT:    br label [[PRED_UDIV_CONTINUE4]]
 ; INTERLEAVE:       pred.udiv.continue4:
-; INTERLEAVE-NEXT:    [[TMP14:%.*]] = phi <4 x i32> [ [[TMP9]], [[PRED_UDIV_CONTINUE]] ], [ [[TMP13]], [[PRED_UDIV_IF3]] ]
+; INTERLEAVE-NEXT:    [[TMP12:%.*]] = phi <4 x i32> [ [[TMP7]], [[PRED_UDIV_CONTINUE]] ], [ [[TMP11]], [[PRED_UDIV_IF3]] ]
 ; INTERLEAVE-NEXT:    br i1 [[C]], label [[PRED_UDIV_IF5:%.*]], label [[PRED_UDIV_CONTINUE6:%.*]]
 ; INTERLEAVE:       pred.udiv.if5:
-; INTERLEAVE-NEXT:    [[TMP15:%.*]] = or i32 [[INDEX]], 2
-; INTERLEAVE-NEXT:    [[TMP16:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i64 2
-; INTERLEAVE-NEXT:    [[TMP17:%.*]] = udiv i32 [[TMP16]], [[TMP15]]
-; INTERLEAVE-NEXT:    [[TMP18:%.*]] = insertelement <4 x i32> [[TMP14]], i32 [[TMP17]], i64 2
+; INTERLEAVE-NEXT:    [[TMP13:%.*]] = or i32 [[INDEX]], 2
+; INTERLEAVE-NEXT:    [[TMP14:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i64 2
+; INTERLEAVE-NEXT:    [[TMP15:%.*]] = udiv i32 [[TMP14]], [[TMP13]]
+; INTERLEAVE-NEXT:    [[TMP16:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP15]], i64 2
 ; INTERLEAVE-NEXT:    br label [[PRED_UDIV_CONTINUE6]]
 ; INTERLEAVE:       pred.udiv.continue6:
-; INTERLEAVE-NEXT:    [[TMP19:%.*]] = phi <4 x i32> [ [[TMP14]], [[PRED_UDIV_CONTINUE4]] ], [ [[TMP18]], [[PRED_UDIV_IF5]] ]
+; INTERLEAVE-NEXT:    [[TMP17:%.*]] = phi <4 x i32> [ [[TMP12]], [[PRED_UDIV_CONTINUE4]] ], [ [[TMP16]], [[PRED_UDIV_IF5]] ]
 ; INTERLEAVE-NEXT:    br i1 [[C]], label [[PRED_UDIV_IF7:%.*]], label [[PRED_UDIV_CONTINUE8:%.*]]
 ; INTERLEAVE:       pred.udiv.if7:
-; INTERLEAVE-NEXT:    [[TMP20:%.*]] = or i32 [[INDEX]], 3
-; INTERLEAVE-NEXT:    [[TMP21:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i64 3
-; INTERLEAVE-NEXT:    [[TMP22:%.*]] = udiv i32 [[TMP21]], [[TMP20]]
-; INTERLEAVE-NEXT:    [[TMP23:%.*]] = insertelement <4 x i32> [[TMP19]], i32 [[TMP22]], i64 3
+; INTERLEAVE-NEXT:    [[TMP18:%.*]] = or i32 [[INDEX]], 3
+; INTERLEAVE-NEXT:    [[TMP19:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i64 3
+; INTERLEAVE-NEXT:    [[TMP20:%.*]] = udiv i32 [[TMP19]], [[TMP18]]
+; INTERLEAVE-NEXT:    [[TMP21:%.*]] = insertelement <4 x i32> [[TMP17]], i32 [[TMP20]], i64 3
 ; INTERLEAVE-NEXT:    br label [[PRED_UDIV_CONTINUE8]]
 ; INTERLEAVE:       pred.udiv.continue8:
-; INTERLEAVE-NEXT:    [[TMP24:%.*]] = phi <4 x i32> [ [[TMP19]], [[PRED_UDIV_CONTINUE6]] ], [ [[TMP23]], [[PRED_UDIV_IF7]] ]
+; INTERLEAVE-NEXT:    [[TMP22:%.*]] = phi <4 x i32> [ [[TMP17]], [[PRED_UDIV_CONTINUE6]] ], [ [[TMP21]], [[PRED_UDIV_IF7]] ]
 ; INTERLEAVE-NEXT:    br i1 [[C]], label [[PRED_UDIV_IF11:%.*]], label [[PRED_UDIV_CONTINUE12:%.*]]
 ; INTERLEAVE:       pred.udiv.if11:
-; INTERLEAVE-NEXT:    [[TMP25:%.*]] = extractelement <4 x i32> [[WIDE_LOAD2]], i64 0
-; INTERLEAVE-NEXT:    [[TMP26:%.*]] = udiv i32 [[TMP25]], [[TMP0]]
-; INTERLEAVE-NEXT:    [[TMP27:%.*]] = insertelement <4 x i32> poison, i32 [[TMP26]], i64 0
+; INTERLEAVE-NEXT:    [[TMP23:%.*]] = extractelement <4 x i32> [[WIDE_LOAD2]], i64 0
+; INTERLEAVE-NEXT:    [[TMP24:%.*]] = udiv i32 [[TMP23]], [[TMP0]]
+; INTERLEAVE-NEXT:    [[TMP25:%.*]] = insertelement <4 x i32> poison, i32 [[TMP24]], i64 0
 ; INTERLEAVE-NEXT:    br label [[PRED_UDIV_CONTINUE12]]
 ; INTERLEAVE:       pred.udiv.continue12:
-; INTERLEAVE-NEXT:    [[TMP28:%.*]] = phi <4 x i32> [ poison, [[PRED_UDIV_CONTINUE8]] ], [ [[TMP27]], [[PRED_UDIV_IF11]] ]
+; INTERLEAVE-NEXT:    [[TMP26:%.*]] = phi <4 x i32> [ poison, [[PRED_UDIV_CONTINUE8]] ], [ [[TMP25]], [[PRED_UDIV_IF11]] ]
 ; INTERLEAVE-NEXT:    br i1 [[C]], label [[PRED_UDIV_IF13:%.*]], label [[PRED_UDIV_CONTINUE14:%.*]]
 ; INTERLEAVE:       pred.udiv.if13:
-; INTERLEAVE-NEXT:    [[TMP29:%.*]] = or i32 [[INDEX]], 5
-; INTERLEAVE-NEXT:    [[TMP30:%.*]] = extractelement <4 x i32> [[WIDE_LOAD2]], i64 1
-; INTERLEAVE-NEXT:    [[TMP31:%.*]] = udiv i32 [[TMP30]], [[TMP29]]
-; INTERLEAVE-NEXT:    [[TMP32:%.*]] = insertelement <4 x i32> [[TMP28]], i32 [[TMP31]], i64 1
+; INTERLEAVE-NEXT:    [[TMP27:%.*]] = or i32 [[INDEX]], 5
+; INTERLEAVE-NEXT:    [[TMP28:%.*]] = extractelement <4 x i32> [[WIDE_LOAD2]], i64 1
+; INTERLEAVE-NEXT:    [[TMP29:%.*]] = udiv i32 [[TMP28]], [[TMP27]]
+; INTERLEAVE-NEXT:    [[TMP30:%.*]] = insertelement <4 x i32> [[TMP26]], i32 [[TMP29]], i64 1
 ; INTERLEAVE-NEXT:    br label [[PRED_UDIV_CONTINUE14]]
 ; INTERLEAVE:       pred.udiv.continue14:
-; INTERLEAVE-NEXT:    [[TMP33:%.*]] = phi <4 x i32> [ [[TMP28]], [[PRED_UDIV_CONTINUE12]] ], [ [[TMP32]], [[PRED_UDIV_IF13]] ]
+; INTERLEAVE-NEXT:    [[TMP31:%.*]] = phi <4 x i32> [ [[TMP26]], [[PRED_UDIV_CONTINUE12]] ], [ [[TMP30]], [[PRED_UDIV_IF13]] ]
 ; INTERLEAVE-NEXT:    br i1 [[C]], label [[PRED_UDIV_IF15:%.*]], label [[PRED_UDIV_CONTINUE16:%.*]]
 ; INTERLEAVE:       pred.udiv.if15:
-; INTERLEAVE-NEXT:    [[TMP34:%.*]] = or i32 [[INDEX]], 6
-; INTERLEAVE-NEXT:    [[TMP35:%.*]] = extractelement <4 x i32> [[WIDE_LOAD2]], i64 2
-; INTERLEAVE-NEXT:    [[TMP36:%.*]] = udiv i32 [[TMP35]], [[TMP34]]
-; INTERLEAVE-NEXT:    [[TMP37:%.*]] = insertelement <4 x i32> [[TMP33]], i32 [[TMP36]], i64 2
+; INTERLEAVE-NEXT:    [[TMP32:%.*]] = or i32 [[INDEX]], 6
+; INTERLEAVE-NEXT:    [[TMP33:%.*]] = extractelement <4 x i32> [[WIDE_LOAD2]], i64 2
+; INTERLEAVE-NEXT:    [[TMP34:%.*]] = udiv i32 [[TMP33]], [[TMP32]]
+; INTERLEAVE-NEXT:    [[TMP35:%.*]] = insertelement <4 x i32> [[TMP31]], i32 [[TMP34]], i64 2
 ; INTERLEAVE-NEXT:    br label [[PRED_UDIV_CONTINUE16]]
 ; INTERLEAVE:       pred.udiv.continue16:
-; INTERLEAVE-NEXT:    [[TMP38:%.*]] = phi <4 x i32> [ [[TMP33]], [[PRED_UDIV_CONTINUE14]] ], [ [[TMP37]], [[PRED_UDIV_IF15]] ]
+; INTERLEAVE-NEXT:    [[TMP36:%.*]] = phi <4 x i32> [ [[TMP31]], [[PRED_UDIV_CONTINUE14]] ], [ [[TMP35]], [[PRED_UDIV_IF15]] ]
 ; INTERLEAVE-NEXT:    br i1 [[C]], label [[PRED_UDIV_IF17:%.*]], label [[PRED_UDIV_CONTINUE18]]
 ; INTERLEAVE:       pred.udiv.if17:
-; INTERLEAVE-NEXT:    [[TMP39:%.*]] = or i32 [[INDEX]], 7
-; INTERLEAVE-NEXT:    [[TMP40:%.*]] = extractelement <4 x i32> [[WIDE_LOAD2]], i64 3
-; INTERLEAVE-NEXT:    [[TMP41:%.*]] = udiv i32 [[TMP40]], [[TMP39]]
-; INTERLEAVE-NEXT:    [[TMP42:%.*]] = insertelement <4 x i32> [[TMP38]], i32 [[TMP41]], i64 3
+; INTERLEAVE-NEXT:    [[TMP37:%.*]] = or i32 [[INDEX]], 7
+; INTERLEAVE-NEXT:    [[TMP38:%.*]] = extractelement <4 x i32> [[WIDE_LOAD2]], i64 3
+; INTERLEAVE-NEXT:    [[TMP39:%.*]] = udiv i32 [[TMP38]], [[TMP37]]
+; INTERLEAVE-NEXT:    [[TMP40:%.*]] = insertelement <4 x i32> [[TMP36]], i32 [[TMP39]], i64 3
 ; INTERLEAVE-NEXT:    br label [[PRED_UDIV_CONTINUE18]]
 ; INTERLEAVE:       pred.udiv.continue18:
-; INTERLEAVE-NEXT:    [[TMP43:%.*]] = phi <4 x i32> [ [[TMP38]], [[PRED_UDIV_CONTINUE16]] ], [ [[TMP42]], [[PRED_UDIV_IF17]] ]
-; INTERLEAVE-NEXT:    [[TMP44:%.*]] = xor <4 x i1> [[BROADCAST_SPLATINSERT]], <i1 true, i1 poison, i1 poison, i1 poison>
+; INTERLEAVE-NEXT:    [[TMP41:%.*]] = phi <4 x i32> [ [[TMP36]], [[PRED_UDIV_CONTINUE16]] ], [ [[TMP40]], [[PRED_UDIV_IF17]] ]
+; INTERLEAVE-NEXT:    [[TMP42:%.*]] = xor <4 x i1> [[BROADCAST_SPLATINSERT]], <i1 true, i1 poison, i1 poison, i1 poison>
+; INTERLEAVE-NEXT:    [[TMP43:%.*]] = shufflevector <4 x i1> [[TMP42]], <4 x i1> poison, <4 x i32> zeroinitializer
+; INTERLEAVE-NEXT:    [[TMP44:%.*]] = xor <4 x i1> [[BROADCAST_SPLATINSERT9]], <i1 true, i1 poison, i1 poison, i1 poison>
 ; INTERLEAVE-NEXT:    [[TMP45:%.*]] = shufflevector <4 x i1> [[TMP44]], <4 x i1> poison, <4 x i32> zeroinitializer
-; INTERLEAVE-NEXT:    [[TMP46:%.*]] = xor <4 x i1> [[BROADCAST_SPLATINSERT9]], <i1 true, i1 poison, i1 poison, i1 poison>
-; INTERLEAVE-NEXT:    [[TMP47:%.*]] = shufflevector <4 x i1> [[TMP46]], <4 x i1> poison, <4 x i32> zeroinitializer
-; INTERLEAVE-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[TMP45]], <4 x i32> [[WIDE_LOAD]], <4 x i32> [[TMP24]]
-; INTERLEAVE-NEXT:    [[PREDPHI19:%.*]] = select <4 x i1> [[TMP47]], <4 x i32> [[WIDE_LOAD2]], <4 x i32> [[TMP43]]
-; INTERLEAVE-NEXT:    [[TMP48]] = add <4 x i32> [[PREDPHI]], [[VEC_PHI]]
-; INTERLEAVE-NEXT:    [[TMP49]] = add <4 x i32> [[PREDPHI19]], [[VEC_PHI1]]
+; INTERLEAVE-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[TMP43]], <4 x i32> [[WIDE_LOAD]], <4 x i32> [[TMP22]]
+; INTERLEAVE-NEXT:    [[PREDPHI19:%.*]] = select <4 x i1> [[TMP45]], <4 x i32> [[WIDE_LOAD2]], <4 x i32> [[TMP41]]
+; INTERLEAVE-NEXT:    [[TMP46]] = add <4 x i32> [[PREDPHI]], [[VEC_PHI]]
+; INTERLEAVE-NEXT:    [[TMP47]] = add <4 x i32> [[PREDPHI19]], [[VEC_PHI1]]
 ; INTERLEAVE-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
-; INTERLEAVE-NEXT:    [[TMP50:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; INTERLEAVE-NEXT:    br i1 [[TMP50]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
+; INTERLEAVE-NEXT:    [[TMP48:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; INTERLEAVE-NEXT:    br i1 [[TMP48]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
 ; INTERLEAVE:       middle.block:
-; INTERLEAVE-NEXT:    [[BIN_RDX:%.*]] = add <4 x i32> [[TMP49]], [[TMP48]]
-; INTERLEAVE-NEXT:    [[TMP51:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]])
+; INTERLEAVE-NEXT:    [[BIN_RDX:%.*]] = add <4 x i32> [[TMP47]], [[TMP46]]
+; INTERLEAVE-NEXT:    [[TMP49:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]])
 ; INTERLEAVE-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[SMAX]], [[N_VEC]]
 ; INTERLEAVE-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; INTERLEAVE:       scalar.ph:
 ; INTERLEAVE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; INTERLEAVE-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP51]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; INTERLEAVE-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP49]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
 ; INTERLEAVE-NEXT:    br label [[FOR_BODY:%.*]]
 ; INTERLEAVE:       for.body:
 ; INTERLEAVE-NEXT:    [[I:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[IF_END:%.*]] ]
 ; INTERLEAVE-NEXT:    [[SUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[VAR4:%.*]], [[IF_END]] ]
-; INTERLEAVE-NEXT:    [[TMP52:%.*]] = zext i32 [[I]] to i64
-; INTERLEAVE-NEXT:    [[VAR0:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP52]]
-; INTERLEAVE-NEXT:    [[VAR1:%.*]] = load i32, i32* [[VAR0]], align 4
+; INTERLEAVE-NEXT:    [[TMP50:%.*]] = zext i32 [[I]] to i64
+; INTERLEAVE-NEXT:    [[VAR0:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP50]]
+; INTERLEAVE-NEXT:    [[VAR1:%.*]] = load i32, ptr [[VAR0]], align 4
 ; INTERLEAVE-NEXT:    br i1 [[C]], label [[IF_THEN:%.*]], label [[IF_END]]
 ; INTERLEAVE:       if.then:
 ; INTERLEAVE-NEXT:    [[VAR2:%.*]] = udiv i32 [[VAR1]], [[I]]
@@ -2485,7 +2433,7 @@ define i32 @scalarize_induction_variable_05(i32* %a, i32 %x, i1 %c, i32 %n) {
 ; INTERLEAVE-NEXT:    [[COND:%.*]] = icmp slt i32 [[I_NEXT]], [[N]]
 ; INTERLEAVE-NEXT:    br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP25:![0-9]+]]
 ; INTERLEAVE:       for.end:
-; INTERLEAVE-NEXT:    [[VAR5:%.*]] = phi i32 [ [[VAR4]], [[IF_END]] ], [ [[TMP51]], [[MIDDLE_BLOCK]] ]
+; INTERLEAVE-NEXT:    [[VAR5:%.*]] = phi i32 [ [[VAR4]], [[IF_END]] ], [ [[TMP49]], [[MIDDLE_BLOCK]] ]
 ; INTERLEAVE-NEXT:    ret i32 [[VAR5]]
 ;
 entry:
@@ -2494,8 +2442,8 @@ entry:
 for.body:
   %i = phi i32 [ 0, %entry ], [ %i.next, %if.end ]
   %sum = phi i32 [ 0, %entry ], [ %var4, %if.end ]
-  %var0 = getelementptr inbounds i32, i32* %a, i32 %i
-  %var1 = load i32, i32* %var0, align 4
+  %var0 = getelementptr inbounds i32, ptr %a, i32 %i
+  %var1 = load i32, ptr %var0, align 4
   br i1 %c, label %if.then, label %if.end
 
 if.then:
@@ -2523,7 +2471,7 @@ for.end:
 ;
 
 %pair.i16 = type { i16, i16 }
-define void @iv_vector_and_scalar_users(%pair.i16* %p, i32 %a, i32 %n) {
+define void @iv_vector_and_scalar_users(ptr %p, i32 %a, i32 %n) {
 ; CHECK-LABEL: @iv_vector_and_scalar_users(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[N:%.*]], -1
@@ -2544,12 +2492,12 @@ define void @iv_vector_and_scalar_users(%pair.i16* %p, i32 %a, i32 %n) {
 ; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 1
 ; CHECK-NEXT:    [[TMP5:%.*]] = add <2 x i32> [[BROADCAST_SPLAT]], [[VEC_IND]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = trunc <2 x i32> [[TMP5]] to <2 x i16>
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[PAIR_I16:%.*]], %pair.i16* [[P:%.*]], i64 [[TMP3]], i32 1
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[PAIR_I16]], %pair.i16* [[P]], i64 [[TMP4]], i32 1
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[PAIR_I16:%.*]], ptr [[P:%.*]], i64 [[TMP3]], i32 1
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[PAIR_I16]], ptr [[P]], i64 [[TMP4]], i32 1
 ; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x i16> [[TMP6]], i32 0
-; CHECK-NEXT:    store i16 [[TMP9]], i16* [[TMP7]], align 2
+; CHECK-NEXT:    store i16 [[TMP9]], ptr [[TMP7]], align 2
 ; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <2 x i16> [[TMP6]], i32 1
-; CHECK-NEXT:    store i16 [[TMP10]], i16* [[TMP8]], align 2
+; CHECK-NEXT:    store i16 [[TMP10]], ptr [[TMP8]], align 2
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], <i32 2, i32 2>
 ; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -2565,8 +2513,8 @@ define void @iv_vector_and_scalar_users(%pair.i16* %p, i32 %a, i32 %n) {
 ; CHECK-NEXT:    [[TMP12:%.*]] = trunc i64 [[I]] to i32
 ; CHECK-NEXT:    [[TMP13:%.*]] = add i32 [[A]], [[TMP12]]
 ; CHECK-NEXT:    [[TMP14:%.*]] = trunc i32 [[TMP13]] to i16
-; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[PAIR_I16]], %pair.i16* [[P]], i64 [[I]], i32 1
-; CHECK-NEXT:    store i16 [[TMP14]], i16* [[TMP15]], align 2
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[PAIR_I16]], ptr [[P]], i64 [[I]], i32 1
+; CHECK-NEXT:    store i16 [[TMP14]], ptr [[TMP15]], align 2
 ; CHECK-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 1
 ; CHECK-NEXT:    [[TMP16:%.*]] = trunc i64 [[I_NEXT]] to i32
 ; CHECK-NEXT:    [[COND:%.*]] = icmp eq i32 [[TMP16]], [[N]]
@@ -2592,12 +2540,12 @@ define void @iv_vector_and_scalar_users(%pair.i16* %p, i32 %a, i32 %n) {
 ; IND-NEXT:    [[TMP3:%.*]] = or i64 [[INDEX]], 1
 ; IND-NEXT:    [[TMP4:%.*]] = add <2 x i32> [[BROADCAST_SPLAT]], [[VEC_IND]]
 ; IND-NEXT:    [[TMP5:%.*]] = trunc <2 x i32> [[TMP4]] to <2 x i16>
-; IND-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[PAIR_I16:%.*]], %pair.i16* [[P:%.*]], i64 [[INDEX]], i32 1
-; IND-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[PAIR_I16]], %pair.i16* [[P]], i64 [[TMP3]], i32 1
+; IND-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[PAIR_I16:%.*]], ptr [[P:%.*]], i64 [[INDEX]], i32 1
+; IND-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[PAIR_I16]], ptr [[P]], i64 [[TMP3]], i32 1
 ; IND-NEXT:    [[TMP8:%.*]] = extractelement <2 x i16> [[TMP5]], i64 0
-; IND-NEXT:    store i16 [[TMP8]], i16* [[TMP6]], align 2
+; IND-NEXT:    store i16 [[TMP8]], ptr [[TMP6]], align 2
 ; IND-NEXT:    [[TMP9:%.*]] = extractelement <2 x i16> [[TMP5]], i64 1
-; IND-NEXT:    store i16 [[TMP9]], i16* [[TMP7]], align 2
+; IND-NEXT:    store i16 [[TMP9]], ptr [[TMP7]], align 2
 ; IND-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; IND-NEXT:    [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], <i32 2, i32 2>
 ; IND-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -2613,8 +2561,8 @@ define void @iv_vector_and_scalar_users(%pair.i16* %p, i32 %a, i32 %n) {
 ; IND-NEXT:    [[TMP11:%.*]] = trunc i64 [[I]] to i32
 ; IND-NEXT:    [[TMP12:%.*]] = add i32 [[TMP11]], [[A]]
 ; IND-NEXT:    [[TMP13:%.*]] = trunc i32 [[TMP12]] to i16
-; IND-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [[PAIR_I16]], %pair.i16* [[P]], i64 [[I]], i32 1
-; IND-NEXT:    store i16 [[TMP13]], i16* [[TMP14]], align 2
+; IND-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [[PAIR_I16]], ptr [[P]], i64 [[I]], i32 1
+; IND-NEXT:    store i16 [[TMP13]], ptr [[TMP14]], align 2
 ; IND-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 1
 ; IND-NEXT:    [[TMP15:%.*]] = trunc i64 [[I_NEXT]] to i32
 ; IND-NEXT:    [[COND:%.*]] = icmp eq i32 [[TMP15]], [[N]]
@@ -2647,18 +2595,18 @@ define void @iv_vector_and_scalar_users(%pair.i16* %p, i32 %a, i32 %n) {
 ; UNROLL-NEXT:    [[TMP9:%.*]] = add <2 x i32> [[TMP8]], [[VEC_IND]]
 ; UNROLL-NEXT:    [[TMP10:%.*]] = trunc <2 x i32> [[TMP6]] to <2 x i16>
 ; UNROLL-NEXT:    [[TMP11:%.*]] = trunc <2 x i32> [[TMP9]] to <2 x i16>
-; UNROLL-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [[PAIR_I16:%.*]], %pair.i16* [[P:%.*]], i64 [[INDEX]], i32 1
-; UNROLL-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [[PAIR_I16]], %pair.i16* [[P]], i64 [[TMP3]], i32 1
-; UNROLL-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [[PAIR_I16]], %pair.i16* [[P]], i64 [[TMP4]], i32 1
-; UNROLL-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[PAIR_I16]], %pair.i16* [[P]], i64 [[TMP5]], i32 1
+; UNROLL-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [[PAIR_I16:%.*]], ptr [[P:%.*]], i64 [[INDEX]], i32 1
+; UNROLL-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [[PAIR_I16]], ptr [[P]], i64 [[TMP3]], i32 1
+; UNROLL-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [[PAIR_I16]], ptr [[P]], i64 [[TMP4]], i32 1
+; UNROLL-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[PAIR_I16]], ptr [[P]], i64 [[TMP5]], i32 1
 ; UNROLL-NEXT:    [[TMP16:%.*]] = extractelement <2 x i16> [[TMP10]], i64 0
-; UNROLL-NEXT:    store i16 [[TMP16]], i16* [[TMP12]], align 2
+; UNROLL-NEXT:    store i16 [[TMP16]], ptr [[TMP12]], align 2
 ; UNROLL-NEXT:    [[TMP17:%.*]] = extractelement <2 x i16> [[TMP10]], i64 1
-; UNROLL-NEXT:    store i16 [[TMP17]], i16* [[TMP13]], align 2
+; UNROLL-NEXT:    store i16 [[TMP17]], ptr [[TMP13]], align 2
 ; UNROLL-NEXT:    [[TMP18:%.*]] = extractelement <2 x i16> [[TMP11]], i64 0
-; UNROLL-NEXT:    store i16 [[TMP18]], i16* [[TMP14]], align 2
+; UNROLL-NEXT:    store i16 [[TMP18]], ptr [[TMP14]], align 2
 ; UNROLL-NEXT:    [[TMP19:%.*]] = extractelement <2 x i16> [[TMP11]], i64 1
-; UNROLL-NEXT:    store i16 [[TMP19]], i16* [[TMP15]], align 2
+; UNROLL-NEXT:    store i16 [[TMP19]], ptr [[TMP15]], align 2
 ; UNROLL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; UNROLL-NEXT:    [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], <i32 4, i32 4>
 ; UNROLL-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -2674,8 +2622,8 @@ define void @iv_vector_and_scalar_users(%pair.i16* %p, i32 %a, i32 %n) {
 ; UNROLL-NEXT:    [[TMP21:%.*]] = trunc i64 [[I]] to i32
 ; UNROLL-NEXT:    [[TMP22:%.*]] = add i32 [[TMP21]], [[A]]
 ; UNROLL-NEXT:    [[TMP23:%.*]] = trunc i32 [[TMP22]] to i16
-; UNROLL-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[PAIR_I16]], %pair.i16* [[P]], i64 [[I]], i32 1
-; UNROLL-NEXT:    store i16 [[TMP23]], i16* [[TMP24]], align 2
+; UNROLL-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[PAIR_I16]], ptr [[P]], i64 [[I]], i32 1
+; UNROLL-NEXT:    store i16 [[TMP23]], ptr [[TMP24]], align 2
 ; UNROLL-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 1
 ; UNROLL-NEXT:    [[TMP25:%.*]] = trunc i64 [[I_NEXT]] to i32
 ; UNROLL-NEXT:    [[COND:%.*]] = icmp eq i32 [[TMP25]], [[N]]
@@ -2710,18 +2658,18 @@ define void @iv_vector_and_scalar_users(%pair.i16* %p, i32 %a, i32 %n) {
 ; UNROLL-NO-IC-NEXT:    [[TMP8:%.*]] = add <2 x i32> [[BROADCAST_SPLAT3]], [[STEP_ADD]]
 ; UNROLL-NO-IC-NEXT:    [[TMP9:%.*]] = trunc <2 x i32> [[TMP7]] to <2 x i16>
 ; UNROLL-NO-IC-NEXT:    [[TMP10:%.*]] = trunc <2 x i32> [[TMP8]] to <2 x i16>
-; UNROLL-NO-IC-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [[PAIR_I16:%.*]], %pair.i16* [[P:%.*]], i64 [[TMP3]], i32 1
-; UNROLL-NO-IC-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [[PAIR_I16]], %pair.i16* [[P]], i64 [[TMP4]], i32 1
-; UNROLL-NO-IC-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [[PAIR_I16]], %pair.i16* [[P]], i64 [[TMP5]], i32 1
-; UNROLL-NO-IC-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [[PAIR_I16]], %pair.i16* [[P]], i64 [[TMP6]], i32 1
+; UNROLL-NO-IC-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [[PAIR_I16:%.*]], ptr [[P:%.*]], i64 [[TMP3]], i32 1
+; UNROLL-NO-IC-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [[PAIR_I16]], ptr [[P]], i64 [[TMP4]], i32 1
+; UNROLL-NO-IC-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [[PAIR_I16]], ptr [[P]], i64 [[TMP5]], i32 1
+; UNROLL-NO-IC-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [[PAIR_I16]], ptr [[P]], i64 [[TMP6]], i32 1
 ; UNROLL-NO-IC-NEXT:    [[TMP15:%.*]] = extractelement <2 x i16> [[TMP9]], i32 0
-; UNROLL-NO-IC-NEXT:    store i16 [[TMP15]], i16* [[TMP11]], align 2
+; UNROLL-NO-IC-NEXT:    store i16 [[TMP15]], ptr [[TMP11]], align 2
 ; UNROLL-NO-IC-NEXT:    [[TMP16:%.*]] = extractelement <2 x i16> [[TMP9]], i32 1
-; UNROLL-NO-IC-NEXT:    store i16 [[TMP16]], i16* [[TMP12]], align 2
+; UNROLL-NO-IC-NEXT:    store i16 [[TMP16]], ptr [[TMP12]], align 2
 ; UNROLL-NO-IC-NEXT:    [[TMP17:%.*]] = extractelement <2 x i16> [[TMP10]], i32 0
-; UNROLL-NO-IC-NEXT:    store i16 [[TMP17]], i16* [[TMP13]], align 2
+; UNROLL-NO-IC-NEXT:    store i16 [[TMP17]], ptr [[TMP13]], align 2
 ; UNROLL-NO-IC-NEXT:    [[TMP18:%.*]] = extractelement <2 x i16> [[TMP10]], i32 1
-; UNROLL-NO-IC-NEXT:    store i16 [[TMP18]], i16* [[TMP14]], align 2
+; UNROLL-NO-IC-NEXT:    store i16 [[TMP18]], ptr [[TMP14]], align 2
 ; UNROLL-NO-IC-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; UNROLL-NO-IC-NEXT:    [[VEC_IND_NEXT]] = add <2 x i32> [[STEP_ADD]], <i32 2, i32 2>
 ; UNROLL-NO-IC-NEXT:    [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -2737,8 +2685,8 @@ define void @iv_vector_and_scalar_users(%pair.i16* %p, i32 %a, i32 %n) {
 ; UNROLL-NO-IC-NEXT:    [[TMP20:%.*]] = trunc i64 [[I]] to i32
 ; UNROLL-NO-IC-NEXT:    [[TMP21:%.*]] = add i32 [[A]], [[TMP20]]
 ; UNROLL-NO-IC-NEXT:    [[TMP22:%.*]] = trunc i32 [[TMP21]] to i16
-; UNROLL-NO-IC-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[PAIR_I16]], %pair.i16* [[P]], i64 [[I]], i32 1
-; UNROLL-NO-IC-NEXT:    store i16 [[TMP22]], i16* [[TMP23]], align 2
+; UNROLL-NO-IC-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[PAIR_I16]], ptr [[P]], i64 [[I]], i32 1
+; UNROLL-NO-IC-NEXT:    store i16 [[TMP22]], ptr [[TMP23]], align 2
 ; UNROLL-NO-IC-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 1
 ; UNROLL-NO-IC-NEXT:    [[TMP24:%.*]] = trunc i64 [[I_NEXT]] to i32
 ; UNROLL-NO-IC-NEXT:    [[COND:%.*]] = icmp eq i32 [[TMP24]], [[N]]
@@ -2775,30 +2723,30 @@ define void @iv_vector_and_scalar_users(%pair.i16* %p, i32 %a, i32 %n) {
 ; INTERLEAVE-NEXT:    [[TMP13:%.*]] = add <4 x i32> [[TMP12]], [[VEC_IND]]
 ; INTERLEAVE-NEXT:    [[TMP14:%.*]] = trunc <4 x i32> [[TMP10]] to <4 x i16>
 ; INTERLEAVE-NEXT:    [[TMP15:%.*]] = trunc <4 x i32> [[TMP13]] to <4 x i16>
-; INTERLEAVE-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[PAIR_I16:%.*]], %pair.i16* [[P:%.*]], i64 [[INDEX]], i32 1
-; INTERLEAVE-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[PAIR_I16]], %pair.i16* [[P]], i64 [[TMP3]], i32 1
-; INTERLEAVE-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[PAIR_I16]], %pair.i16* [[P]], i64 [[TMP4]], i32 1
-; INTERLEAVE-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[PAIR_I16]], %pair.i16* [[P]], i64 [[TMP5]], i32 1
-; INTERLEAVE-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[PAIR_I16]], %pair.i16* [[P]], i64 [[TMP6]], i32 1
-; INTERLEAVE-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[PAIR_I16]], %pair.i16* [[P]], i64 [[TMP7]], i32 1
-; INTERLEAVE-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[PAIR_I16]], %pair.i16* [[P]], i64 [[TMP8]], i32 1
-; INTERLEAVE-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[PAIR_I16]], %pair.i16* [[P]], i64 [[TMP9]], i32 1
+; INTERLEAVE-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[PAIR_I16:%.*]], ptr [[P:%.*]], i64 [[INDEX]], i32 1
+; INTERLEAVE-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[PAIR_I16]], ptr [[P]], i64 [[TMP3]], i32 1
+; INTERLEAVE-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[PAIR_I16]], ptr [[P]], i64 [[TMP4]], i32 1
+; INTERLEAVE-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[PAIR_I16]], ptr [[P]], i64 [[TMP5]], i32 1
+; INTERLEAVE-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[PAIR_I16]], ptr [[P]], i64 [[TMP6]], i32 1
+; INTERLEAVE-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[PAIR_I16]], ptr [[P]], i64 [[TMP7]], i32 1
+; INTERLEAVE-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[PAIR_I16]], ptr [[P]], i64 [[TMP8]], i32 1
+; INTERLEAVE-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[PAIR_I16]], ptr [[P]], i64 [[TMP9]], i32 1
 ; INTERLEAVE-NEXT:    [[TMP24:%.*]] = extractelement <4 x i16> [[TMP14]], i64 0
-; INTERLEAVE-NEXT:    store i16 [[TMP24]], i16* [[TMP16]], align 2
+; INTERLEAVE-NEXT:    store i16 [[TMP24]], ptr [[TMP16]], align 2
 ; INTERLEAVE-NEXT:    [[TMP25:%.*]] = extractelement <4 x i16> [[TMP14]], i64 1
-; INTERLEAVE-NEXT:    store i16 [[TMP25]], i16* [[TMP17]], align 2
+; INTERLEAVE-NEXT:    store i16 [[TMP25]], ptr [[TMP17]], align 2
 ; INTERLEAVE-NEXT:    [[TMP26:%.*]] = extractelement <4 x i16> [[TMP14]], i64 2
-; INTERLEAVE-NEXT:    store i16 [[TMP26]], i16* [[TMP18]], align 2
+; INTERLEAVE-NEXT:    store i16 [[TMP26]], ptr [[TMP18]], align 2
 ; INTERLEAVE-NEXT:    [[TMP27:%.*]] = extractelement <4 x i16> [[TMP14]], i64 3
-; INTERLEAVE-NEXT:    store i16 [[TMP27]], i16* [[TMP19]], align 2
+; INTERLEAVE-NEXT:    store i16 [[TMP27]], ptr [[TMP19]], align 2
 ; INTERLEAVE-NEXT:    [[TMP28:%.*]] = extractelement <4 x i16> [[TMP15]], i64 0
-; INTERLEAVE-NEXT:    store i16 [[TMP28]], i16* [[TMP20]], align 2
+; INTERLEAVE-NEXT:    store i16 [[TMP28]], ptr [[TMP20]], align 2
 ; INTERLEAVE-NEXT:    [[TMP29:%.*]] = extractelement <4 x i16> [[TMP15]], i64 1
-; INTERLEAVE-NEXT:    store i16 [[TMP29]], i16* [[TMP21]], align 2
+; INTERLEAVE-NEXT:    store i16 [[TMP29]], ptr [[TMP21]], align 2
 ; INTERLEAVE-NEXT:    [[TMP30:%.*]] = extractelement <4 x i16> [[TMP15]], i64 2
-; INTERLEAVE-NEXT:    store i16 [[TMP30]], i16* [[TMP22]], align 2
+; INTERLEAVE-NEXT:    store i16 [[TMP30]], ptr [[TMP22]], align 2
 ; INTERLEAVE-NEXT:    [[TMP31:%.*]] = extractelement <4 x i16> [[TMP15]], i64 3
-; INTERLEAVE-NEXT:    store i16 [[TMP31]], i16* [[TMP23]], align 2
+; INTERLEAVE-NEXT:    store i16 [[TMP31]], ptr [[TMP23]], align 2
 ; INTERLEAVE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
 ; INTERLEAVE-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 8, i32 8, i32 8, i32 8>
 ; INTERLEAVE-NEXT:    [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -2814,8 +2762,8 @@ define void @iv_vector_and_scalar_users(%pair.i16* %p, i32 %a, i32 %n) {
 ; INTERLEAVE-NEXT:    [[TMP33:%.*]] = trunc i64 [[I]] to i32
 ; INTERLEAVE-NEXT:    [[TMP34:%.*]] = add i32 [[TMP33]], [[A]]
 ; INTERLEAVE-NEXT:    [[TMP35:%.*]] = trunc i32 [[TMP34]] to i16
-; INTERLEAVE-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [[PAIR_I16]], %pair.i16* [[P]], i64 [[I]], i32 1
-; INTERLEAVE-NEXT:    store i16 [[TMP35]], i16* [[TMP36]], align 2
+; INTERLEAVE-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [[PAIR_I16]], ptr [[P]], i64 [[I]], i32 1
+; INTERLEAVE-NEXT:    store i16 [[TMP35]], ptr [[TMP36]], align 2
 ; INTERLEAVE-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 1
 ; INTERLEAVE-NEXT:    [[TMP37:%.*]] = trunc i64 [[I_NEXT]] to i32
 ; INTERLEAVE-NEXT:    [[COND:%.*]] = icmp eq i32 [[TMP37]], [[N]]
@@ -2831,8 +2779,8 @@ for.body:
   %0 = trunc i64 %i to i32
   %1 = add i32 %a, %0
   %2 = trunc i32 %1 to i16
-  %3 = getelementptr inbounds %pair.i16, %pair.i16* %p, i64 %i, i32 1
-  store i16 %2, i16* %3, align 2
+  %3 = getelementptr inbounds %pair.i16, ptr %p, i64 %i, i32 1
+  store i16 %2, ptr %3, align 2
   %i.next = add nuw nsw i64 %i, 1
   %4 = trunc i64 %i.next to i32
   %cond = icmp eq i32 %4, %n
@@ -3279,9 +3227,9 @@ exit:
 define i32 @testoverflowcheck() {
 ; CHECK-LABEL: @testoverflowcheck(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[DOTPR_I:%.*]] = load i8, i8* @e, align 1
-; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* @d, align 4
-; CHECK-NEXT:    [[C_PROMOTED_I:%.*]] = load i32, i32* @c, align 4
+; CHECK-NEXT:    [[DOTPR_I:%.*]] = load i8, ptr @e, align 1
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr @d, align 4
+; CHECK-NEXT:    [[C_PROMOTED_I:%.*]] = load i32, ptr @c, align 4
 ; CHECK-NEXT:    [[TMP1:%.*]] = sub i8 -1, [[DOTPR_I]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = zext i8 [[TMP1]] to i32
 ; CHECK-NEXT:    [[TMP3:%.*]] = add nuw nsw i32 [[TMP2]], 1
@@ -3290,8 +3238,8 @@ define i32 @testoverflowcheck() {
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[TMP3]], 2
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i32 [[TMP3]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[CAST_VTC:%.*]] = trunc i32 [[N_VEC]] to i8
-; CHECK-NEXT:    [[IND_END:%.*]] = add i8 [[DOTPR_I]], [[CAST_VTC]]
+; CHECK-NEXT:    [[DOTCAST:%.*]] = trunc i32 [[N_VEC]] to i8
+; CHECK-NEXT:    [[IND_END:%.*]] = add i8 [[DOTPR_I]], [[DOTCAST]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i32> <i32 -1, i32 -1>, i32 [[C_PROMOTED_I]], i32 0
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0]], i32 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
@@ -3324,9 +3272,9 @@ define i32 @testoverflowcheck() {
 ;
 ; IND-LABEL: @testoverflowcheck(
 ; IND-NEXT:  entry:
-; IND-NEXT:    [[DOTPR_I:%.*]] = load i8, i8* @e, align 1
-; IND-NEXT:    [[TMP0:%.*]] = load i32, i32* @d, align 4
-; IND-NEXT:    [[C_PROMOTED_I:%.*]] = load i32, i32* @c, align 4
+; IND-NEXT:    [[DOTPR_I:%.*]] = load i8, ptr @e, align 1
+; IND-NEXT:    [[TMP0:%.*]] = load i32, ptr @d, align 4
+; IND-NEXT:    [[C_PROMOTED_I:%.*]] = load i32, ptr @c, align 4
 ; IND-NEXT:    [[TMP1:%.*]] = xor i8 [[DOTPR_I]], -1
 ; IND-NEXT:    [[TMP2:%.*]] = zext i8 [[TMP1]] to i32
 ; IND-NEXT:    [[TMP3:%.*]] = add nuw nsw i32 [[TMP2]], 1
@@ -3334,8 +3282,8 @@ define i32 @testoverflowcheck() {
 ; IND-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; IND:       vector.ph:
 ; IND-NEXT:    [[N_VEC:%.*]] = and i32 [[TMP3]], -2
-; IND-NEXT:    [[CAST_VTC:%.*]] = trunc i32 [[N_VEC]] to i8
-; IND-NEXT:    [[IND_END:%.*]] = add i8 [[DOTPR_I]], [[CAST_VTC]]
+; IND-NEXT:    [[DOTCAST:%.*]] = trunc i32 [[N_VEC]] to i8
+; IND-NEXT:    [[IND_END:%.*]] = add i8 [[DOTPR_I]], [[DOTCAST]]
 ; IND-NEXT:    [[TMP4:%.*]] = insertelement <2 x i32> <i32 poison, i32 -1>, i32 [[C_PROMOTED_I]], i64 0
 ; IND-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0]], i64 0
 ; IND-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
@@ -3366,9 +3314,9 @@ define i32 @testoverflowcheck() {
 ;
 ; UNROLL-LABEL: @testoverflowcheck(
 ; UNROLL-NEXT:  entry:
-; UNROLL-NEXT:    [[DOTPR_I:%.*]] = load i8, i8* @e, align 1
-; UNROLL-NEXT:    [[TMP0:%.*]] = load i32, i32* @d, align 4
-; UNROLL-NEXT:    [[C_PROMOTED_I:%.*]] = load i32, i32* @c, align 4
+; UNROLL-NEXT:    [[DOTPR_I:%.*]] = load i8, ptr @e, align 1
+; UNROLL-NEXT:    [[TMP0:%.*]] = load i32, ptr @d, align 4
+; UNROLL-NEXT:    [[C_PROMOTED_I:%.*]] = load i32, ptr @c, align 4
 ; UNROLL-NEXT:    [[TMP1:%.*]] = xor i8 [[DOTPR_I]], -1
 ; UNROLL-NEXT:    [[TMP2:%.*]] = zext i8 [[TMP1]] to i32
 ; UNROLL-NEXT:    [[TMP3:%.*]] = add nuw nsw i32 [[TMP2]], 1
@@ -3376,8 +3324,8 @@ define i32 @testoverflowcheck() {
 ; UNROLL-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; UNROLL:       vector.ph:
 ; UNROLL-NEXT:    [[N_VEC:%.*]] = and i32 [[TMP3]], -4
-; UNROLL-NEXT:    [[CAST_VTC:%.*]] = trunc i32 [[N_VEC]] to i8
-; UNROLL-NEXT:    [[IND_END:%.*]] = add i8 [[DOTPR_I]], [[CAST_VTC]]
+; UNROLL-NEXT:    [[DOTCAST:%.*]] = trunc i32 [[N_VEC]] to i8
+; UNROLL-NEXT:    [[IND_END:%.*]] = add i8 [[DOTPR_I]], [[DOTCAST]]
 ; UNROLL-NEXT:    [[TMP4:%.*]] = insertelement <2 x i32> <i32 poison, i32 -1>, i32 [[C_PROMOTED_I]], i64 0
 ; UNROLL-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0]], i64 0
 ; UNROLL-NEXT:    [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0]], i64 0
@@ -3410,9 +3358,9 @@ define i32 @testoverflowcheck() {
 ;
 ; UNROLL-NO-IC-LABEL: @testoverflowcheck(
 ; UNROLL-NO-IC-NEXT:  entry:
-; UNROLL-NO-IC-NEXT:    [[DOTPR_I:%.*]] = load i8, i8* @e, align 1
-; UNROLL-NO-IC-NEXT:    [[TMP0:%.*]] = load i32, i32* @d, align 4
-; UNROLL-NO-IC-NEXT:    [[C_PROMOTED_I:%.*]] = load i32, i32* @c, align 4
+; UNROLL-NO-IC-NEXT:    [[DOTPR_I:%.*]] = load i8, ptr @e, align 1
+; UNROLL-NO-IC-NEXT:    [[TMP0:%.*]] = load i32, ptr @d, align 4
+; UNROLL-NO-IC-NEXT:    [[C_PROMOTED_I:%.*]] = load i32, ptr @c, align 4
 ; UNROLL-NO-IC-NEXT:    [[TMP1:%.*]] = sub i8 -1, [[DOTPR_I]]
 ; UNROLL-NO-IC-NEXT:    [[TMP2:%.*]] = zext i8 [[TMP1]] to i32
 ; UNROLL-NO-IC-NEXT:    [[TMP3:%.*]] = add nuw nsw i32 [[TMP2]], 1
@@ -3421,8 +3369,8 @@ define i32 @testoverflowcheck() {
 ; UNROLL-NO-IC:       vector.ph:
 ; UNROLL-NO-IC-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[TMP3]], 4
 ; UNROLL-NO-IC-NEXT:    [[N_VEC:%.*]] = sub i32 [[TMP3]], [[N_MOD_VF]]
-; UNROLL-NO-IC-NEXT:    [[CAST_VTC:%.*]] = trunc i32 [[N_VEC]] to i8
-; UNROLL-NO-IC-NEXT:    [[IND_END:%.*]] = add i8 [[DOTPR_I]], [[CAST_VTC]]
+; UNROLL-NO-IC-NEXT:    [[DOTCAST:%.*]] = trunc i32 [[N_VEC]] to i8
+; UNROLL-NO-IC-NEXT:    [[IND_END:%.*]] = add i8 [[DOTPR_I]], [[DOTCAST]]
 ; UNROLL-NO-IC-NEXT:    [[TMP4:%.*]] = insertelement <2 x i32> <i32 -1, i32 -1>, i32 [[C_PROMOTED_I]], i32 0
 ; UNROLL-NO-IC-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0]], i32 0
 ; UNROLL-NO-IC-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
@@ -3460,9 +3408,9 @@ define i32 @testoverflowcheck() {
 ;
 ; INTERLEAVE-LABEL: @testoverflowcheck(
 ; INTERLEAVE-NEXT:  entry:
-; INTERLEAVE-NEXT:    [[DOTPR_I:%.*]] = load i8, i8* @e, align 1
-; INTERLEAVE-NEXT:    [[TMP0:%.*]] = load i32, i32* @d, align 4
-; INTERLEAVE-NEXT:    [[C_PROMOTED_I:%.*]] = load i32, i32* @c, align 4
+; INTERLEAVE-NEXT:    [[DOTPR_I:%.*]] = load i8, ptr @e, align 1
+; INTERLEAVE-NEXT:    [[TMP0:%.*]] = load i32, ptr @d, align 4
+; INTERLEAVE-NEXT:    [[C_PROMOTED_I:%.*]] = load i32, ptr @c, align 4
 ; INTERLEAVE-NEXT:    [[TMP1:%.*]] = xor i8 [[DOTPR_I]], -1
 ; INTERLEAVE-NEXT:    [[TMP2:%.*]] = zext i8 [[TMP1]] to i32
 ; INTERLEAVE-NEXT:    [[TMP3:%.*]] = add nuw nsw i32 [[TMP2]], 1
@@ -3470,8 +3418,8 @@ define i32 @testoverflowcheck() {
 ; INTERLEAVE-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; INTERLEAVE:       vector.ph:
 ; INTERLEAVE-NEXT:    [[N_VEC:%.*]] = and i32 [[TMP3]], -8
-; INTERLEAVE-NEXT:    [[CAST_VTC:%.*]] = trunc i32 [[N_VEC]] to i8
-; INTERLEAVE-NEXT:    [[IND_END:%.*]] = add i8 [[DOTPR_I]], [[CAST_VTC]]
+; INTERLEAVE-NEXT:    [[DOTCAST:%.*]] = trunc i32 [[N_VEC]] to i8
+; INTERLEAVE-NEXT:    [[IND_END:%.*]] = add i8 [[DOTPR_I]], [[DOTCAST]]
 ; INTERLEAVE-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> <i32 poison, i32 -1, i32 -1, i32 -1>, i32 [[C_PROMOTED_I]], i64 0
 ; INTERLEAVE-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i64 0
 ; INTERLEAVE-NEXT:    [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i64 0
@@ -3503,9 +3451,9 @@ define i32 @testoverflowcheck() {
 ; INTERLEAVE-NEXT:    ret i32 [[AND_I_LCSSA]]
 ;
 entry:
-  %.pr.i = load i8, i8* @e, align 1
-  %0 = load i32, i32* @d, align 4
-  %c.promoted.i = load i32, i32* @c, align 4
+  %.pr.i = load i8, ptr @e, align 1
+  %0 = load i32, ptr @d, align 4
+  %c.promoted.i = load i32, ptr @c, align 4
   br label %cond.end.i
 
 cond.end.i:
@@ -3525,7 +3473,7 @@ loopexit:
 ; we need to convert the SCEV expression into an AddRecExpr.
 ; The expression gets converted to {zext i8 %t to i32,+,1}.
 
-define void @wrappingindvars1(i8 %t, i32 %len, i32 *%A) {
+define void @wrappingindvars1(i8 %t, i32 %len, ptr %A) {
 ; CHECK-LABEL: @wrappingindvars1(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[ST:%.*]] = zext i8 [[T:%.*]] to i16
@@ -3552,8 +3500,8 @@ define void @wrappingindvars1(i8 %t, i32 %len, i32 *%A) {
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[TMP0]], 2
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i32 [[TMP0]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[CAST_VTC:%.*]] = trunc i32 [[N_VEC]] to i8
-; CHECK-NEXT:    [[IND_END:%.*]] = add i8 [[T]], [[CAST_VTC]]
+; CHECK-NEXT:    [[DOTCAST:%.*]] = trunc i32 [[N_VEC]] to i8
+; CHECK-NEXT:    [[IND_END:%.*]] = add i8 [[T]], [[DOTCAST]]
 ; CHECK-NEXT:    [[IND_END2:%.*]] = add i32 [[EXT]], [[N_VEC]]
 ; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[EXT]], i32 0
 ; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
@@ -3562,17 +3510,16 @@ define void @wrappingindvars1(i8 %t, i32 %len, i32 *%A) {
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <2 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP12:%.*]] = trunc i32 [[INDEX]] to i8
-; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = add i8 [[T]], [[TMP12]]
-; CHECK-NEXT:    [[TMP13:%.*]] = add i8 [[OFFSET_IDX]], 0
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i8 [[TMP13]]
-; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TMP14]], i32 0
-; CHECK-NEXT:    [[TMP16:%.*]] = bitcast i32* [[TMP15]] to <2 x i32>*
-; CHECK-NEXT:    store <2 x i32> [[VEC_IND]], <2 x i32>* [[TMP16]], align 4
+; CHECK-NEXT:    [[DOTCAST4:%.*]] = trunc i32 [[INDEX]] to i8
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = add i8 [[T]], [[DOTCAST4]]
+; CHECK-NEXT:    [[TMP12:%.*]] = add i8 [[OFFSET_IDX]], 0
+; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i8 [[TMP12]]
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0
+; CHECK-NEXT:    store <2 x i32> [[VEC_IND]], ptr [[TMP14]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], <i32 2, i32 2>
-; CHECK-NEXT:    [[TMP17:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]]
+; CHECK-NEXT:    [[TMP15:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[TMP0]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
@@ -3585,8 +3532,8 @@ define void @wrappingindvars1(i8 %t, i32 %len, i32 *%A) {
 ; CHECK-NEXT:    [[IDX:%.*]] = phi i8 [ [[IDX_INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; CHECK-NEXT:    [[IDX_B:%.*]] = phi i32 [ [[IDX_B_INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ]
 ; CHECK-NEXT:    [[SPHI:%.*]] = phi i32 [ [[IDX_INC_EXT:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL3]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[PTR:%.*]] = getelementptr inbounds i32, i32* [[A]], i8 [[IDX]]
-; CHECK-NEXT:    store i32 [[SPHI]], i32* [[PTR]], align 4
+; CHECK-NEXT:    [[PTR:%.*]] = getelementptr inbounds i32, ptr [[A]], i8 [[IDX]]
+; CHECK-NEXT:    store i32 [[SPHI]], ptr [[PTR]], align 4
 ; CHECK-NEXT:    [[IDX_INC]] = add i8 [[IDX]], 1
 ; CHECK-NEXT:    [[IDX_INC_EXT]] = zext i8 [[IDX_INC]] to i32
 ; CHECK-NEXT:    [[IDX_B_INC]] = add nuw nsw i32 [[IDX_B]], 1
@@ -3619,8 +3566,8 @@ define void @wrappingindvars1(i8 %t, i32 %len, i32 *%A) {
 ; IND-NEXT:    br i1 [[TMP9]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
 ; IND:       vector.ph:
 ; IND-NEXT:    [[N_VEC:%.*]] = and i32 [[TMP0]], -2
-; IND-NEXT:    [[CAST_VTC:%.*]] = trunc i32 [[N_VEC]] to i8
-; IND-NEXT:    [[IND_END:%.*]] = add i8 [[CAST_VTC]], [[T]]
+; IND-NEXT:    [[DOTCAST:%.*]] = trunc i32 [[N_VEC]] to i8
+; IND-NEXT:    [[IND_END:%.*]] = add i8 [[DOTCAST]], [[T]]
 ; IND-NEXT:    [[IND_END2:%.*]] = add i32 [[N_VEC]], [[EXT]]
 ; IND-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[EXT]], i64 0
 ; IND-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
@@ -3629,16 +3576,15 @@ define void @wrappingindvars1(i8 %t, i32 %len, i32 *%A) {
 ; IND:       vector.body:
 ; IND-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; IND-NEXT:    [[VEC_IND:%.*]] = phi <2 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; IND-NEXT:    [[TMP10:%.*]] = trunc i32 [[INDEX]] to i8
-; IND-NEXT:    [[OFFSET_IDX:%.*]] = add i8 [[TMP10]], [[T]]
-; IND-NEXT:    [[TMP11:%.*]] = sext i8 [[OFFSET_IDX]] to i64
-; IND-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP11]]
-; IND-NEXT:    [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <2 x i32>*
-; IND-NEXT:    store <2 x i32> [[VEC_IND]], <2 x i32>* [[TMP13]], align 4
+; IND-NEXT:    [[DOTCAST4:%.*]] = trunc i32 [[INDEX]] to i8
+; IND-NEXT:    [[OFFSET_IDX:%.*]] = add i8 [[DOTCAST4]], [[T]]
+; IND-NEXT:    [[TMP10:%.*]] = sext i8 [[OFFSET_IDX]] to i64
+; IND-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP10]]
+; IND-NEXT:    store <2 x i32> [[VEC_IND]], ptr [[TMP11]], align 4
 ; IND-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
 ; IND-NEXT:    [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], <i32 2, i32 2>
-; IND-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; IND-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]]
+; IND-NEXT:    [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; IND-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]]
 ; IND:       middle.block:
 ; IND-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[TMP0]], [[N_VEC]]
 ; IND-NEXT:    br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
@@ -3651,9 +3597,9 @@ define void @wrappingindvars1(i8 %t, i32 %len, i32 *%A) {
 ; IND-NEXT:    [[IDX:%.*]] = phi i8 [ [[IDX_INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; IND-NEXT:    [[IDX_B:%.*]] = phi i32 [ [[IDX_B_INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ]
 ; IND-NEXT:    [[SPHI:%.*]] = phi i32 [ [[IDX_INC_EXT:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL3]], [[SCALAR_PH]] ]
-; IND-NEXT:    [[TMP15:%.*]] = sext i8 [[IDX]] to i64
-; IND-NEXT:    [[PTR:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP15]]
-; IND-NEXT:    store i32 [[SPHI]], i32* [[PTR]], align 4
+; IND-NEXT:    [[TMP13:%.*]] = sext i8 [[IDX]] to i64
+; IND-NEXT:    [[PTR:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP13]]
+; IND-NEXT:    store i32 [[SPHI]], ptr [[PTR]], align 4
 ; IND-NEXT:    [[IDX_INC]] = add i8 [[IDX]], 1
 ; IND-NEXT:    [[IDX_INC_EXT]] = zext i8 [[IDX_INC]] to i32
 ; IND-NEXT:    [[IDX_B_INC]] = add nuw nsw i32 [[IDX_B]], 1
@@ -3686,8 +3632,8 @@ define void @wrappingindvars1(i8 %t, i32 %len, i32 *%A) {
 ; UNROLL-NEXT:    br i1 [[TMP9]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
 ; UNROLL:       vector.ph:
 ; UNROLL-NEXT:    [[N_VEC:%.*]] = and i32 [[TMP0]], -4
-; UNROLL-NEXT:    [[CAST_VTC:%.*]] = trunc i32 [[N_VEC]] to i8
-; UNROLL-NEXT:    [[IND_END:%.*]] = add i8 [[CAST_VTC]], [[T]]
+; UNROLL-NEXT:    [[DOTCAST:%.*]] = trunc i32 [[N_VEC]] to i8
+; UNROLL-NEXT:    [[IND_END:%.*]] = add i8 [[DOTCAST]], [[T]]
 ; UNROLL-NEXT:    [[IND_END2:%.*]] = add i32 [[N_VEC]], [[EXT]]
 ; UNROLL-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[EXT]], i64 0
 ; UNROLL-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
@@ -3697,19 +3643,17 @@ define void @wrappingindvars1(i8 %t, i32 %len, i32 *%A) {
 ; UNROLL-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; UNROLL-NEXT:    [[VEC_IND:%.*]] = phi <2 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; UNROLL-NEXT:    [[STEP_ADD:%.*]] = add <2 x i32> [[VEC_IND]], <i32 2, i32 2>
-; UNROLL-NEXT:    [[TMP10:%.*]] = trunc i32 [[INDEX]] to i8
-; UNROLL-NEXT:    [[OFFSET_IDX:%.*]] = add i8 [[TMP10]], [[T]]
-; UNROLL-NEXT:    [[TMP11:%.*]] = sext i8 [[OFFSET_IDX]] to i64
-; UNROLL-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP11]]
-; UNROLL-NEXT:    [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <2 x i32>*
-; UNROLL-NEXT:    store <2 x i32> [[VEC_IND]], <2 x i32>* [[TMP13]], align 4
-; UNROLL-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP12]], i64 2
-; UNROLL-NEXT:    [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <2 x i32>*
-; UNROLL-NEXT:    store <2 x i32> [[STEP_ADD]], <2 x i32>* [[TMP15]], align 4
+; UNROLL-NEXT:    [[DOTCAST5:%.*]] = trunc i32 [[INDEX]] to i8
+; UNROLL-NEXT:    [[OFFSET_IDX:%.*]] = add i8 [[DOTCAST5]], [[T]]
+; UNROLL-NEXT:    [[TMP10:%.*]] = sext i8 [[OFFSET_IDX]] to i64
+; UNROLL-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP10]]
+; UNROLL-NEXT:    store <2 x i32> [[VEC_IND]], ptr [[TMP11]], align 4
+; UNROLL-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i64 2
+; UNROLL-NEXT:    store <2 x i32> [[STEP_ADD]], ptr [[TMP12]], align 4
 ; UNROLL-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
 ; UNROLL-NEXT:    [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], <i32 4, i32 4>
-; UNROLL-NEXT:    [[TMP16:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; UNROLL-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]]
+; UNROLL-NEXT:    [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; UNROLL-NEXT:    br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]]
 ; UNROLL:       middle.block:
 ; UNROLL-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[TMP0]], [[N_VEC]]
 ; UNROLL-NEXT:    br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
@@ -3722,9 +3666,9 @@ define void @wrappingindvars1(i8 %t, i32 %len, i32 *%A) {
 ; UNROLL-NEXT:    [[IDX:%.*]] = phi i8 [ [[IDX_INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; UNROLL-NEXT:    [[IDX_B:%.*]] = phi i32 [ [[IDX_B_INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ]
 ; UNROLL-NEXT:    [[SPHI:%.*]] = phi i32 [ [[IDX_INC_EXT:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL3]], [[SCALAR_PH]] ]
-; UNROLL-NEXT:    [[TMP17:%.*]] = sext i8 [[IDX]] to i64
-; UNROLL-NEXT:    [[PTR:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP17]]
-; UNROLL-NEXT:    store i32 [[SPHI]], i32* [[PTR]], align 4
+; UNROLL-NEXT:    [[TMP14:%.*]] = sext i8 [[IDX]] to i64
+; UNROLL-NEXT:    [[PTR:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP14]]
+; UNROLL-NEXT:    store i32 [[SPHI]], ptr [[PTR]], align 4
 ; UNROLL-NEXT:    [[IDX_INC]] = add i8 [[IDX]], 1
 ; UNROLL-NEXT:    [[IDX_INC_EXT]] = zext i8 [[IDX_INC]] to i32
 ; UNROLL-NEXT:    [[IDX_B_INC]] = add nuw nsw i32 [[IDX_B]], 1
@@ -3761,8 +3705,8 @@ define void @wrappingindvars1(i8 %t, i32 %len, i32 *%A) {
 ; UNROLL-NO-IC:       vector.ph:
 ; UNROLL-NO-IC-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[TMP0]], 4
 ; UNROLL-NO-IC-NEXT:    [[N_VEC:%.*]] = sub i32 [[TMP0]], [[N_MOD_VF]]
-; UNROLL-NO-IC-NEXT:    [[CAST_VTC:%.*]] = trunc i32 [[N_VEC]] to i8
-; UNROLL-NO-IC-NEXT:    [[IND_END:%.*]] = add i8 [[T]], [[CAST_VTC]]
+; UNROLL-NO-IC-NEXT:    [[DOTCAST:%.*]] = trunc i32 [[N_VEC]] to i8
+; UNROLL-NO-IC-NEXT:    [[IND_END:%.*]] = add i8 [[T]], [[DOTCAST]]
 ; UNROLL-NO-IC-NEXT:    [[IND_END2:%.*]] = add i32 [[EXT]], [[N_VEC]]
 ; UNROLL-NO-IC-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[EXT]], i32 0
 ; UNROLL-NO-IC-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
@@ -3772,22 +3716,20 @@ define void @wrappingindvars1(i8 %t, i32 %len, i32 *%A) {
 ; UNROLL-NO-IC-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; UNROLL-NO-IC-NEXT:    [[VEC_IND:%.*]] = phi <2 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; UNROLL-NO-IC-NEXT:    [[STEP_ADD:%.*]] = add <2 x i32> [[VEC_IND]], <i32 2, i32 2>
-; UNROLL-NO-IC-NEXT:    [[TMP12:%.*]] = trunc i32 [[INDEX]] to i8
-; UNROLL-NO-IC-NEXT:    [[OFFSET_IDX:%.*]] = add i8 [[T]], [[TMP12]]
-; UNROLL-NO-IC-NEXT:    [[TMP13:%.*]] = add i8 [[OFFSET_IDX]], 0
-; UNROLL-NO-IC-NEXT:    [[TMP14:%.*]] = add i8 [[OFFSET_IDX]], 2
-; UNROLL-NO-IC-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i8 [[TMP13]]
-; UNROLL-NO-IC-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[A]], i8 [[TMP14]]
-; UNROLL-NO-IC-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP15]], i32 0
-; UNROLL-NO-IC-NEXT:    [[TMP18:%.*]] = bitcast i32* [[TMP17]] to <2 x i32>*
-; UNROLL-NO-IC-NEXT:    store <2 x i32> [[VEC_IND]], <2 x i32>* [[TMP18]], align 4
-; UNROLL-NO-IC-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i32, i32* [[TMP15]], i32 2
-; UNROLL-NO-IC-NEXT:    [[TMP20:%.*]] = bitcast i32* [[TMP19]] to <2 x i32>*
-; UNROLL-NO-IC-NEXT:    store <2 x i32> [[STEP_ADD]], <2 x i32>* [[TMP20]], align 4
+; UNROLL-NO-IC-NEXT:    [[DOTCAST5:%.*]] = trunc i32 [[INDEX]] to i8
+; UNROLL-NO-IC-NEXT:    [[OFFSET_IDX:%.*]] = add i8 [[T]], [[DOTCAST5]]
+; UNROLL-NO-IC-NEXT:    [[TMP12:%.*]] = add i8 [[OFFSET_IDX]], 0
+; UNROLL-NO-IC-NEXT:    [[TMP13:%.*]] = add i8 [[OFFSET_IDX]], 2
+; UNROLL-NO-IC-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i8 [[TMP12]]
+; UNROLL-NO-IC-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A]], i8 [[TMP13]]
+; UNROLL-NO-IC-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 0
+; UNROLL-NO-IC-NEXT:    store <2 x i32> [[VEC_IND]], ptr [[TMP16]], align 4
+; UNROLL-NO-IC-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 2
+; UNROLL-NO-IC-NEXT:    store <2 x i32> [[STEP_ADD]], ptr [[TMP17]], align 4
 ; UNROLL-NO-IC-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
 ; UNROLL-NO-IC-NEXT:    [[VEC_IND_NEXT]] = add <2 x i32> [[STEP_ADD]], <i32 2, i32 2>
-; UNROLL-NO-IC-NEXT:    [[TMP21:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; UNROLL-NO-IC-NEXT:    br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]]
+; UNROLL-NO-IC-NEXT:    [[TMP18:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; UNROLL-NO-IC-NEXT:    br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]]
 ; UNROLL-NO-IC:       middle.block:
 ; UNROLL-NO-IC-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[TMP0]], [[N_VEC]]
 ; UNROLL-NO-IC-NEXT:    br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
@@ -3800,8 +3742,8 @@ define void @wrappingindvars1(i8 %t, i32 %len, i32 *%A) {
 ; UNROLL-NO-IC-NEXT:    [[IDX:%.*]] = phi i8 [ [[IDX_INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; UNROLL-NO-IC-NEXT:    [[IDX_B:%.*]] = phi i32 [ [[IDX_B_INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ]
 ; UNROLL-NO-IC-NEXT:    [[SPHI:%.*]] = phi i32 [ [[IDX_INC_EXT:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL3]], [[SCALAR_PH]] ]
-; UNROLL-NO-IC-NEXT:    [[PTR:%.*]] = getelementptr inbounds i32, i32* [[A]], i8 [[IDX]]
-; UNROLL-NO-IC-NEXT:    store i32 [[SPHI]], i32* [[PTR]], align 4
+; UNROLL-NO-IC-NEXT:    [[PTR:%.*]] = getelementptr inbounds i32, ptr [[A]], i8 [[IDX]]
+; UNROLL-NO-IC-NEXT:    store i32 [[SPHI]], ptr [[PTR]], align 4
 ; UNROLL-NO-IC-NEXT:    [[IDX_INC]] = add i8 [[IDX]], 1
 ; UNROLL-NO-IC-NEXT:    [[IDX_INC_EXT]] = zext i8 [[IDX_INC]] to i32
 ; UNROLL-NO-IC-NEXT:    [[IDX_B_INC]] = add nuw nsw i32 [[IDX_B]], 1
@@ -3834,8 +3776,8 @@ define void @wrappingindvars1(i8 %t, i32 %len, i32 *%A) {
 ; INTERLEAVE-NEXT:    br i1 [[TMP9]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
 ; INTERLEAVE:       vector.ph:
 ; INTERLEAVE-NEXT:    [[N_VEC:%.*]] = and i32 [[TMP0]], -8
-; INTERLEAVE-NEXT:    [[CAST_VTC:%.*]] = trunc i32 [[N_VEC]] to i8
-; INTERLEAVE-NEXT:    [[IND_END:%.*]] = add i8 [[CAST_VTC]], [[T]]
+; INTERLEAVE-NEXT:    [[DOTCAST:%.*]] = trunc i32 [[N_VEC]] to i8
+; INTERLEAVE-NEXT:    [[IND_END:%.*]] = add i8 [[DOTCAST]], [[T]]
 ; INTERLEAVE-NEXT:    [[IND_END2:%.*]] = add i32 [[N_VEC]], [[EXT]]
 ; INTERLEAVE-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[EXT]], i64 0
 ; INTERLEAVE-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
@@ -3845,19 +3787,17 @@ define void @wrappingindvars1(i8 %t, i32 %len, i32 *%A) {
 ; INTERLEAVE-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; INTERLEAVE-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; INTERLEAVE-NEXT:    [[STEP_ADD:%.*]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; INTERLEAVE-NEXT:    [[TMP10:%.*]] = trunc i32 [[INDEX]] to i8
-; INTERLEAVE-NEXT:    [[OFFSET_IDX:%.*]] = add i8 [[TMP10]], [[T]]
-; INTERLEAVE-NEXT:    [[TMP11:%.*]] = sext i8 [[OFFSET_IDX]] to i64
-; INTERLEAVE-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP11]]
-; INTERLEAVE-NEXT:    [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <4 x i32>*
-; INTERLEAVE-NEXT:    store <4 x i32> [[VEC_IND]], <4 x i32>* [[TMP13]], align 4
-; INTERLEAVE-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP12]], i64 4
-; INTERLEAVE-NEXT:    [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <4 x i32>*
-; INTERLEAVE-NEXT:    store <4 x i32> [[STEP_ADD]], <4 x i32>* [[TMP15]], align 4
+; INTERLEAVE-NEXT:    [[DOTCAST5:%.*]] = trunc i32 [[INDEX]] to i8
+; INTERLEAVE-NEXT:    [[OFFSET_IDX:%.*]] = add i8 [[DOTCAST5]], [[T]]
+; INTERLEAVE-NEXT:    [[TMP10:%.*]] = sext i8 [[OFFSET_IDX]] to i64
+; INTERLEAVE-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP10]]
+; INTERLEAVE-NEXT:    store <4 x i32> [[VEC_IND]], ptr [[TMP11]], align 4
+; INTERLEAVE-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i64 4
+; INTERLEAVE-NEXT:    store <4 x i32> [[STEP_ADD]], ptr [[TMP12]], align 4
 ; INTERLEAVE-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
 ; INTERLEAVE-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 8, i32 8, i32 8, i32 8>
-; INTERLEAVE-NEXT:    [[TMP16:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; INTERLEAVE-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]]
+; INTERLEAVE-NEXT:    [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; INTERLEAVE-NEXT:    br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]]
 ; INTERLEAVE:       middle.block:
 ; INTERLEAVE-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[TMP0]], [[N_VEC]]
 ; INTERLEAVE-NEXT:    br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
@@ -3870,9 +3810,9 @@ define void @wrappingindvars1(i8 %t, i32 %len, i32 *%A) {
 ; INTERLEAVE-NEXT:    [[IDX:%.*]] = phi i8 [ [[IDX_INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; INTERLEAVE-NEXT:    [[IDX_B:%.*]] = phi i32 [ [[IDX_B_INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ]
 ; INTERLEAVE-NEXT:    [[SPHI:%.*]] = phi i32 [ [[IDX_INC_EXT:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL3]], [[SCALAR_PH]] ]
-; INTERLEAVE-NEXT:    [[TMP17:%.*]] = sext i8 [[IDX]] to i64
-; INTERLEAVE-NEXT:    [[PTR:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP17]]
-; INTERLEAVE-NEXT:    store i32 [[SPHI]], i32* [[PTR]], align 4
+; INTERLEAVE-NEXT:    [[TMP14:%.*]] = sext i8 [[IDX]] to i64
+; INTERLEAVE-NEXT:    [[PTR:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP14]]
+; INTERLEAVE-NEXT:    store i32 [[SPHI]], ptr [[PTR]], align 4
 ; INTERLEAVE-NEXT:    [[IDX_INC]] = add i8 [[IDX]], 1
 ; INTERLEAVE-NEXT:    [[IDX_INC_EXT]] = zext i8 [[IDX_INC]] to i32
 ; INTERLEAVE-NEXT:    [[IDX_B_INC]] = add nuw nsw i32 [[IDX_B]], 1
@@ -3895,8 +3835,8 @@ define void @wrappingindvars1(i8 %t, i32 %len, i32 *%A) {
   %idx.b = phi i32 [ 0, %entry ], [ %idx.b.inc, %loop ]
   %sphi = phi i32 [ %ext, %entry ], [%idx.inc.ext, %loop]
 
-  %ptr = getelementptr inbounds i32, i32* %A, i8 %idx
-  store i32 %sphi, i32* %ptr
+  %ptr = getelementptr inbounds i32, ptr %A, i8 %idx
+  store i32 %sphi, ptr %ptr
 
   %idx.inc = add i8 %idx, 1
   %idx.inc.ext = zext i8 %idx.inc to i32
@@ -3913,7 +3853,7 @@ define void @wrappingindvars1(i8 %t, i32 %len, i32 *%A) {
 ; In order to recognize %sphi as an induction PHI and vectorize this loop,
 ; we need to convert the SCEV expression into an AddRecExpr.
 ; The expression gets converted to ({4 * (zext %t to i32),+,4}).
-define void @wrappingindvars2(i8 %t, i32 %len, i32 *%A) {
+define void @wrappingindvars2(i8 %t, i32 %len, ptr %A) {
 ; CHECK-LABEL: @wrappingindvars2(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[ST:%.*]] = zext i8 [[T:%.*]] to i16
@@ -3941,8 +3881,8 @@ define void @wrappingindvars2(i8 %t, i32 %len, i32 *%A) {
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[TMP0]], 2
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i32 [[TMP0]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[CAST_VTC:%.*]] = trunc i32 [[N_VEC]] to i8
-; CHECK-NEXT:    [[IND_END:%.*]] = add i8 [[T]], [[CAST_VTC]]
+; CHECK-NEXT:    [[DOTCAST:%.*]] = trunc i32 [[N_VEC]] to i8
+; CHECK-NEXT:    [[IND_END:%.*]] = add i8 [[T]], [[DOTCAST]]
 ; CHECK-NEXT:    [[TMP12:%.*]] = mul i32 [[N_VEC]], 4
 ; CHECK-NEXT:    [[IND_END1:%.*]] = add i32 [[EXT_MUL]], [[TMP12]]
 ; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[EXT_MUL]], i32 0
@@ -3952,17 +3892,16 @@ define void @wrappingindvars2(i8 %t, i32 %len, i32 *%A) {
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <2 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP13:%.*]] = trunc i32 [[INDEX]] to i8
-; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = add i8 [[T]], [[TMP13]]
-; CHECK-NEXT:    [[TMP14:%.*]] = add i8 [[OFFSET_IDX]], 0
-; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i8 [[TMP14]]
-; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP15]], i32 0
-; CHECK-NEXT:    [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <2 x i32>*
-; CHECK-NEXT:    store <2 x i32> [[VEC_IND]], <2 x i32>* [[TMP17]], align 4
+; CHECK-NEXT:    [[DOTCAST4:%.*]] = trunc i32 [[INDEX]] to i8
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = add i8 [[T]], [[DOTCAST4]]
+; CHECK-NEXT:    [[TMP13:%.*]] = add i8 [[OFFSET_IDX]], 0
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i8 [[TMP13]]
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 0
+; CHECK-NEXT:    store <2 x i32> [[VEC_IND]], ptr [[TMP15]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], <i32 8, i32 8>
-; CHECK-NEXT:    [[TMP18:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP38:![0-9]+]]
+; CHECK-NEXT:    [[TMP16:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP38:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[TMP0]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
@@ -3975,8 +3914,8 @@ define void @wrappingindvars2(i8 %t, i32 %len, i32 *%A) {
 ; CHECK-NEXT:    [[IDX:%.*]] = phi i8 [ [[IDX_INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; CHECK-NEXT:    [[SPHI:%.*]] = phi i32 [ [[MUL:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL2]], [[SCALAR_PH]] ]
 ; CHECK-NEXT:    [[IDX_B:%.*]] = phi i32 [ [[IDX_B_INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL3]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[PTR:%.*]] = getelementptr inbounds i32, i32* [[A]], i8 [[IDX]]
-; CHECK-NEXT:    store i32 [[SPHI]], i32* [[PTR]], align 4
+; CHECK-NEXT:    [[PTR:%.*]] = getelementptr inbounds i32, ptr [[A]], i8 [[IDX]]
+; CHECK-NEXT:    store i32 [[SPHI]], ptr [[PTR]], align 4
 ; CHECK-NEXT:    [[IDX_INC]] = add i8 [[IDX]], 1
 ; CHECK-NEXT:    [[IDX_INC_EXT:%.*]] = zext i8 [[IDX_INC]] to i32
 ; CHECK-NEXT:    [[MUL]] = mul i32 [[IDX_INC_EXT]], 4
@@ -4011,10 +3950,10 @@ define void @wrappingindvars2(i8 %t, i32 %len, i32 *%A) {
 ; IND-NEXT:    br i1 [[TMP9]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
 ; IND:       vector.ph:
 ; IND-NEXT:    [[N_VEC:%.*]] = and i32 [[TMP0]], -2
-; IND-NEXT:    [[CAST_VTC:%.*]] = trunc i32 [[N_VEC]] to i8
-; IND-NEXT:    [[IND_END:%.*]] = add i8 [[CAST_VTC]], [[T]]
-; IND-NEXT:    [[EXT_MUL4:%.*]] = add i32 [[N_VEC]], [[EXT]]
-; IND-NEXT:    [[IND_END1:%.*]] = shl i32 [[EXT_MUL4]], 2
+; IND-NEXT:    [[DOTCAST:%.*]] = trunc i32 [[N_VEC]] to i8
+; IND-NEXT:    [[IND_END:%.*]] = add i8 [[DOTCAST]], [[T]]
+; IND-NEXT:    [[EXT_MUL5:%.*]] = add i32 [[N_VEC]], [[EXT]]
+; IND-NEXT:    [[IND_END1:%.*]] = shl i32 [[EXT_MUL5]], 2
 ; IND-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[EXT_MUL]], i64 0
 ; IND-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
 ; IND-NEXT:    [[INDUCTION:%.*]] = add nuw nsw <2 x i32> [[DOTSPLAT]], <i32 0, i32 4>
@@ -4022,16 +3961,15 @@ define void @wrappingindvars2(i8 %t, i32 %len, i32 *%A) {
 ; IND:       vector.body:
 ; IND-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; IND-NEXT:    [[VEC_IND:%.*]] = phi <2 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; IND-NEXT:    [[TMP10:%.*]] = trunc i32 [[INDEX]] to i8
-; IND-NEXT:    [[OFFSET_IDX:%.*]] = add i8 [[TMP10]], [[T]]
-; IND-NEXT:    [[TMP11:%.*]] = sext i8 [[OFFSET_IDX]] to i64
-; IND-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP11]]
-; IND-NEXT:    [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <2 x i32>*
-; IND-NEXT:    store <2 x i32> [[VEC_IND]], <2 x i32>* [[TMP13]], align 4
+; IND-NEXT:    [[DOTCAST4:%.*]] = trunc i32 [[INDEX]] to i8
+; IND-NEXT:    [[OFFSET_IDX:%.*]] = add i8 [[DOTCAST4]], [[T]]
+; IND-NEXT:    [[TMP10:%.*]] = sext i8 [[OFFSET_IDX]] to i64
+; IND-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP10]]
+; IND-NEXT:    store <2 x i32> [[VEC_IND]], ptr [[TMP11]], align 4
 ; IND-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
 ; IND-NEXT:    [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], <i32 8, i32 8>
-; IND-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; IND-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP38:![0-9]+]]
+; IND-NEXT:    [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; IND-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP38:![0-9]+]]
 ; IND:       middle.block:
 ; IND-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[TMP0]], [[N_VEC]]
 ; IND-NEXT:    br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
@@ -4044,9 +3982,9 @@ define void @wrappingindvars2(i8 %t, i32 %len, i32 *%A) {
 ; IND-NEXT:    [[IDX:%.*]] = phi i8 [ [[IDX_INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; IND-NEXT:    [[SPHI:%.*]] = phi i32 [ [[MUL:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL2]], [[SCALAR_PH]] ]
 ; IND-NEXT:    [[IDX_B:%.*]] = phi i32 [ [[IDX_B_INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL3]], [[SCALAR_PH]] ]
-; IND-NEXT:    [[TMP15:%.*]] = sext i8 [[IDX]] to i64
-; IND-NEXT:    [[PTR:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP15]]
-; IND-NEXT:    store i32 [[SPHI]], i32* [[PTR]], align 4
+; IND-NEXT:    [[TMP13:%.*]] = sext i8 [[IDX]] to i64
+; IND-NEXT:    [[PTR:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP13]]
+; IND-NEXT:    store i32 [[SPHI]], ptr [[PTR]], align 4
 ; IND-NEXT:    [[IDX_INC]] = add i8 [[IDX]], 1
 ; IND-NEXT:    [[IDX_INC_EXT:%.*]] = zext i8 [[IDX_INC]] to i32
 ; IND-NEXT:    [[MUL]] = shl nuw nsw i32 [[IDX_INC_EXT]], 2
@@ -4081,10 +4019,10 @@ define void @wrappingindvars2(i8 %t, i32 %len, i32 *%A) {
 ; UNROLL-NEXT:    br i1 [[TMP9]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
 ; UNROLL:       vector.ph:
 ; UNROLL-NEXT:    [[N_VEC:%.*]] = and i32 [[TMP0]], -4
-; UNROLL-NEXT:    [[CAST_VTC:%.*]] = trunc i32 [[N_VEC]] to i8
-; UNROLL-NEXT:    [[IND_END:%.*]] = add i8 [[CAST_VTC]], [[T]]
-; UNROLL-NEXT:    [[EXT_MUL5:%.*]] = add i32 [[N_VEC]], [[EXT]]
-; UNROLL-NEXT:    [[IND_END1:%.*]] = shl i32 [[EXT_MUL5]], 2
+; UNROLL-NEXT:    [[DOTCAST:%.*]] = trunc i32 [[N_VEC]] to i8
+; UNROLL-NEXT:    [[IND_END:%.*]] = add i8 [[DOTCAST]], [[T]]
+; UNROLL-NEXT:    [[EXT_MUL6:%.*]] = add i32 [[N_VEC]], [[EXT]]
+; UNROLL-NEXT:    [[IND_END1:%.*]] = shl i32 [[EXT_MUL6]], 2
 ; UNROLL-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[EXT_MUL]], i64 0
 ; UNROLL-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
 ; UNROLL-NEXT:    [[INDUCTION:%.*]] = add nuw nsw <2 x i32> [[DOTSPLAT]], <i32 0, i32 4>
@@ -4093,19 +4031,17 @@ define void @wrappingindvars2(i8 %t, i32 %len, i32 *%A) {
 ; UNROLL-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; UNROLL-NEXT:    [[VEC_IND:%.*]] = phi <2 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; UNROLL-NEXT:    [[STEP_ADD:%.*]] = add <2 x i32> [[VEC_IND]], <i32 8, i32 8>
-; UNROLL-NEXT:    [[TMP10:%.*]] = trunc i32 [[INDEX]] to i8
-; UNROLL-NEXT:    [[OFFSET_IDX:%.*]] = add i8 [[TMP10]], [[T]]
-; UNROLL-NEXT:    [[TMP11:%.*]] = sext i8 [[OFFSET_IDX]] to i64
-; UNROLL-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP11]]
-; UNROLL-NEXT:    [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <2 x i32>*
-; UNROLL-NEXT:    store <2 x i32> [[VEC_IND]], <2 x i32>* [[TMP13]], align 4
-; UNROLL-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP12]], i64 2
-; UNROLL-NEXT:    [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <2 x i32>*
-; UNROLL-NEXT:    store <2 x i32> [[STEP_ADD]], <2 x i32>* [[TMP15]], align 4
+; UNROLL-NEXT:    [[DOTCAST5:%.*]] = trunc i32 [[INDEX]] to i8
+; UNROLL-NEXT:    [[OFFSET_IDX:%.*]] = add i8 [[DOTCAST5]], [[T]]
+; UNROLL-NEXT:    [[TMP10:%.*]] = sext i8 [[OFFSET_IDX]] to i64
+; UNROLL-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP10]]
+; UNROLL-NEXT:    store <2 x i32> [[VEC_IND]], ptr [[TMP11]], align 4
+; UNROLL-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i64 2
+; UNROLL-NEXT:    store <2 x i32> [[STEP_ADD]], ptr [[TMP12]], align 4
 ; UNROLL-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
 ; UNROLL-NEXT:    [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], <i32 16, i32 16>
-; UNROLL-NEXT:    [[TMP16:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; UNROLL-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP38:![0-9]+]]
+; UNROLL-NEXT:    [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; UNROLL-NEXT:    br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP38:![0-9]+]]
 ; UNROLL:       middle.block:
 ; UNROLL-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[TMP0]], [[N_VEC]]
 ; UNROLL-NEXT:    br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
@@ -4118,9 +4054,9 @@ define void @wrappingindvars2(i8 %t, i32 %len, i32 *%A) {
 ; UNROLL-NEXT:    [[IDX:%.*]] = phi i8 [ [[IDX_INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; UNROLL-NEXT:    [[SPHI:%.*]] = phi i32 [ [[MUL:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL2]], [[SCALAR_PH]] ]
 ; UNROLL-NEXT:    [[IDX_B:%.*]] = phi i32 [ [[IDX_B_INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL3]], [[SCALAR_PH]] ]
-; UNROLL-NEXT:    [[TMP17:%.*]] = sext i8 [[IDX]] to i64
-; UNROLL-NEXT:    [[PTR:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP17]]
-; UNROLL-NEXT:    store i32 [[SPHI]], i32* [[PTR]], align 4
+; UNROLL-NEXT:    [[TMP14:%.*]] = sext i8 [[IDX]] to i64
+; UNROLL-NEXT:    [[PTR:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP14]]
+; UNROLL-NEXT:    store i32 [[SPHI]], ptr [[PTR]], align 4
 ; UNROLL-NEXT:    [[IDX_INC]] = add i8 [[IDX]], 1
 ; UNROLL-NEXT:    [[IDX_INC_EXT:%.*]] = zext i8 [[IDX_INC]] to i32
 ; UNROLL-NEXT:    [[MUL]] = shl nuw nsw i32 [[IDX_INC_EXT]], 2
@@ -4159,8 +4095,8 @@ define void @wrappingindvars2(i8 %t, i32 %len, i32 *%A) {
 ; UNROLL-NO-IC:       vector.ph:
 ; UNROLL-NO-IC-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[TMP0]], 4
 ; UNROLL-NO-IC-NEXT:    [[N_VEC:%.*]] = sub i32 [[TMP0]], [[N_MOD_VF]]
-; UNROLL-NO-IC-NEXT:    [[CAST_VTC:%.*]] = trunc i32 [[N_VEC]] to i8
-; UNROLL-NO-IC-NEXT:    [[IND_END:%.*]] = add i8 [[T]], [[CAST_VTC]]
+; UNROLL-NO-IC-NEXT:    [[DOTCAST:%.*]] = trunc i32 [[N_VEC]] to i8
+; UNROLL-NO-IC-NEXT:    [[IND_END:%.*]] = add i8 [[T]], [[DOTCAST]]
 ; UNROLL-NO-IC-NEXT:    [[TMP12:%.*]] = mul i32 [[N_VEC]], 4
 ; UNROLL-NO-IC-NEXT:    [[IND_END1:%.*]] = add i32 [[EXT_MUL]], [[TMP12]]
 ; UNROLL-NO-IC-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[EXT_MUL]], i32 0
@@ -4171,22 +4107,20 @@ define void @wrappingindvars2(i8 %t, i32 %len, i32 *%A) {
 ; UNROLL-NO-IC-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; UNROLL-NO-IC-NEXT:    [[VEC_IND:%.*]] = phi <2 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; UNROLL-NO-IC-NEXT:    [[STEP_ADD:%.*]] = add <2 x i32> [[VEC_IND]], <i32 8, i32 8>
-; UNROLL-NO-IC-NEXT:    [[TMP13:%.*]] = trunc i32 [[INDEX]] to i8
-; UNROLL-NO-IC-NEXT:    [[OFFSET_IDX:%.*]] = add i8 [[T]], [[TMP13]]
-; UNROLL-NO-IC-NEXT:    [[TMP14:%.*]] = add i8 [[OFFSET_IDX]], 0
-; UNROLL-NO-IC-NEXT:    [[TMP15:%.*]] = add i8 [[OFFSET_IDX]], 2
-; UNROLL-NO-IC-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i8 [[TMP14]]
-; UNROLL-NO-IC-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[A]], i8 [[TMP15]]
-; UNROLL-NO-IC-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[TMP16]], i32 0
-; UNROLL-NO-IC-NEXT:    [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <2 x i32>*
-; UNROLL-NO-IC-NEXT:    store <2 x i32> [[VEC_IND]], <2 x i32>* [[TMP19]], align 4
-; UNROLL-NO-IC-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TMP16]], i32 2
-; UNROLL-NO-IC-NEXT:    [[TMP21:%.*]] = bitcast i32* [[TMP20]] to <2 x i32>*
-; UNROLL-NO-IC-NEXT:    store <2 x i32> [[STEP_ADD]], <2 x i32>* [[TMP21]], align 4
+; UNROLL-NO-IC-NEXT:    [[DOTCAST5:%.*]] = trunc i32 [[INDEX]] to i8
+; UNROLL-NO-IC-NEXT:    [[OFFSET_IDX:%.*]] = add i8 [[T]], [[DOTCAST5]]
+; UNROLL-NO-IC-NEXT:    [[TMP13:%.*]] = add i8 [[OFFSET_IDX]], 0
+; UNROLL-NO-IC-NEXT:    [[TMP14:%.*]] = add i8 [[OFFSET_IDX]], 2
+; UNROLL-NO-IC-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i8 [[TMP13]]
+; UNROLL-NO-IC-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[A]], i8 [[TMP14]]
+; UNROLL-NO-IC-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i32 0
+; UNROLL-NO-IC-NEXT:    store <2 x i32> [[VEC_IND]], ptr [[TMP17]], align 4
+; UNROLL-NO-IC-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i32 2
+; UNROLL-NO-IC-NEXT:    store <2 x i32> [[STEP_ADD]], ptr [[TMP18]], align 4
 ; UNROLL-NO-IC-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
 ; UNROLL-NO-IC-NEXT:    [[VEC_IND_NEXT]] = add <2 x i32> [[STEP_ADD]], <i32 8, i32 8>
-; UNROLL-NO-IC-NEXT:    [[TMP22:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; UNROLL-NO-IC-NEXT:    br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP38:![0-9]+]]
+; UNROLL-NO-IC-NEXT:    [[TMP19:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; UNROLL-NO-IC-NEXT:    br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP38:![0-9]+]]
 ; UNROLL-NO-IC:       middle.block:
 ; UNROLL-NO-IC-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[TMP0]], [[N_VEC]]
 ; UNROLL-NO-IC-NEXT:    br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
@@ -4199,8 +4133,8 @@ define void @wrappingindvars2(i8 %t, i32 %len, i32 *%A) {
 ; UNROLL-NO-IC-NEXT:    [[IDX:%.*]] = phi i8 [ [[IDX_INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; UNROLL-NO-IC-NEXT:    [[SPHI:%.*]] = phi i32 [ [[MUL:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL2]], [[SCALAR_PH]] ]
 ; UNROLL-NO-IC-NEXT:    [[IDX_B:%.*]] = phi i32 [ [[IDX_B_INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL3]], [[SCALAR_PH]] ]
-; UNROLL-NO-IC-NEXT:    [[PTR:%.*]] = getelementptr inbounds i32, i32* [[A]], i8 [[IDX]]
-; UNROLL-NO-IC-NEXT:    store i32 [[SPHI]], i32* [[PTR]], align 4
+; UNROLL-NO-IC-NEXT:    [[PTR:%.*]] = getelementptr inbounds i32, ptr [[A]], i8 [[IDX]]
+; UNROLL-NO-IC-NEXT:    store i32 [[SPHI]], ptr [[PTR]], align 4
 ; UNROLL-NO-IC-NEXT:    [[IDX_INC]] = add i8 [[IDX]], 1
 ; UNROLL-NO-IC-NEXT:    [[IDX_INC_EXT:%.*]] = zext i8 [[IDX_INC]] to i32
 ; UNROLL-NO-IC-NEXT:    [[MUL]] = mul i32 [[IDX_INC_EXT]], 4
@@ -4235,10 +4169,10 @@ define void @wrappingindvars2(i8 %t, i32 %len, i32 *%A) {
 ; INTERLEAVE-NEXT:    br i1 [[TMP9]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
 ; INTERLEAVE:       vector.ph:
 ; INTERLEAVE-NEXT:    [[N_VEC:%.*]] = and i32 [[TMP0]], -8
-; INTERLEAVE-NEXT:    [[CAST_VTC:%.*]] = trunc i32 [[N_VEC]] to i8
-; INTERLEAVE-NEXT:    [[IND_END:%.*]] = add i8 [[CAST_VTC]], [[T]]
-; INTERLEAVE-NEXT:    [[EXT_MUL5:%.*]] = add i32 [[N_VEC]], [[EXT]]
-; INTERLEAVE-NEXT:    [[IND_END1:%.*]] = shl i32 [[EXT_MUL5]], 2
+; INTERLEAVE-NEXT:    [[DOTCAST:%.*]] = trunc i32 [[N_VEC]] to i8
+; INTERLEAVE-NEXT:    [[IND_END:%.*]] = add i8 [[DOTCAST]], [[T]]
+; INTERLEAVE-NEXT:    [[EXT_MUL6:%.*]] = add i32 [[N_VEC]], [[EXT]]
+; INTERLEAVE-NEXT:    [[IND_END1:%.*]] = shl i32 [[EXT_MUL6]], 2
 ; INTERLEAVE-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[EXT_MUL]], i64 0
 ; INTERLEAVE-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; INTERLEAVE-NEXT:    [[INDUCTION:%.*]] = add nuw nsw <4 x i32> [[DOTSPLAT]], <i32 0, i32 4, i32 8, i32 12>
@@ -4247,19 +4181,17 @@ define void @wrappingindvars2(i8 %t, i32 %len, i32 *%A) {
 ; INTERLEAVE-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; INTERLEAVE-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; INTERLEAVE-NEXT:    [[STEP_ADD:%.*]] = add <4 x i32> [[VEC_IND]], <i32 16, i32 16, i32 16, i32 16>
-; INTERLEAVE-NEXT:    [[TMP10:%.*]] = trunc i32 [[INDEX]] to i8
-; INTERLEAVE-NEXT:    [[OFFSET_IDX:%.*]] = add i8 [[TMP10]], [[T]]
-; INTERLEAVE-NEXT:    [[TMP11:%.*]] = sext i8 [[OFFSET_IDX]] to i64
-; INTERLEAVE-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP11]]
-; INTERLEAVE-NEXT:    [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <4 x i32>*
-; INTERLEAVE-NEXT:    store <4 x i32> [[VEC_IND]], <4 x i32>* [[TMP13]], align 4
-; INTERLEAVE-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP12]], i64 4
-; INTERLEAVE-NEXT:    [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <4 x i32>*
-; INTERLEAVE-NEXT:    store <4 x i32> [[STEP_ADD]], <4 x i32>* [[TMP15]], align 4
+; INTERLEAVE-NEXT:    [[DOTCAST5:%.*]] = trunc i32 [[INDEX]] to i8
+; INTERLEAVE-NEXT:    [[OFFSET_IDX:%.*]] = add i8 [[DOTCAST5]], [[T]]
+; INTERLEAVE-NEXT:    [[TMP10:%.*]] = sext i8 [[OFFSET_IDX]] to i64
+; INTERLEAVE-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP10]]
+; INTERLEAVE-NEXT:    store <4 x i32> [[VEC_IND]], ptr [[TMP11]], align 4
+; INTERLEAVE-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i64 4
+; INTERLEAVE-NEXT:    store <4 x i32> [[STEP_ADD]], ptr [[TMP12]], align 4
 ; INTERLEAVE-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
 ; INTERLEAVE-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 32, i32 32, i32 32, i32 32>
-; INTERLEAVE-NEXT:    [[TMP16:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; INTERLEAVE-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP38:![0-9]+]]
+; INTERLEAVE-NEXT:    [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; INTERLEAVE-NEXT:    br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP38:![0-9]+]]
 ; INTERLEAVE:       middle.block:
 ; INTERLEAVE-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[TMP0]], [[N_VEC]]
 ; INTERLEAVE-NEXT:    br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
@@ -4272,9 +4204,9 @@ define void @wrappingindvars2(i8 %t, i32 %len, i32 *%A) {
 ; INTERLEAVE-NEXT:    [[IDX:%.*]] = phi i8 [ [[IDX_INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; INTERLEAVE-NEXT:    [[SPHI:%.*]] = phi i32 [ [[MUL:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL2]], [[SCALAR_PH]] ]
 ; INTERLEAVE-NEXT:    [[IDX_B:%.*]] = phi i32 [ [[IDX_B_INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL3]], [[SCALAR_PH]] ]
-; INTERLEAVE-NEXT:    [[TMP17:%.*]] = sext i8 [[IDX]] to i64
-; INTERLEAVE-NEXT:    [[PTR:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP17]]
-; INTERLEAVE-NEXT:    store i32 [[SPHI]], i32* [[PTR]], align 4
+; INTERLEAVE-NEXT:    [[TMP14:%.*]] = sext i8 [[IDX]] to i64
+; INTERLEAVE-NEXT:    [[PTR:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP14]]
+; INTERLEAVE-NEXT:    store i32 [[SPHI]], ptr [[PTR]], align 4
 ; INTERLEAVE-NEXT:    [[IDX_INC]] = add i8 [[IDX]], 1
 ; INTERLEAVE-NEXT:    [[IDX_INC_EXT:%.*]] = zext i8 [[IDX_INC]] to i32
 ; INTERLEAVE-NEXT:    [[MUL]] = shl nuw nsw i32 [[IDX_INC_EXT]], 2
@@ -4300,8 +4232,8 @@ entry:
   %sphi = phi i32 [ %ext.mul, %entry ], [%mul, %loop]
   %idx.b = phi i32 [ 0, %entry ], [ %idx.b.inc, %loop ]
 
-  %ptr = getelementptr inbounds i32, i32* %A, i8 %idx
-  store i32 %sphi, i32* %ptr
+  %ptr = getelementptr inbounds i32, ptr %A, i8 %idx
+  store i32 %sphi, ptr %ptr
 
   %idx.inc = add i8 %idx, 1
   %idx.inc.ext = zext i8 %idx.inc to i32
@@ -4318,7 +4250,7 @@ entry:
 ; Check that we generate vectorized IVs in the pre-header
 ; instead of widening the scalar IV inside the loop, when
 ; we know how to do that.
-define void @veciv(i32* nocapture %a, i32 %start, i32 %k) {
+define void @veciv(ptr nocapture %a, i32 %start, i32 %k) {
 ; CHECK-LABEL: @veciv(
 ; CHECK-NEXT:  for.body.preheader:
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[K:%.*]], 2
@@ -4331,14 +4263,13 @@ define void @veciv(i32* nocapture %a, i32 %start, i32 %k) {
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 0, i32 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <2 x i32>*
-; CHECK-NEXT:    store <2 x i32> [[VEC_IND]], <2 x i32>* [[TMP3]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
+; CHECK-NEXT:    store <2 x i32> [[VEC_IND]], ptr [[TMP2]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], <i32 2, i32 2>
-; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP40:![0-9]+]]
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP40:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[K]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
@@ -4347,8 +4278,8 @@ define void @veciv(i32* nocapture %a, i32 %start, i32 %k) {
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i32 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[INDVARS_IV]]
-; CHECK-NEXT:    store i32 [[INDVARS_IV]], i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[INDVARS_IV]]
+; CHECK-NEXT:    store i32 [[INDVARS_IV]], ptr [[ARRAYIDX]], align 4
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i32 [[INDVARS_IV]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INDVARS_IV_NEXT]], [[K]]
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP41:![0-9]+]]
@@ -4366,13 +4297,12 @@ define void @veciv(i32* nocapture %a, i32 %start, i32 %k) {
 ; IND-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; IND-NEXT:    [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 0, i32 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; IND-NEXT:    [[TMP0:%.*]] = sext i32 [[INDEX]] to i64
-; IND-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP0]]
-; IND-NEXT:    [[TMP2:%.*]] = bitcast i32* [[TMP1]] to <2 x i32>*
-; IND-NEXT:    store <2 x i32> [[VEC_IND]], <2 x i32>* [[TMP2]], align 4
+; IND-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP0]]
+; IND-NEXT:    store <2 x i32> [[VEC_IND]], ptr [[TMP1]], align 4
 ; IND-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
 ; IND-NEXT:    [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], <i32 2, i32 2>
-; IND-NEXT:    [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; IND-NEXT:    br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP40:![0-9]+]]
+; IND-NEXT:    [[TMP2:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; IND-NEXT:    br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP40:![0-9]+]]
 ; IND:       middle.block:
 ; IND-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[N_VEC]], [[K]]
 ; IND-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
@@ -4381,9 +4311,9 @@ define void @veciv(i32* nocapture %a, i32 %start, i32 %k) {
 ; IND-NEXT:    br label [[FOR_BODY:%.*]]
 ; IND:       for.body:
 ; IND-NEXT:    [[INDVARS_IV:%.*]] = phi i32 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; IND-NEXT:    [[TMP4:%.*]] = sext i32 [[INDVARS_IV]] to i64
-; IND-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP4]]
-; IND-NEXT:    store i32 [[INDVARS_IV]], i32* [[ARRAYIDX]], align 4
+; IND-NEXT:    [[TMP3:%.*]] = sext i32 [[INDVARS_IV]] to i64
+; IND-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP3]]
+; IND-NEXT:    store i32 [[INDVARS_IV]], ptr [[ARRAYIDX]], align 4
 ; IND-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i32 [[INDVARS_IV]], 1
 ; IND-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INDVARS_IV_NEXT]], [[K]]
 ; IND-NEXT:    br i1 [[EXITCOND]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP41:![0-9]+]]
@@ -4402,16 +4332,14 @@ define void @veciv(i32* nocapture %a, i32 %start, i32 %k) {
 ; UNROLL-NEXT:    [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 0, i32 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; UNROLL-NEXT:    [[STEP_ADD:%.*]] = add <2 x i32> [[VEC_IND]], <i32 2, i32 2>
 ; UNROLL-NEXT:    [[TMP0:%.*]] = sext i32 [[INDEX]] to i64
-; UNROLL-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP0]]
-; UNROLL-NEXT:    [[TMP2:%.*]] = bitcast i32* [[TMP1]] to <2 x i32>*
-; UNROLL-NEXT:    store <2 x i32> [[VEC_IND]], <2 x i32>* [[TMP2]], align 4
-; UNROLL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 2
-; UNROLL-NEXT:    [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <2 x i32>*
-; UNROLL-NEXT:    store <2 x i32> [[STEP_ADD]], <2 x i32>* [[TMP4]], align 4
+; UNROLL-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP0]]
+; UNROLL-NEXT:    store <2 x i32> [[VEC_IND]], ptr [[TMP1]], align 4
+; UNROLL-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 2
+; UNROLL-NEXT:    store <2 x i32> [[STEP_ADD]], ptr [[TMP2]], align 4
 ; UNROLL-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
 ; UNROLL-NEXT:    [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], <i32 4, i32 4>
-; UNROLL-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; UNROLL-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP40:![0-9]+]]
+; UNROLL-NEXT:    [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; UNROLL-NEXT:    br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP40:![0-9]+]]
 ; UNROLL:       middle.block:
 ; UNROLL-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[N_VEC]], [[K]]
 ; UNROLL-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
@@ -4420,9 +4348,9 @@ define void @veciv(i32* nocapture %a, i32 %start, i32 %k) {
 ; UNROLL-NEXT:    br label [[FOR_BODY:%.*]]
 ; UNROLL:       for.body:
 ; UNROLL-NEXT:    [[INDVARS_IV:%.*]] = phi i32 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; UNROLL-NEXT:    [[TMP6:%.*]] = sext i32 [[INDVARS_IV]] to i64
-; UNROLL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP6]]
-; UNROLL-NEXT:    store i32 [[INDVARS_IV]], i32* [[ARRAYIDX]], align 4
+; UNROLL-NEXT:    [[TMP4:%.*]] = sext i32 [[INDVARS_IV]] to i64
+; UNROLL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP4]]
+; UNROLL-NEXT:    store i32 [[INDVARS_IV]], ptr [[ARRAYIDX]], align 4
 ; UNROLL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i32 [[INDVARS_IV]], 1
 ; UNROLL-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INDVARS_IV_NEXT]], [[K]]
 ; UNROLL-NEXT:    br i1 [[EXITCOND]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP41:![0-9]+]]
@@ -4443,18 +4371,16 @@ define void @veciv(i32* nocapture %a, i32 %start, i32 %k) {
 ; UNROLL-NO-IC-NEXT:    [[STEP_ADD:%.*]] = add <2 x i32> [[VEC_IND]], <i32 2, i32 2>
 ; UNROLL-NO-IC-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
 ; UNROLL-NO-IC-NEXT:    [[TMP1:%.*]] = add i32 [[INDEX]], 2
-; UNROLL-NO-IC-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[TMP0]]
-; UNROLL-NO-IC-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[TMP1]]
-; UNROLL-NO-IC-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i32 0
-; UNROLL-NO-IC-NEXT:    [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <2 x i32>*
-; UNROLL-NO-IC-NEXT:    store <2 x i32> [[VEC_IND]], <2 x i32>* [[TMP5]], align 4
-; UNROLL-NO-IC-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i32 2
-; UNROLL-NO-IC-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <2 x i32>*
-; UNROLL-NO-IC-NEXT:    store <2 x i32> [[STEP_ADD]], <2 x i32>* [[TMP7]], align 4
+; UNROLL-NO-IC-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[TMP0]]
+; UNROLL-NO-IC-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[TMP1]]
+; UNROLL-NO-IC-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 0
+; UNROLL-NO-IC-NEXT:    store <2 x i32> [[VEC_IND]], ptr [[TMP4]], align 4
+; UNROLL-NO-IC-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 2
+; UNROLL-NO-IC-NEXT:    store <2 x i32> [[STEP_ADD]], ptr [[TMP5]], align 4
 ; UNROLL-NO-IC-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
 ; UNROLL-NO-IC-NEXT:    [[VEC_IND_NEXT]] = add <2 x i32> [[STEP_ADD]], <i32 2, i32 2>
-; UNROLL-NO-IC-NEXT:    [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; UNROLL-NO-IC-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP40:![0-9]+]]
+; UNROLL-NO-IC-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; UNROLL-NO-IC-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP40:![0-9]+]]
 ; UNROLL-NO-IC:       middle.block:
 ; UNROLL-NO-IC-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[K]], [[N_VEC]]
 ; UNROLL-NO-IC-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
@@ -4463,8 +4389,8 @@ define void @veciv(i32* nocapture %a, i32 %start, i32 %k) {
 ; UNROLL-NO-IC-NEXT:    br label [[FOR_BODY:%.*]]
 ; UNROLL-NO-IC:       for.body:
 ; UNROLL-NO-IC-NEXT:    [[INDVARS_IV:%.*]] = phi i32 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; UNROLL-NO-IC-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[INDVARS_IV]]
-; UNROLL-NO-IC-NEXT:    store i32 [[INDVARS_IV]], i32* [[ARRAYIDX]], align 4
+; UNROLL-NO-IC-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[INDVARS_IV]]
+; UNROLL-NO-IC-NEXT:    store i32 [[INDVARS_IV]], ptr [[ARRAYIDX]], align 4
 ; UNROLL-NO-IC-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i32 [[INDVARS_IV]], 1
 ; UNROLL-NO-IC-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INDVARS_IV_NEXT]], [[K]]
 ; UNROLL-NO-IC-NEXT:    br i1 [[EXITCOND]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP41:![0-9]+]]
@@ -4483,16 +4409,14 @@ define void @veciv(i32* nocapture %a, i32 %start, i32 %k) {
 ; INTERLEAVE-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; INTERLEAVE-NEXT:    [[STEP_ADD:%.*]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
 ; INTERLEAVE-NEXT:    [[TMP0:%.*]] = sext i32 [[INDEX]] to i64
-; INTERLEAVE-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP0]]
-; INTERLEAVE-NEXT:    [[TMP2:%.*]] = bitcast i32* [[TMP1]] to <4 x i32>*
-; INTERLEAVE-NEXT:    store <4 x i32> [[VEC_IND]], <4 x i32>* [[TMP2]], align 4
-; INTERLEAVE-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4
-; INTERLEAVE-NEXT:    [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>*
-; INTERLEAVE-NEXT:    store <4 x i32> [[STEP_ADD]], <4 x i32>* [[TMP4]], align 4
+; INTERLEAVE-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP0]]
+; INTERLEAVE-NEXT:    store <4 x i32> [[VEC_IND]], ptr [[TMP1]], align 4
+; INTERLEAVE-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 4
+; INTERLEAVE-NEXT:    store <4 x i32> [[STEP_ADD]], ptr [[TMP2]], align 4
 ; INTERLEAVE-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
 ; INTERLEAVE-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 8, i32 8, i32 8, i32 8>
-; INTERLEAVE-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; INTERLEAVE-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP40:![0-9]+]]
+; INTERLEAVE-NEXT:    [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; INTERLEAVE-NEXT:    br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP40:![0-9]+]]
 ; INTERLEAVE:       middle.block:
 ; INTERLEAVE-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[N_VEC]], [[K]]
 ; INTERLEAVE-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
@@ -4501,9 +4425,9 @@ define void @veciv(i32* nocapture %a, i32 %start, i32 %k) {
 ; INTERLEAVE-NEXT:    br label [[FOR_BODY:%.*]]
 ; INTERLEAVE:       for.body:
 ; INTERLEAVE-NEXT:    [[INDVARS_IV:%.*]] = phi i32 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; INTERLEAVE-NEXT:    [[TMP6:%.*]] = sext i32 [[INDVARS_IV]] to i64
-; INTERLEAVE-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP6]]
-; INTERLEAVE-NEXT:    store i32 [[INDVARS_IV]], i32* [[ARRAYIDX]], align 4
+; INTERLEAVE-NEXT:    [[TMP4:%.*]] = sext i32 [[INDVARS_IV]] to i64
+; INTERLEAVE-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP4]]
+; INTERLEAVE-NEXT:    store i32 [[INDVARS_IV]], ptr [[ARRAYIDX]], align 4
 ; INTERLEAVE-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i32 [[INDVARS_IV]], 1
 ; INTERLEAVE-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INDVARS_IV_NEXT]], [[K]]
 ; INTERLEAVE-NEXT:    br i1 [[EXITCOND]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP41:![0-9]+]]
@@ -4515,8 +4439,8 @@ for.body.preheader:
 
 for.body:
   %indvars.iv = phi i32 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
-  %arrayidx = getelementptr inbounds i32, i32* %a, i32 %indvars.iv
-  store i32 %indvars.iv, i32* %arrayidx, align 4
+  %arrayidx = getelementptr inbounds i32, ptr %a, i32 %indvars.iv
+  store i32 %indvars.iv, ptr %arrayidx, align 4
   %indvars.iv.next = add nuw nsw i32 %indvars.iv, 1
   %exitcond = icmp eq i32 %indvars.iv.next, %k
   br i1 %exitcond, label %exit, label %for.body
@@ -4525,7 +4449,7 @@ exit:
   ret void
 }
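(The pattern in the hunks above is mechanical: with typed pointers, each wide
store needed a bitcast from i32* to a vector pointer, while an opaque ptr can
be stored through directly. A minimal sketch of the before/after shape, with
illustrative value names that are not taken from any one test:

  ; typed pointers: a pointer bitcast precedes the vector store
  %gep  = getelementptr inbounds i32, i32* %a, i64 %idx
  %vptr = bitcast i32* %gep to <2 x i32>*
  store <2 x i32> %vec, <2 x i32>* %vptr, align 4

  ; opaque pointers: the bitcast is unnecessary and disappears
  %gep = getelementptr inbounds i32, ptr %a, i64 %idx
  store <2 x i32> %vec, ptr %gep, align 4

Dropping the bitcasts renumbers the unnamed %N temporaries, which is why the
regenerated CHECK lines above shift their TMP variables down.)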
 
-define void @trunciv(i32* nocapture %a, i32 %start, i64 %k) {
+define void @trunciv(ptr nocapture %a, i32 %start, i64 %k) {
 ; CHECK-LABEL: @trunciv(
 ; CHECK-NEXT:  for.body.preheader:
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[K:%.*]], 2
@@ -4542,18 +4466,17 @@ define void @trunciv(i32* nocapture %a, i32 %start, i64 %k) {
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[K]], [[N_MOD_VF]]
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 0, i32 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP5:%.*]] = trunc i64 [[INDEX]] to i32
+; CHECK-NEXT:    [[TMP5:%.*]] = trunc i64 [[OFFSET_IDX]] to i32
 ; CHECK-NEXT:    [[TMP6:%.*]] = add i32 [[TMP5]], 0
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[TMP6]]
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP7]], i32 0
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <2 x i32>*
-; CHECK-NEXT:    store <2 x i32> [[VEC_IND]], <2 x i32>* [[TMP9]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
+; CHECK-NEXT:    store <2 x i32> [[VEC_IND]], ptr [[TMP8]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[OFFSET_IDX]], 2
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], <i32 2, i32 2>
-; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP42:![0-9]+]]
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP42:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[K]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
@@ -4563,8 +4486,8 @@ define void @trunciv(i32* nocapture %a, i32 %start, i64 %k) {
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; CHECK-NEXT:    [[TRUNC_IV:%.*]] = trunc i64 [[INDVARS_IV]] to i32
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[TRUNC_IV]]
-; CHECK-NEXT:    store i32 [[TRUNC_IV]], i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[TRUNC_IV]]
+; CHECK-NEXT:    store i32 [[TRUNC_IV]], ptr [[ARRAYIDX]], align 4
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[K]]
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP43:![0-9]+]]
@@ -4582,17 +4505,16 @@ define void @trunciv(i32* nocapture %a, i32 %start, i64 %k) {
 ; IND-NEXT:    [[N_VEC:%.*]] = and i64 [[K]], -2
 ; IND-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; IND:       vector.body:
-; IND-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IND-NEXT:    [[OFFSET_IDX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; IND-NEXT:    [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 0, i32 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; IND-NEXT:    [[SEXT:%.*]] = shl i64 [[INDEX]], 32
+; IND-NEXT:    [[SEXT:%.*]] = shl i64 [[OFFSET_IDX]], 32
 ; IND-NEXT:    [[TMP0:%.*]] = ashr exact i64 [[SEXT]], 32
-; IND-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP0]]
-; IND-NEXT:    [[TMP2:%.*]] = bitcast i32* [[TMP1]] to <2 x i32>*
-; IND-NEXT:    store <2 x i32> [[VEC_IND]], <2 x i32>* [[TMP2]], align 4
-; IND-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; IND-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP0]]
+; IND-NEXT:    store <2 x i32> [[VEC_IND]], ptr [[TMP1]], align 4
+; IND-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[OFFSET_IDX]], 2
 ; IND-NEXT:    [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], <i32 2, i32 2>
-; IND-NEXT:    [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; IND-NEXT:    br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP42:![0-9]+]]
+; IND-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IND-NEXT:    br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP42:![0-9]+]]
 ; IND:       middle.block:
 ; IND-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[K]]
 ; IND-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
@@ -4603,9 +4525,9 @@ define void @trunciv(i32* nocapture %a, i32 %start, i64 %k) {
 ; IND-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; IND-NEXT:    [[TRUNC_IV:%.*]] = trunc i64 [[INDVARS_IV]] to i32
 ; IND-NEXT:    [[SEXT1:%.*]] = shl i64 [[INDVARS_IV]], 32
-; IND-NEXT:    [[TMP4:%.*]] = ashr exact i64 [[SEXT1]], 32
-; IND-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP4]]
-; IND-NEXT:    store i32 [[TRUNC_IV]], i32* [[ARRAYIDX]], align 4
+; IND-NEXT:    [[TMP3:%.*]] = ashr exact i64 [[SEXT1]], 32
+; IND-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP3]]
+; IND-NEXT:    store i32 [[TRUNC_IV]], ptr [[ARRAYIDX]], align 4
 ; IND-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; IND-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[K]]
 ; IND-NEXT:    br i1 [[EXITCOND]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP43:![0-9]+]]
@@ -4623,21 +4545,19 @@ define void @trunciv(i32* nocapture %a, i32 %start, i64 %k) {
 ; UNROLL-NEXT:    [[N_VEC:%.*]] = and i64 [[K]], -4
 ; UNROLL-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; UNROLL:       vector.body:
-; UNROLL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; UNROLL-NEXT:    [[OFFSET_IDX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; UNROLL-NEXT:    [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 0, i32 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; UNROLL-NEXT:    [[STEP_ADD:%.*]] = add <2 x i32> [[VEC_IND]], <i32 2, i32 2>
-; UNROLL-NEXT:    [[SEXT:%.*]] = shl i64 [[INDEX]], 32
+; UNROLL-NEXT:    [[SEXT:%.*]] = shl i64 [[OFFSET_IDX]], 32
 ; UNROLL-NEXT:    [[TMP0:%.*]] = ashr exact i64 [[SEXT]], 32
-; UNROLL-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP0]]
-; UNROLL-NEXT:    [[TMP2:%.*]] = bitcast i32* [[TMP1]] to <2 x i32>*
-; UNROLL-NEXT:    store <2 x i32> [[VEC_IND]], <2 x i32>* [[TMP2]], align 4
-; UNROLL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 2
-; UNROLL-NEXT:    [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <2 x i32>*
-; UNROLL-NEXT:    store <2 x i32> [[STEP_ADD]], <2 x i32>* [[TMP4]], align 4
-; UNROLL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; UNROLL-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP0]]
+; UNROLL-NEXT:    store <2 x i32> [[VEC_IND]], ptr [[TMP1]], align 4
+; UNROLL-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 2
+; UNROLL-NEXT:    store <2 x i32> [[STEP_ADD]], ptr [[TMP2]], align 4
+; UNROLL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[OFFSET_IDX]], 4
 ; UNROLL-NEXT:    [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], <i32 4, i32 4>
-; UNROLL-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; UNROLL-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP42:![0-9]+]]
+; UNROLL-NEXT:    [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; UNROLL-NEXT:    br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP42:![0-9]+]]
 ; UNROLL:       middle.block:
 ; UNROLL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[K]]
 ; UNROLL-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
@@ -4648,9 +4568,9 @@ define void @trunciv(i32* nocapture %a, i32 %start, i64 %k) {
 ; UNROLL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; UNROLL-NEXT:    [[TRUNC_IV:%.*]] = trunc i64 [[INDVARS_IV]] to i32
 ; UNROLL-NEXT:    [[SEXT2:%.*]] = shl i64 [[INDVARS_IV]], 32
-; UNROLL-NEXT:    [[TMP6:%.*]] = ashr exact i64 [[SEXT2]], 32
-; UNROLL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP6]]
-; UNROLL-NEXT:    store i32 [[TRUNC_IV]], i32* [[ARRAYIDX]], align 4
+; UNROLL-NEXT:    [[TMP4:%.*]] = ashr exact i64 [[SEXT2]], 32
+; UNROLL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP4]]
+; UNROLL-NEXT:    store i32 [[TRUNC_IV]], ptr [[ARRAYIDX]], align 4
 ; UNROLL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; UNROLL-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[K]]
 ; UNROLL-NEXT:    br i1 [[EXITCOND]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP43:![0-9]+]]
@@ -4673,24 +4593,22 @@ define void @trunciv(i32* nocapture %a, i32 %start, i64 %k) {
 ; UNROLL-NO-IC-NEXT:    [[N_VEC:%.*]] = sub i64 [[K]], [[N_MOD_VF]]
 ; UNROLL-NO-IC-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; UNROLL-NO-IC:       vector.body:
-; UNROLL-NO-IC-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; UNROLL-NO-IC-NEXT:    [[OFFSET_IDX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; UNROLL-NO-IC-NEXT:    [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 0, i32 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; UNROLL-NO-IC-NEXT:    [[STEP_ADD:%.*]] = add <2 x i32> [[VEC_IND]], <i32 2, i32 2>
-; UNROLL-NO-IC-NEXT:    [[TMP5:%.*]] = trunc i64 [[INDEX]] to i32
+; UNROLL-NO-IC-NEXT:    [[TMP5:%.*]] = trunc i64 [[OFFSET_IDX]] to i32
 ; UNROLL-NO-IC-NEXT:    [[TMP6:%.*]] = add i32 [[TMP5]], 0
 ; UNROLL-NO-IC-NEXT:    [[TMP7:%.*]] = add i32 [[TMP5]], 2
-; UNROLL-NO-IC-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[TMP6]]
-; UNROLL-NO-IC-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[TMP7]]
-; UNROLL-NO-IC-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP8]], i32 0
-; UNROLL-NO-IC-NEXT:    [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <2 x i32>*
-; UNROLL-NO-IC-NEXT:    store <2 x i32> [[VEC_IND]], <2 x i32>* [[TMP11]], align 4
-; UNROLL-NO-IC-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP8]], i32 2
-; UNROLL-NO-IC-NEXT:    [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <2 x i32>*
-; UNROLL-NO-IC-NEXT:    store <2 x i32> [[STEP_ADD]], <2 x i32>* [[TMP13]], align 4
-; UNROLL-NO-IC-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; UNROLL-NO-IC-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[TMP6]]
+; UNROLL-NO-IC-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[TMP7]]
+; UNROLL-NO-IC-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0
+; UNROLL-NO-IC-NEXT:    store <2 x i32> [[VEC_IND]], ptr [[TMP10]], align 4
+; UNROLL-NO-IC-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 2
+; UNROLL-NO-IC-NEXT:    store <2 x i32> [[STEP_ADD]], ptr [[TMP11]], align 4
+; UNROLL-NO-IC-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[OFFSET_IDX]], 4
 ; UNROLL-NO-IC-NEXT:    [[VEC_IND_NEXT]] = add <2 x i32> [[STEP_ADD]], <i32 2, i32 2>
-; UNROLL-NO-IC-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; UNROLL-NO-IC-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP42:![0-9]+]]
+; UNROLL-NO-IC-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; UNROLL-NO-IC-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP42:![0-9]+]]
 ; UNROLL-NO-IC:       middle.block:
 ; UNROLL-NO-IC-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[K]], [[N_VEC]]
 ; UNROLL-NO-IC-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
@@ -4700,8 +4618,8 @@ define void @trunciv(i32* nocapture %a, i32 %start, i64 %k) {
 ; UNROLL-NO-IC:       for.body:
 ; UNROLL-NO-IC-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; UNROLL-NO-IC-NEXT:    [[TRUNC_IV:%.*]] = trunc i64 [[INDVARS_IV]] to i32
-; UNROLL-NO-IC-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[TRUNC_IV]]
-; UNROLL-NO-IC-NEXT:    store i32 [[TRUNC_IV]], i32* [[ARRAYIDX]], align 4
+; UNROLL-NO-IC-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[TRUNC_IV]]
+; UNROLL-NO-IC-NEXT:    store i32 [[TRUNC_IV]], ptr [[ARRAYIDX]], align 4
 ; UNROLL-NO-IC-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; UNROLL-NO-IC-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[K]]
 ; UNROLL-NO-IC-NEXT:    br i1 [[EXITCOND]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP43:![0-9]+]]
@@ -4719,21 +4637,19 @@ define void @trunciv(i32* nocapture %a, i32 %start, i64 %k) {
 ; INTERLEAVE-NEXT:    [[N_VEC:%.*]] = and i64 [[K]], -8
 ; INTERLEAVE-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; INTERLEAVE:       vector.body:
-; INTERLEAVE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; INTERLEAVE-NEXT:    [[OFFSET_IDX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; INTERLEAVE-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; INTERLEAVE-NEXT:    [[STEP_ADD:%.*]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; INTERLEAVE-NEXT:    [[SEXT:%.*]] = shl i64 [[INDEX]], 32
+; INTERLEAVE-NEXT:    [[SEXT:%.*]] = shl i64 [[OFFSET_IDX]], 32
 ; INTERLEAVE-NEXT:    [[TMP0:%.*]] = ashr exact i64 [[SEXT]], 32
-; INTERLEAVE-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP0]]
-; INTERLEAVE-NEXT:    [[TMP2:%.*]] = bitcast i32* [[TMP1]] to <4 x i32>*
-; INTERLEAVE-NEXT:    store <4 x i32> [[VEC_IND]], <4 x i32>* [[TMP2]], align 4
-; INTERLEAVE-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4
-; INTERLEAVE-NEXT:    [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>*
-; INTERLEAVE-NEXT:    store <4 x i32> [[STEP_ADD]], <4 x i32>* [[TMP4]], align 4
-; INTERLEAVE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; INTERLEAVE-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP0]]
+; INTERLEAVE-NEXT:    store <4 x i32> [[VEC_IND]], ptr [[TMP1]], align 4
+; INTERLEAVE-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 4
+; INTERLEAVE-NEXT:    store <4 x i32> [[STEP_ADD]], ptr [[TMP2]], align 4
+; INTERLEAVE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[OFFSET_IDX]], 8
 ; INTERLEAVE-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 8, i32 8, i32 8, i32 8>
-; INTERLEAVE-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; INTERLEAVE-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP42:![0-9]+]]
+; INTERLEAVE-NEXT:    [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; INTERLEAVE-NEXT:    br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP42:![0-9]+]]
 ; INTERLEAVE:       middle.block:
 ; INTERLEAVE-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[K]]
 ; INTERLEAVE-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
@@ -4744,9 +4660,9 @@ define void @trunciv(i32* nocapture %a, i32 %start, i64 %k) {
 ; INTERLEAVE-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; INTERLEAVE-NEXT:    [[TRUNC_IV:%.*]] = trunc i64 [[INDVARS_IV]] to i32
 ; INTERLEAVE-NEXT:    [[SEXT2:%.*]] = shl i64 [[INDVARS_IV]], 32
-; INTERLEAVE-NEXT:    [[TMP6:%.*]] = ashr exact i64 [[SEXT2]], 32
-; INTERLEAVE-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP6]]
-; INTERLEAVE-NEXT:    store i32 [[TRUNC_IV]], i32* [[ARRAYIDX]], align 4
+; INTERLEAVE-NEXT:    [[TMP4:%.*]] = ashr exact i64 [[SEXT2]], 32
+; INTERLEAVE-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP4]]
+; INTERLEAVE-NEXT:    store i32 [[TRUNC_IV]], ptr [[ARRAYIDX]], align 4
 ; INTERLEAVE-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; INTERLEAVE-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[K]]
 ; INTERLEAVE-NEXT:    br i1 [[EXITCOND]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP43:![0-9]+]]
@@ -4759,8 +4675,8 @@ for.body.preheader:
 for.body:
   %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
   %trunc.iv = trunc i64 %indvars.iv to i32
-  %arrayidx = getelementptr inbounds i32, i32* %a, i32 %trunc.iv
-  store i32 %trunc.iv, i32* %arrayidx, align 4
+  %arrayidx = getelementptr inbounds i32, ptr %a, i32 %trunc.iv
+  store i32 %trunc.iv, ptr %arrayidx, align 4
   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
   %exitcond = icmp eq i64 %indvars.iv.next, %k
   br i1 %exitcond, label %exit, label %for.body
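(The INDEX -> OFFSET_IDX renames in the trunciv hunks appear to come from
regenerating the checks against current output rather than from the pointer
conversion itself: the FileCheck variable names are derived from the IR value
names, and swapping i32* for ptr does not rename any values.)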
@@ -4771,7 +4687,7 @@ exit:
 
 ;
 ;
-define void @nonprimary(i32* nocapture %a, i32 %start, i32 %i, i32 %k) {
+define void @nonprimary(ptr nocapture %a, i32 %start, i32 %i, i32 %k) {
 ; CHECK-LABEL: @nonprimary(
 ; CHECK-NEXT:  for.body.preheader:
 ; CHECK-NEXT:    [[TMP0:%.*]] = sub i32 [[K:%.*]], [[I:%.*]]
@@ -4790,14 +4706,13 @@ define void @nonprimary(i32* nocapture %a, i32 %start, i32 %i, i32 %k) {
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <2 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = add i32 [[I]], [[INDEX]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[OFFSET_IDX]], 0
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[TMP1]]
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <2 x i32>*
-; CHECK-NEXT:    store <2 x i32> [[VEC_IND]], <2 x i32>* [[TMP4]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 0
+; CHECK-NEXT:    store <2 x i32> [[VEC_IND]], ptr [[TMP3]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], <i32 2, i32 2>
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP44:![0-9]+]]
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP44:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[TMP0]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
@@ -4806,8 +4721,8 @@ define void @nonprimary(i32* nocapture %a, i32 %start, i32 %i, i32 %k) {
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i32 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[INDVARS_IV]]
-; CHECK-NEXT:    store i32 [[INDVARS_IV]], i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[INDVARS_IV]]
+; CHECK-NEXT:    store i32 [[INDVARS_IV]], ptr [[ARRAYIDX]], align 4
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i32 [[INDVARS_IV]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INDVARS_IV_NEXT]], [[K]]
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP45:![0-9]+]]
@@ -4831,13 +4746,12 @@ define void @nonprimary(i32* nocapture %a, i32 %start, i32 %i, i32 %k) {
 ; IND-NEXT:    [[VEC_IND:%.*]] = phi <2 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; IND-NEXT:    [[OFFSET_IDX:%.*]] = add i32 [[INDEX]], [[I]]
 ; IND-NEXT:    [[TMP1:%.*]] = sext i32 [[OFFSET_IDX]] to i64
-; IND-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP1]]
-; IND-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <2 x i32>*
-; IND-NEXT:    store <2 x i32> [[VEC_IND]], <2 x i32>* [[TMP3]], align 4
+; IND-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP1]]
+; IND-NEXT:    store <2 x i32> [[VEC_IND]], ptr [[TMP2]], align 4
 ; IND-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
 ; IND-NEXT:    [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], <i32 2, i32 2>
-; IND-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; IND-NEXT:    br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP44:![0-9]+]]
+; IND-NEXT:    [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; IND-NEXT:    br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP44:![0-9]+]]
 ; IND:       middle.block:
 ; IND-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[TMP0]], [[N_VEC]]
 ; IND-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
@@ -4846,9 +4760,9 @@ define void @nonprimary(i32* nocapture %a, i32 %start, i32 %i, i32 %k) {
 ; IND-NEXT:    br label [[FOR_BODY:%.*]]
 ; IND:       for.body:
 ; IND-NEXT:    [[INDVARS_IV:%.*]] = phi i32 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; IND-NEXT:    [[TMP5:%.*]] = sext i32 [[INDVARS_IV]] to i64
-; IND-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP5]]
-; IND-NEXT:    store i32 [[INDVARS_IV]], i32* [[ARRAYIDX]], align 4
+; IND-NEXT:    [[TMP4:%.*]] = sext i32 [[INDVARS_IV]] to i64
+; IND-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP4]]
+; IND-NEXT:    store i32 [[INDVARS_IV]], ptr [[ARRAYIDX]], align 4
 ; IND-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i32 [[INDVARS_IV]], 1
 ; IND-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INDVARS_IV_NEXT]], [[K]]
 ; IND-NEXT:    br i1 [[EXITCOND]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP45:![0-9]+]]
@@ -4873,16 +4787,14 @@ define void @nonprimary(i32* nocapture %a, i32 %start, i32 %i, i32 %k) {
 ; UNROLL-NEXT:    [[STEP_ADD:%.*]] = add <2 x i32> [[VEC_IND]], <i32 2, i32 2>
 ; UNROLL-NEXT:    [[OFFSET_IDX:%.*]] = add i32 [[INDEX]], [[I]]
 ; UNROLL-NEXT:    [[TMP1:%.*]] = sext i32 [[OFFSET_IDX]] to i64
-; UNROLL-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP1]]
-; UNROLL-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <2 x i32>*
-; UNROLL-NEXT:    store <2 x i32> [[VEC_IND]], <2 x i32>* [[TMP3]], align 4
-; UNROLL-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 2
-; UNROLL-NEXT:    [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <2 x i32>*
-; UNROLL-NEXT:    store <2 x i32> [[STEP_ADD]], <2 x i32>* [[TMP5]], align 4
+; UNROLL-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP1]]
+; UNROLL-NEXT:    store <2 x i32> [[VEC_IND]], ptr [[TMP2]], align 4
+; UNROLL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i64 2
+; UNROLL-NEXT:    store <2 x i32> [[STEP_ADD]], ptr [[TMP3]], align 4
 ; UNROLL-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
 ; UNROLL-NEXT:    [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], <i32 4, i32 4>
-; UNROLL-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; UNROLL-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP44:![0-9]+]]
+; UNROLL-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; UNROLL-NEXT:    br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP44:![0-9]+]]
 ; UNROLL:       middle.block:
 ; UNROLL-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[TMP0]], [[N_VEC]]
 ; UNROLL-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
@@ -4891,9 +4803,9 @@ define void @nonprimary(i32* nocapture %a, i32 %start, i32 %i, i32 %k) {
 ; UNROLL-NEXT:    br label [[FOR_BODY:%.*]]
 ; UNROLL:       for.body:
 ; UNROLL-NEXT:    [[INDVARS_IV:%.*]] = phi i32 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; UNROLL-NEXT:    [[TMP7:%.*]] = sext i32 [[INDVARS_IV]] to i64
-; UNROLL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP7]]
-; UNROLL-NEXT:    store i32 [[INDVARS_IV]], i32* [[ARRAYIDX]], align 4
+; UNROLL-NEXT:    [[TMP5:%.*]] = sext i32 [[INDVARS_IV]] to i64
+; UNROLL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP5]]
+; UNROLL-NEXT:    store i32 [[INDVARS_IV]], ptr [[ARRAYIDX]], align 4
 ; UNROLL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i32 [[INDVARS_IV]], 1
 ; UNROLL-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INDVARS_IV_NEXT]], [[K]]
 ; UNROLL-NEXT:    br i1 [[EXITCOND]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP45:![0-9]+]]
@@ -4920,18 +4832,16 @@ define void @nonprimary(i32* nocapture %a, i32 %start, i32 %i, i32 %k) {
 ; UNROLL-NO-IC-NEXT:    [[OFFSET_IDX:%.*]] = add i32 [[I]], [[INDEX]]
 ; UNROLL-NO-IC-NEXT:    [[TMP1:%.*]] = add i32 [[OFFSET_IDX]], 0
 ; UNROLL-NO-IC-NEXT:    [[TMP2:%.*]] = add i32 [[OFFSET_IDX]], 2
-; UNROLL-NO-IC-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[TMP1]]
-; UNROLL-NO-IC-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[TMP2]]
-; UNROLL-NO-IC-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP3]], i32 0
-; UNROLL-NO-IC-NEXT:    [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <2 x i32>*
-; UNROLL-NO-IC-NEXT:    store <2 x i32> [[VEC_IND]], <2 x i32>* [[TMP6]], align 4
-; UNROLL-NO-IC-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP3]], i32 2
-; UNROLL-NO-IC-NEXT:    [[TMP8:%.*]] = bitcast i32* [[TMP7]] to <2 x i32>*
-; UNROLL-NO-IC-NEXT:    store <2 x i32> [[STEP_ADD]], <2 x i32>* [[TMP8]], align 4
+; UNROLL-NO-IC-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[TMP1]]
+; UNROLL-NO-IC-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[TMP2]]
+; UNROLL-NO-IC-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 0
+; UNROLL-NO-IC-NEXT:    store <2 x i32> [[VEC_IND]], ptr [[TMP5]], align 4
+; UNROLL-NO-IC-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 2
+; UNROLL-NO-IC-NEXT:    store <2 x i32> [[STEP_ADD]], ptr [[TMP6]], align 4
 ; UNROLL-NO-IC-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
 ; UNROLL-NO-IC-NEXT:    [[VEC_IND_NEXT]] = add <2 x i32> [[STEP_ADD]], <i32 2, i32 2>
-; UNROLL-NO-IC-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; UNROLL-NO-IC-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP44:![0-9]+]]
+; UNROLL-NO-IC-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; UNROLL-NO-IC-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP44:![0-9]+]]
 ; UNROLL-NO-IC:       middle.block:
 ; UNROLL-NO-IC-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[TMP0]], [[N_VEC]]
 ; UNROLL-NO-IC-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
@@ -4940,8 +4850,8 @@ define void @nonprimary(i32* nocapture %a, i32 %start, i32 %i, i32 %k) {
 ; UNROLL-NO-IC-NEXT:    br label [[FOR_BODY:%.*]]
 ; UNROLL-NO-IC:       for.body:
 ; UNROLL-NO-IC-NEXT:    [[INDVARS_IV:%.*]] = phi i32 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; UNROLL-NO-IC-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[INDVARS_IV]]
-; UNROLL-NO-IC-NEXT:    store i32 [[INDVARS_IV]], i32* [[ARRAYIDX]], align 4
+; UNROLL-NO-IC-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[INDVARS_IV]]
+; UNROLL-NO-IC-NEXT:    store i32 [[INDVARS_IV]], ptr [[ARRAYIDX]], align 4
 ; UNROLL-NO-IC-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i32 [[INDVARS_IV]], 1
 ; UNROLL-NO-IC-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INDVARS_IV_NEXT]], [[K]]
 ; UNROLL-NO-IC-NEXT:    br i1 [[EXITCOND]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP45:![0-9]+]]
@@ -4966,16 +4876,14 @@ define void @nonprimary(i32* nocapture %a, i32 %start, i32 %i, i32 %k) {
 ; INTERLEAVE-NEXT:    [[STEP_ADD:%.*]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
 ; INTERLEAVE-NEXT:    [[OFFSET_IDX:%.*]] = add i32 [[INDEX]], [[I]]
 ; INTERLEAVE-NEXT:    [[TMP1:%.*]] = sext i32 [[OFFSET_IDX]] to i64
-; INTERLEAVE-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP1]]
-; INTERLEAVE-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
-; INTERLEAVE-NEXT:    store <4 x i32> [[VEC_IND]], <4 x i32>* [[TMP3]], align 4
-; INTERLEAVE-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 4
-; INTERLEAVE-NEXT:    [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <4 x i32>*
-; INTERLEAVE-NEXT:    store <4 x i32> [[STEP_ADD]], <4 x i32>* [[TMP5]], align 4
+; INTERLEAVE-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP1]]
+; INTERLEAVE-NEXT:    store <4 x i32> [[VEC_IND]], ptr [[TMP2]], align 4
+; INTERLEAVE-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i64 4
+; INTERLEAVE-NEXT:    store <4 x i32> [[STEP_ADD]], ptr [[TMP3]], align 4
 ; INTERLEAVE-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
 ; INTERLEAVE-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 8, i32 8, i32 8, i32 8>
-; INTERLEAVE-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; INTERLEAVE-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP44:![0-9]+]]
+; INTERLEAVE-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; INTERLEAVE-NEXT:    br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP44:![0-9]+]]
 ; INTERLEAVE:       middle.block:
 ; INTERLEAVE-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[TMP0]], [[N_VEC]]
 ; INTERLEAVE-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
@@ -4984,9 +4892,9 @@ define void @nonprimary(i32* nocapture %a, i32 %start, i32 %i, i32 %k) {
 ; INTERLEAVE-NEXT:    br label [[FOR_BODY:%.*]]
 ; INTERLEAVE:       for.body:
 ; INTERLEAVE-NEXT:    [[INDVARS_IV:%.*]] = phi i32 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; INTERLEAVE-NEXT:    [[TMP7:%.*]] = sext i32 [[INDVARS_IV]] to i64
-; INTERLEAVE-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP7]]
-; INTERLEAVE-NEXT:    store i32 [[INDVARS_IV]], i32* [[ARRAYIDX]], align 4
+; INTERLEAVE-NEXT:    [[TMP5:%.*]] = sext i32 [[INDVARS_IV]] to i64
+; INTERLEAVE-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP5]]
+; INTERLEAVE-NEXT:    store i32 [[INDVARS_IV]], ptr [[ARRAYIDX]], align 4
 ; INTERLEAVE-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i32 [[INDVARS_IV]], 1
 ; INTERLEAVE-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INDVARS_IV_NEXT]], [[K]]
 ; INTERLEAVE-NEXT:    br i1 [[EXITCOND]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP45:![0-9]+]]
@@ -4998,8 +4906,8 @@ for.body.preheader:
 
 for.body:
   %indvars.iv = phi i32 [ %indvars.iv.next, %for.body ], [ %i, %for.body.preheader ]
-  %arrayidx = getelementptr inbounds i32, i32* %a, i32 %indvars.iv
-  store i32 %indvars.iv, i32* %arrayidx, align 4
+  %arrayidx = getelementptr inbounds i32, ptr %a, i32 %indvars.iv
+  store i32 %indvars.iv, ptr %arrayidx, align 4
   %indvars.iv.next = add nuw nsw i32 %indvars.iv, 1
   %exitcond = icmp eq i32 %indvars.iv.next, %k
   br i1 %exitcond, label %exit, label %for.body
@@ -5008,7 +4916,7 @@ exit:
   ret void
 }
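(Only the pointer spelling changes in these tests; every getelementptr keeps
its explicit source element type as a leading operand, so the addressing
arithmetic is identical under opaque pointers. An illustrative line, not from
a specific test:

  %arrayidx = getelementptr inbounds i32, ptr %a, i32 %iv  ; still scales %iv by the size of i32
  store i32 %iv, ptr %arrayidx, align 4
)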
 
-define void @non_primary_iv_trunc(i32* %a, i64 %n) {
+define void @non_primary_iv_trunc(ptr %a, i64 %n) {
 ; CHECK-LABEL: @non_primary_iv_trunc(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1)
@@ -5023,14 +4931,13 @@ define void @non_primary_iv_trunc(i32* %a, i64 %n) {
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 0, i32 2>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <2 x i32>*
-; CHECK-NEXT:    store <2 x i32> [[VEC_IND]], <2 x i32>* [[TMP3]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
+; CHECK-NEXT:    store <2 x i32> [[VEC_IND]], ptr [[TMP2]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], <i32 4, i32 4>
-; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP46:![0-9]+]]
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP46:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[SMAX]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -5041,9 +4948,9 @@ define void @non_primary_iv_trunc(i32* %a, i64 %n) {
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; CHECK-NEXT:    [[J:%.*]] = phi i64 [ [[J_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[VAR0:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[I]]
+; CHECK-NEXT:    [[VAR0:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[I]]
 ; CHECK-NEXT:    [[VAR1:%.*]] = trunc i64 [[J]] to i32
-; CHECK-NEXT:    store i32 [[VAR1]], i32* [[VAR0]], align 4
+; CHECK-NEXT:    store i32 [[VAR1]], ptr [[VAR0]], align 4
 ; CHECK-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 1
 ; CHECK-NEXT:    [[J_NEXT]] = add nuw nsw i64 [[J]], 2
 ; CHECK-NEXT:    [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
@@ -5063,13 +4970,12 @@ define void @non_primary_iv_trunc(i32* %a, i64 %n) {
 ; IND:       vector.body:
 ; IND-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; IND-NEXT:    [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 0, i32 2>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; IND-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]]
-; IND-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <2 x i32>*
-; IND-NEXT:    store <2 x i32> [[VEC_IND]], <2 x i32>* [[TMP1]], align 4
+; IND-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]]
+; IND-NEXT:    store <2 x i32> [[VEC_IND]], ptr [[TMP0]], align 4
 ; IND-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; IND-NEXT:    [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], <i32 4, i32 4>
-; IND-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; IND-NEXT:    br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP46:![0-9]+]]
+; IND-NEXT:    [[TMP1:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IND-NEXT:    br i1 [[TMP1]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP46:![0-9]+]]
 ; IND:       middle.block:
 ; IND-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[SMAX]], [[N_VEC]]
 ; IND-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -5080,9 +4986,9 @@ define void @non_primary_iv_trunc(i32* %a, i64 %n) {
 ; IND:       for.body:
 ; IND-NEXT:    [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; IND-NEXT:    [[J:%.*]] = phi i64 [ [[J_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ]
-; IND-NEXT:    [[VAR0:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[I]]
+; IND-NEXT:    [[VAR0:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[I]]
 ; IND-NEXT:    [[VAR1:%.*]] = trunc i64 [[J]] to i32
-; IND-NEXT:    store i32 [[VAR1]], i32* [[VAR0]], align 4
+; IND-NEXT:    store i32 [[VAR1]], ptr [[VAR0]], align 4
 ; IND-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 1
 ; IND-NEXT:    [[J_NEXT]] = add nuw nsw i64 [[J]], 2
 ; IND-NEXT:    [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
@@ -5103,16 +5009,14 @@ define void @non_primary_iv_trunc(i32* %a, i64 %n) {
 ; UNROLL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; UNROLL-NEXT:    [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 0, i32 2>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; UNROLL-NEXT:    [[STEP_ADD:%.*]] = add <2 x i32> [[VEC_IND]], <i32 4, i32 4>
-; UNROLL-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]]
-; UNROLL-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <2 x i32>*
-; UNROLL-NEXT:    store <2 x i32> [[VEC_IND]], <2 x i32>* [[TMP1]], align 4
-; UNROLL-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 2
-; UNROLL-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <2 x i32>*
-; UNROLL-NEXT:    store <2 x i32> [[STEP_ADD]], <2 x i32>* [[TMP3]], align 4
+; UNROLL-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]]
+; UNROLL-NEXT:    store <2 x i32> [[VEC_IND]], ptr [[TMP0]], align 4
+; UNROLL-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 2
+; UNROLL-NEXT:    store <2 x i32> [[STEP_ADD]], ptr [[TMP1]], align 4
 ; UNROLL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; UNROLL-NEXT:    [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], <i32 8, i32 8>
-; UNROLL-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; UNROLL-NEXT:    br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP46:![0-9]+]]
+; UNROLL-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; UNROLL-NEXT:    br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP46:![0-9]+]]
 ; UNROLL:       middle.block:
 ; UNROLL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[SMAX]], [[N_VEC]]
 ; UNROLL-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -5123,9 +5027,9 @@ define void @non_primary_iv_trunc(i32* %a, i64 %n) {
 ; UNROLL:       for.body:
 ; UNROLL-NEXT:    [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; UNROLL-NEXT:    [[J:%.*]] = phi i64 [ [[J_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ]
-; UNROLL-NEXT:    [[VAR0:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[I]]
+; UNROLL-NEXT:    [[VAR0:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[I]]
 ; UNROLL-NEXT:    [[VAR1:%.*]] = trunc i64 [[J]] to i32
-; UNROLL-NEXT:    store i32 [[VAR1]], i32* [[VAR0]], align 4
+; UNROLL-NEXT:    store i32 [[VAR1]], ptr [[VAR0]], align 4
 ; UNROLL-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 1
 ; UNROLL-NEXT:    [[J_NEXT]] = add nuw nsw i64 [[J]], 2
 ; UNROLL-NEXT:    [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
@@ -5149,18 +5053,16 @@ define void @non_primary_iv_trunc(i32* %a, i64 %n) {
 ; UNROLL-NO-IC-NEXT:    [[STEP_ADD:%.*]] = add <2 x i32> [[VEC_IND]], <i32 4, i32 4>
 ; UNROLL-NO-IC-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
 ; UNROLL-NO-IC-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 2
-; UNROLL-NO-IC-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP0]]
-; UNROLL-NO-IC-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP1]]
-; UNROLL-NO-IC-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i32 0
-; UNROLL-NO-IC-NEXT:    [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <2 x i32>*
-; UNROLL-NO-IC-NEXT:    store <2 x i32> [[VEC_IND]], <2 x i32>* [[TMP5]], align 4
-; UNROLL-NO-IC-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i32 2
-; UNROLL-NO-IC-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <2 x i32>*
-; UNROLL-NO-IC-NEXT:    store <2 x i32> [[STEP_ADD]], <2 x i32>* [[TMP7]], align 4
+; UNROLL-NO-IC-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP0]]
+; UNROLL-NO-IC-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP1]]
+; UNROLL-NO-IC-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 0
+; UNROLL-NO-IC-NEXT:    store <2 x i32> [[VEC_IND]], ptr [[TMP4]], align 4
+; UNROLL-NO-IC-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 2
+; UNROLL-NO-IC-NEXT:    store <2 x i32> [[STEP_ADD]], ptr [[TMP5]], align 4
 ; UNROLL-NO-IC-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; UNROLL-NO-IC-NEXT:    [[VEC_IND_NEXT]] = add <2 x i32> [[STEP_ADD]], <i32 4, i32 4>
-; UNROLL-NO-IC-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; UNROLL-NO-IC-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP46:![0-9]+]]
+; UNROLL-NO-IC-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; UNROLL-NO-IC-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP46:![0-9]+]]
 ; UNROLL-NO-IC:       middle.block:
 ; UNROLL-NO-IC-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[SMAX]], [[N_VEC]]
 ; UNROLL-NO-IC-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -5171,9 +5073,9 @@ define void @non_primary_iv_trunc(i32* %a, i64 %n) {
 ; UNROLL-NO-IC:       for.body:
 ; UNROLL-NO-IC-NEXT:    [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; UNROLL-NO-IC-NEXT:    [[J:%.*]] = phi i64 [ [[J_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ]
-; UNROLL-NO-IC-NEXT:    [[VAR0:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[I]]
+; UNROLL-NO-IC-NEXT:    [[VAR0:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[I]]
 ; UNROLL-NO-IC-NEXT:    [[VAR1:%.*]] = trunc i64 [[J]] to i32
-; UNROLL-NO-IC-NEXT:    store i32 [[VAR1]], i32* [[VAR0]], align 4
+; UNROLL-NO-IC-NEXT:    store i32 [[VAR1]], ptr [[VAR0]], align 4
 ; UNROLL-NO-IC-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 1
 ; UNROLL-NO-IC-NEXT:    [[J_NEXT]] = add nuw nsw i64 [[J]], 2
 ; UNROLL-NO-IC-NEXT:    [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
@@ -5194,16 +5096,14 @@ define void @non_primary_iv_trunc(i32* %a, i64 %n) {
 ; INTERLEAVE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; INTERLEAVE-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 2, i32 4, i32 6>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; INTERLEAVE-NEXT:    [[STEP_ADD:%.*]] = add <4 x i32> [[VEC_IND]], <i32 8, i32 8, i32 8, i32 8>
-; INTERLEAVE-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]]
-; INTERLEAVE-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
-; INTERLEAVE-NEXT:    store <4 x i32> [[VEC_IND]], <4 x i32>* [[TMP1]], align 4
-; INTERLEAVE-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 4
-; INTERLEAVE-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
-; INTERLEAVE-NEXT:    store <4 x i32> [[STEP_ADD]], <4 x i32>* [[TMP3]], align 4
+; INTERLEAVE-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]]
+; INTERLEAVE-NEXT:    store <4 x i32> [[VEC_IND]], ptr [[TMP0]], align 4
+; INTERLEAVE-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 4
+; INTERLEAVE-NEXT:    store <4 x i32> [[STEP_ADD]], ptr [[TMP1]], align 4
 ; INTERLEAVE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
 ; INTERLEAVE-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 16, i32 16, i32 16, i32 16>
-; INTERLEAVE-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; INTERLEAVE-NEXT:    br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP46:![0-9]+]]
+; INTERLEAVE-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; INTERLEAVE-NEXT:    br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP46:![0-9]+]]
 ; INTERLEAVE:       middle.block:
 ; INTERLEAVE-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[SMAX]], [[N_VEC]]
 ; INTERLEAVE-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -5214,9 +5114,9 @@ define void @non_primary_iv_trunc(i32* %a, i64 %n) {
 ; INTERLEAVE:       for.body:
 ; INTERLEAVE-NEXT:    [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; INTERLEAVE-NEXT:    [[J:%.*]] = phi i64 [ [[J_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ]
-; INTERLEAVE-NEXT:    [[VAR0:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[I]]
+; INTERLEAVE-NEXT:    [[VAR0:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[I]]
 ; INTERLEAVE-NEXT:    [[VAR1:%.*]] = trunc i64 [[J]] to i32
-; INTERLEAVE-NEXT:    store i32 [[VAR1]], i32* [[VAR0]], align 4
+; INTERLEAVE-NEXT:    store i32 [[VAR1]], ptr [[VAR0]], align 4
 ; INTERLEAVE-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 1
 ; INTERLEAVE-NEXT:    [[J_NEXT]] = add nuw nsw i64 [[J]], 2
 ; INTERLEAVE-NEXT:    [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
@@ -5230,9 +5130,9 @@ entry:
 for.body:
   %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
   %j = phi i64 [ %j.next, %for.body ], [ 0, %entry ]
-  %var0 = getelementptr inbounds i32, i32* %a, i64 %i
+  %var0 = getelementptr inbounds i32, ptr %a, i64 %i
   %var1 = trunc i64 %j to i32
-  store i32 %var1, i32* %var0, align 4
+  store i32 %var1, ptr %var0, align 4
   %i.next = add nuw nsw i64 %i, 1
   %j.next = add nuw nsw i64 %j, 2
   %cond = icmp slt i64 %i.next, %n
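(The constants in the regenerated non_primary_iv_trunc checks follow directly
from the recurrence: %j advances by 2 per scalar iteration, so at VF=2 the
truncated lanes start as <i32 0, i32 2> and each vector iteration adds
<i32 4, i32 4>, i.e. two lanes times step 2. The unrolled and interleaved
variants scale the same way, e.g. <i32 0, i32 2, i32 4, i32 6> stepping by
<i32 16, i32 16, i32 16, i32 16> for four lanes times two parts.)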
@@ -6103,7 +6003,7 @@ loop:                                         ; preds = %loop, %entry
 }
 
 ; Test case for PR52460.
-define void @pr52460_first_order_recurrence_truncated_iv(i32* noalias %src, i32* %dst) {
+define void @pr52460_first_order_recurrence_truncated_iv(ptr noalias %src, ptr %dst) {
 ;
 ; CHECK-LABEL: @pr52460_first_order_recurrence_truncated_iv(
 ; CHECK-NEXT:  entry:
@@ -6111,25 +6011,24 @@ define void @pr52460_first_order_recurrence_truncated_iv(i32* noalias %src, i32*
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <2 x i32> [ <i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[VEC_IND:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_IND]] = phi <2 x i32> [ <i32 0, i32 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = trunc i64 [[INDEX]] to i32
+; CHECK-NEXT:    [[TMP0:%.*]] = trunc i64 [[OFFSET_IDX]] to i32
 ; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[TMP0]], 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i32> [[VECTOR_RECUR]], <2 x i32> [[VEC_IND]], <2 x i32> <i32 1, i32 2>
-; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* [[SRC:%.*]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[SRC:%.*]], align 4
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[TMP3]], i32 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP4:%.*]] = mul nsw <2 x i32> [[BROADCAST_SPLAT]], [[TMP2]]
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i32, i32* [[DST:%.*]], i32 [[TMP1]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i32, ptr [[DST:%.*]], i32 [[TMP1]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = add <2 x i32> [[VEC_IND]], [[TMP4]]
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i32, i32* [[TMP5]], i32 0
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i32* [[TMP7]] to <2 x i32>*
-; CHECK-NEXT:    store <2 x i32> [[TMP6]], <2 x i32>* [[TMP8]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i32, ptr [[TMP5]], i32 0
+; CHECK-NEXT:    store <2 x i32> [[TMP6]], ptr [[TMP7]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[OFFSET_IDX]], 2
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], <i32 2, i32 2>
-; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
-; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP52:![0-9]+]]
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
+; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP52:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 100, 100
 ; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <2 x i32> [[VEC_IND]], i32 1
@@ -6144,14 +6043,14 @@ define void @pr52460_first_order_recurrence_truncated_iv(i32* noalias %src, i32*
 ; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
 ; CHECK-NEXT:    [[TRUNC_IV:%.*]] = phi i32 [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[TRUNC_IV_NEXT:%.*]], [[LOOP]] ]
 ; CHECK-NEXT:    [[SCALAR_RECUR:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[IV_TRUNC:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[LV:%.*]] = load i32, i32* [[SRC]], align 4
+; CHECK-NEXT:    [[LV:%.*]] = load i32, ptr [[SRC]], align 4
 ; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i32 [[LV]], [[SCALAR_RECUR]]
 ; CHECK-NEXT:    [[TRUNC_IV_NEXT]] = add i32 [[TRUNC_IV]], 1
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT:    [[IV_TRUNC]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT:    [[DST_GEP:%.*]] = getelementptr i32, i32* [[DST]], i32 [[IV_TRUNC]]
+; CHECK-NEXT:    [[DST_GEP:%.*]] = getelementptr i32, ptr [[DST]], i32 [[IV_TRUNC]]
 ; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[IV_TRUNC]], [[MUL]]
-; CHECK-NEXT:    store i32 [[ADD]], i32* [[DST_GEP]], align 4
+; CHECK-NEXT:    store i32 [[ADD]], ptr [[DST_GEP]], align 4
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[TRUNC_IV_NEXT]], 100
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP53:![0-9]+]]
 ; CHECK:       exit:
@@ -6163,24 +6062,23 @@ define void @pr52460_first_order_recurrence_truncated_iv(i32* noalias %src, i32*
 ; IND:       vector.ph:
 ; IND-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; IND:       vector.body:
-; IND-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IND-NEXT:    [[OFFSET_IDX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; IND-NEXT:    [[VECTOR_RECUR:%.*]] = phi <2 x i32> [ <i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[VEC_IND:%.*]], [[VECTOR_BODY]] ]
 ; IND-NEXT:    [[VEC_IND]] = phi <2 x i32> [ <i32 0, i32 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; IND-NEXT:    [[TMP0:%.*]] = shufflevector <2 x i32> [[VECTOR_RECUR]], <2 x i32> [[VEC_IND]], <2 x i32> <i32 1, i32 2>
-; IND-NEXT:    [[TMP1:%.*]] = load i32, i32* [[SRC:%.*]], align 4
+; IND-NEXT:    [[TMP1:%.*]] = load i32, ptr [[SRC:%.*]], align 4
 ; IND-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[TMP1]], i64 0
 ; IND-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
 ; IND-NEXT:    [[TMP2:%.*]] = mul nsw <2 x i32> [[BROADCAST_SPLAT]], [[TMP0]]
-; IND-NEXT:    [[SEXT:%.*]] = shl i64 [[INDEX]], 32
+; IND-NEXT:    [[SEXT:%.*]] = shl i64 [[OFFSET_IDX]], 32
 ; IND-NEXT:    [[TMP3:%.*]] = ashr exact i64 [[SEXT]], 32
-; IND-NEXT:    [[TMP4:%.*]] = getelementptr i32, i32* [[DST:%.*]], i64 [[TMP3]]
+; IND-NEXT:    [[TMP4:%.*]] = getelementptr i32, ptr [[DST:%.*]], i64 [[TMP3]]
 ; IND-NEXT:    [[TMP5:%.*]] = add <2 x i32> [[VEC_IND]], [[TMP2]]
-; IND-NEXT:    [[TMP6:%.*]] = bitcast i32* [[TMP4]] to <2 x i32>*
-; IND-NEXT:    store <2 x i32> [[TMP5]], <2 x i32>* [[TMP6]], align 4
-; IND-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; IND-NEXT:    store <2 x i32> [[TMP5]], ptr [[TMP4]], align 4
+; IND-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[OFFSET_IDX]], 2
 ; IND-NEXT:    [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], <i32 2, i32 2>
-; IND-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
-; IND-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP52:![0-9]+]]
+; IND-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
+; IND-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP52:![0-9]+]]
 ; IND:       middle.block:
 ; IND-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; IND:       scalar.ph:
@@ -6196,31 +6094,29 @@ define void @pr52460_first_order_recurrence_truncated_iv(i32* noalias %src, i32*
 ; UNROLL:       vector.ph:
 ; UNROLL-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; UNROLL:       vector.body:
-; UNROLL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; UNROLL-NEXT:    [[OFFSET_IDX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; UNROLL-NEXT:    [[VECTOR_RECUR:%.*]] = phi <2 x i32> [ <i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[STEP_ADD:%.*]], [[VECTOR_BODY]] ]
 ; UNROLL-NEXT:    [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 0, i32 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; UNROLL-NEXT:    [[STEP_ADD]] = add <2 x i32> [[VEC_IND]], <i32 2, i32 2>
 ; UNROLL-NEXT:    [[TMP0:%.*]] = shufflevector <2 x i32> [[VECTOR_RECUR]], <2 x i32> [[VEC_IND]], <2 x i32> <i32 1, i32 2>
 ; UNROLL-NEXT:    [[TMP1:%.*]] = shufflevector <2 x i32> [[VEC_IND]], <2 x i32> [[STEP_ADD]], <2 x i32> <i32 1, i32 2>
-; UNROLL-NEXT:    [[TMP2:%.*]] = load i32, i32* [[SRC:%.*]], align 4
+; UNROLL-NEXT:    [[TMP2:%.*]] = load i32, ptr [[SRC:%.*]], align 4
 ; UNROLL-NEXT:    [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <2 x i32> poison, i32 [[TMP2]], i64 0
 ; UNROLL-NEXT:    [[BROADCAST_SPLAT4:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT3]], <2 x i32> poison, <2 x i32> zeroinitializer
 ; UNROLL-NEXT:    [[TMP3:%.*]] = mul nsw <2 x i32> [[BROADCAST_SPLAT4]], [[TMP0]]
 ; UNROLL-NEXT:    [[TMP4:%.*]] = mul nsw <2 x i32> [[BROADCAST_SPLAT4]], [[TMP1]]
-; UNROLL-NEXT:    [[SEXT:%.*]] = shl i64 [[INDEX]], 32
+; UNROLL-NEXT:    [[SEXT:%.*]] = shl i64 [[OFFSET_IDX]], 32
 ; UNROLL-NEXT:    [[TMP5:%.*]] = ashr exact i64 [[SEXT]], 32
-; UNROLL-NEXT:    [[TMP6:%.*]] = getelementptr i32, i32* [[DST:%.*]], i64 [[TMP5]]
+; UNROLL-NEXT:    [[TMP6:%.*]] = getelementptr i32, ptr [[DST:%.*]], i64 [[TMP5]]
 ; UNROLL-NEXT:    [[TMP7:%.*]] = add <2 x i32> [[VEC_IND]], [[TMP3]]
 ; UNROLL-NEXT:    [[TMP8:%.*]] = add <2 x i32> [[STEP_ADD]], [[TMP4]]
-; UNROLL-NEXT:    [[TMP9:%.*]] = bitcast i32* [[TMP6]] to <2 x i32>*
-; UNROLL-NEXT:    store <2 x i32> [[TMP7]], <2 x i32>* [[TMP9]], align 4
-; UNROLL-NEXT:    [[TMP10:%.*]] = getelementptr i32, i32* [[TMP6]], i64 2
-; UNROLL-NEXT:    [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <2 x i32>*
-; UNROLL-NEXT:    store <2 x i32> [[TMP8]], <2 x i32>* [[TMP11]], align 4
-; UNROLL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; UNROLL-NEXT:    store <2 x i32> [[TMP7]], ptr [[TMP6]], align 4
+; UNROLL-NEXT:    [[TMP9:%.*]] = getelementptr i32, ptr [[TMP6]], i64 2
+; UNROLL-NEXT:    store <2 x i32> [[TMP8]], ptr [[TMP9]], align 4
+; UNROLL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[OFFSET_IDX]], 4
 ; UNROLL-NEXT:    [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], <i32 4, i32 4>
-; UNROLL-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
-; UNROLL-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP52:![0-9]+]]
+; UNROLL-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
+; UNROLL-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP52:![0-9]+]]
 ; UNROLL:       middle.block:
 ; UNROLL-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; UNROLL:       scalar.ph:
@@ -6236,34 +6132,32 @@ define void @pr52460_first_order_recurrence_truncated_iv(i32* noalias %src, i32*
 ; UNROLL-NO-IC:       vector.ph:
 ; UNROLL-NO-IC-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; UNROLL-NO-IC:       vector.body:
-; UNROLL-NO-IC-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; UNROLL-NO-IC-NEXT:    [[OFFSET_IDX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; UNROLL-NO-IC-NEXT:    [[VECTOR_RECUR:%.*]] = phi <2 x i32> [ <i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[STEP_ADD:%.*]], [[VECTOR_BODY]] ]
 ; UNROLL-NO-IC-NEXT:    [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 0, i32 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; UNROLL-NO-IC-NEXT:    [[STEP_ADD]] = add <2 x i32> [[VEC_IND]], <i32 2, i32 2>
-; UNROLL-NO-IC-NEXT:    [[TMP0:%.*]] = trunc i64 [[INDEX]] to i32
+; UNROLL-NO-IC-NEXT:    [[TMP0:%.*]] = trunc i64 [[OFFSET_IDX]] to i32
 ; UNROLL-NO-IC-NEXT:    [[TMP1:%.*]] = add i32 [[TMP0]], 0
 ; UNROLL-NO-IC-NEXT:    [[TMP2:%.*]] = add i32 [[TMP0]], 2
 ; UNROLL-NO-IC-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i32> [[VECTOR_RECUR]], <2 x i32> [[VEC_IND]], <2 x i32> <i32 1, i32 2>
 ; UNROLL-NO-IC-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i32> [[VEC_IND]], <2 x i32> [[STEP_ADD]], <2 x i32> <i32 1, i32 2>
-; UNROLL-NO-IC-NEXT:    [[TMP5:%.*]] = load i32, i32* [[SRC:%.*]], align 4
+; UNROLL-NO-IC-NEXT:    [[TMP5:%.*]] = load i32, ptr [[SRC:%.*]], align 4
 ; UNROLL-NO-IC-NEXT:    [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <2 x i32> poison, i32 [[TMP5]], i32 0
 ; UNROLL-NO-IC-NEXT:    [[BROADCAST_SPLAT4:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT3]], <2 x i32> poison, <2 x i32> zeroinitializer
 ; UNROLL-NO-IC-NEXT:    [[TMP6:%.*]] = mul nsw <2 x i32> [[BROADCAST_SPLAT4]], [[TMP3]]
 ; UNROLL-NO-IC-NEXT:    [[TMP7:%.*]] = mul nsw <2 x i32> [[BROADCAST_SPLAT4]], [[TMP4]]
-; UNROLL-NO-IC-NEXT:    [[TMP8:%.*]] = getelementptr i32, i32* [[DST:%.*]], i32 [[TMP1]]
-; UNROLL-NO-IC-NEXT:    [[TMP9:%.*]] = getelementptr i32, i32* [[DST]], i32 [[TMP2]]
+; UNROLL-NO-IC-NEXT:    [[TMP8:%.*]] = getelementptr i32, ptr [[DST:%.*]], i32 [[TMP1]]
+; UNROLL-NO-IC-NEXT:    [[TMP9:%.*]] = getelementptr i32, ptr [[DST]], i32 [[TMP2]]
 ; UNROLL-NO-IC-NEXT:    [[TMP10:%.*]] = add <2 x i32> [[VEC_IND]], [[TMP6]]
 ; UNROLL-NO-IC-NEXT:    [[TMP11:%.*]] = add <2 x i32> [[STEP_ADD]], [[TMP7]]
-; UNROLL-NO-IC-NEXT:    [[TMP12:%.*]] = getelementptr i32, i32* [[TMP8]], i32 0
-; UNROLL-NO-IC-NEXT:    [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <2 x i32>*
-; UNROLL-NO-IC-NEXT:    store <2 x i32> [[TMP10]], <2 x i32>* [[TMP13]], align 4
-; UNROLL-NO-IC-NEXT:    [[TMP14:%.*]] = getelementptr i32, i32* [[TMP8]], i32 2
-; UNROLL-NO-IC-NEXT:    [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <2 x i32>*
-; UNROLL-NO-IC-NEXT:    store <2 x i32> [[TMP11]], <2 x i32>* [[TMP15]], align 4
-; UNROLL-NO-IC-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; UNROLL-NO-IC-NEXT:    [[TMP12:%.*]] = getelementptr i32, ptr [[TMP8]], i32 0
+; UNROLL-NO-IC-NEXT:    store <2 x i32> [[TMP10]], ptr [[TMP12]], align 4
+; UNROLL-NO-IC-NEXT:    [[TMP13:%.*]] = getelementptr i32, ptr [[TMP8]], i32 2
+; UNROLL-NO-IC-NEXT:    store <2 x i32> [[TMP11]], ptr [[TMP13]], align 4
+; UNROLL-NO-IC-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[OFFSET_IDX]], 4
 ; UNROLL-NO-IC-NEXT:    [[VEC_IND_NEXT]] = add <2 x i32> [[STEP_ADD]], <i32 2, i32 2>
-; UNROLL-NO-IC-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
-; UNROLL-NO-IC-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP52:![0-9]+]]
+; UNROLL-NO-IC-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
+; UNROLL-NO-IC-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP52:![0-9]+]]
 ; UNROLL-NO-IC:       middle.block:
 ; UNROLL-NO-IC-NEXT:    [[CMP_N:%.*]] = icmp eq i64 100, 100
 ; UNROLL-NO-IC-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <2 x i32> [[STEP_ADD]], i32 1
@@ -6278,14 +6172,14 @@ define void @pr52460_first_order_recurrence_truncated_iv(i32* noalias %src, i32*
 ; UNROLL-NO-IC-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
 ; UNROLL-NO-IC-NEXT:    [[TRUNC_IV:%.*]] = phi i32 [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[TRUNC_IV_NEXT:%.*]], [[LOOP]] ]
 ; UNROLL-NO-IC-NEXT:    [[SCALAR_RECUR:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[IV_TRUNC:%.*]], [[LOOP]] ]
-; UNROLL-NO-IC-NEXT:    [[LV:%.*]] = load i32, i32* [[SRC]], align 4
+; UNROLL-NO-IC-NEXT:    [[LV:%.*]] = load i32, ptr [[SRC]], align 4
 ; UNROLL-NO-IC-NEXT:    [[MUL:%.*]] = mul nsw i32 [[LV]], [[SCALAR_RECUR]]
 ; UNROLL-NO-IC-NEXT:    [[TRUNC_IV_NEXT]] = add i32 [[TRUNC_IV]], 1
 ; UNROLL-NO-IC-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; UNROLL-NO-IC-NEXT:    [[IV_TRUNC]] = trunc i64 [[IV]] to i32
-; UNROLL-NO-IC-NEXT:    [[DST_GEP:%.*]] = getelementptr i32, i32* [[DST]], i32 [[IV_TRUNC]]
+; UNROLL-NO-IC-NEXT:    [[DST_GEP:%.*]] = getelementptr i32, ptr [[DST]], i32 [[IV_TRUNC]]
 ; UNROLL-NO-IC-NEXT:    [[ADD:%.*]] = add i32 [[IV_TRUNC]], [[MUL]]
-; UNROLL-NO-IC-NEXT:    store i32 [[ADD]], i32* [[DST_GEP]], align 4
+; UNROLL-NO-IC-NEXT:    store i32 [[ADD]], ptr [[DST_GEP]], align 4
 ; UNROLL-NO-IC-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[TRUNC_IV_NEXT]], 100
 ; UNROLL-NO-IC-NEXT:    br i1 [[EXITCOND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP53:![0-9]+]]
 ; UNROLL-NO-IC:       exit:
@@ -6297,31 +6191,29 @@ define void @pr52460_first_order_recurrence_truncated_iv(i32* noalias %src, i32*
 ; INTERLEAVE:       vector.ph:
 ; INTERLEAVE-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; INTERLEAVE:       vector.body:
-; INTERLEAVE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; INTERLEAVE-NEXT:    [[OFFSET_IDX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; INTERLEAVE-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ <i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[STEP_ADD:%.*]], [[VECTOR_BODY]] ]
 ; INTERLEAVE-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; INTERLEAVE-NEXT:    [[STEP_ADD]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
 ; INTERLEAVE-NEXT:    [[TMP0:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[VEC_IND]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
 ; INTERLEAVE-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[VEC_IND]], <4 x i32> [[STEP_ADD]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-; INTERLEAVE-NEXT:    [[TMP2:%.*]] = load i32, i32* [[SRC:%.*]], align 4
+; INTERLEAVE-NEXT:    [[TMP2:%.*]] = load i32, ptr [[SRC:%.*]], align 4
 ; INTERLEAVE-NEXT:    [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <4 x i32> poison, i32 [[TMP2]], i64 0
 ; INTERLEAVE-NEXT:    [[BROADCAST_SPLAT4:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT3]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; INTERLEAVE-NEXT:    [[TMP3:%.*]] = mul nsw <4 x i32> [[BROADCAST_SPLAT4]], [[TMP0]]
 ; INTERLEAVE-NEXT:    [[TMP4:%.*]] = mul nsw <4 x i32> [[BROADCAST_SPLAT4]], [[TMP1]]
-; INTERLEAVE-NEXT:    [[SEXT:%.*]] = shl i64 [[INDEX]], 32
+; INTERLEAVE-NEXT:    [[SEXT:%.*]] = shl i64 [[OFFSET_IDX]], 32
 ; INTERLEAVE-NEXT:    [[TMP5:%.*]] = ashr exact i64 [[SEXT]], 32
-; INTERLEAVE-NEXT:    [[TMP6:%.*]] = getelementptr i32, i32* [[DST:%.*]], i64 [[TMP5]]
+; INTERLEAVE-NEXT:    [[TMP6:%.*]] = getelementptr i32, ptr [[DST:%.*]], i64 [[TMP5]]
 ; INTERLEAVE-NEXT:    [[TMP7:%.*]] = add <4 x i32> [[VEC_IND]], [[TMP3]]
 ; INTERLEAVE-NEXT:    [[TMP8:%.*]] = add <4 x i32> [[STEP_ADD]], [[TMP4]]
-; INTERLEAVE-NEXT:    [[TMP9:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
-; INTERLEAVE-NEXT:    store <4 x i32> [[TMP7]], <4 x i32>* [[TMP9]], align 4
-; INTERLEAVE-NEXT:    [[TMP10:%.*]] = getelementptr i32, i32* [[TMP6]], i64 4
-; INTERLEAVE-NEXT:    [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <4 x i32>*
-; INTERLEAVE-NEXT:    store <4 x i32> [[TMP8]], <4 x i32>* [[TMP11]], align 4
-; INTERLEAVE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; INTERLEAVE-NEXT:    store <4 x i32> [[TMP7]], ptr [[TMP6]], align 4
+; INTERLEAVE-NEXT:    [[TMP9:%.*]] = getelementptr i32, ptr [[TMP6]], i64 4
+; INTERLEAVE-NEXT:    store <4 x i32> [[TMP8]], ptr [[TMP9]], align 4
+; INTERLEAVE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[OFFSET_IDX]], 8
 ; INTERLEAVE-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 8, i32 8, i32 8, i32 8>
-; INTERLEAVE-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 96
-; INTERLEAVE-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP52:![0-9]+]]
+; INTERLEAVE-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 96
+; INTERLEAVE-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP52:![0-9]+]]
 ; INTERLEAVE:       middle.block:
 ; INTERLEAVE-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[STEP_ADD]], i64 3
 ; INTERLEAVE-NEXT:    br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
@@ -6334,16 +6226,16 @@ define void @pr52460_first_order_recurrence_truncated_iv(i32* noalias %src, i32*
 ; INTERLEAVE-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
 ; INTERLEAVE-NEXT:    [[TRUNC_IV:%.*]] = phi i32 [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[TRUNC_IV_NEXT:%.*]], [[LOOP]] ]
 ; INTERLEAVE-NEXT:    [[SCALAR_RECUR:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[IV_TRUNC:%.*]], [[LOOP]] ]
-; INTERLEAVE-NEXT:    [[LV:%.*]] = load i32, i32* [[SRC]], align 4
+; INTERLEAVE-NEXT:    [[LV:%.*]] = load i32, ptr [[SRC]], align 4
 ; INTERLEAVE-NEXT:    [[MUL:%.*]] = mul nsw i32 [[LV]], [[SCALAR_RECUR]]
 ; INTERLEAVE-NEXT:    [[TRUNC_IV_NEXT]] = add i32 [[TRUNC_IV]], 1
 ; INTERLEAVE-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; INTERLEAVE-NEXT:    [[IV_TRUNC]] = trunc i64 [[IV]] to i32
 ; INTERLEAVE-NEXT:    [[SEXT5:%.*]] = shl i64 [[IV]], 32
-; INTERLEAVE-NEXT:    [[TMP13:%.*]] = ashr exact i64 [[SEXT5]], 32
-; INTERLEAVE-NEXT:    [[DST_GEP:%.*]] = getelementptr i32, i32* [[DST]], i64 [[TMP13]]
+; INTERLEAVE-NEXT:    [[TMP11:%.*]] = ashr exact i64 [[SEXT5]], 32
+; INTERLEAVE-NEXT:    [[DST_GEP:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP11]]
 ; INTERLEAVE-NEXT:    [[ADD:%.*]] = add i32 [[MUL]], [[IV_TRUNC]]
-; INTERLEAVE-NEXT:    store i32 [[ADD]], i32* [[DST_GEP]], align 4
+; INTERLEAVE-NEXT:    store i32 [[ADD]], ptr [[DST_GEP]], align 4
 ; INTERLEAVE-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[TRUNC_IV_NEXT]], 100
 ; INTERLEAVE-NEXT:    br i1 [[EXITCOND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP53:![0-9]+]]
 ; INTERLEAVE:       exit:
@@ -6356,14 +6248,14 @@ loop:
   %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
   %trunc.iv = phi i32 [ 0, %entry ], [ %trunc.iv.next, %loop ]
   %recur = phi i32 [ 0, %entry ], [ %iv.trunc, %loop ]
-  %lv = load i32, i32* %src, align 4
+  %lv = load i32, ptr %src, align 4
   %mul = mul nsw i32 %lv, %recur
   %trunc.iv.next  = add i32 %trunc.iv, 1
   %iv.next = add nuw nsw i64 %iv, 1
   %iv.trunc = trunc i64 %iv to i32
-  %dst.gep = getelementptr i32, i32* %dst, i32 %iv.trunc
+  %dst.gep = getelementptr i32, ptr %dst, i32 %iv.trunc
   %add = add i32 %iv.trunc, %mul
-  store i32 %add, i32* %dst.gep
+  store i32 %add, ptr %dst.gep
   %exitcond = icmp eq i32 %trunc.iv.next, 100
   br i1 %exitcond, label %exit, label %loop
 
@@ -6374,7 +6266,7 @@ exit:
 ; Test case where %iv.2.ext and %iv.2.conv become redundant due to the SCEV
 ; predicates generated for the vector loop. They should be removed in the
 ; vector loop.
-define void @test_optimized_cast_induction_feeding_first_order_recurrence(i64 %n, i32 %step, i32* %ptr) {
+define void @test_optimized_cast_induction_feeding_first_order_recurrence(i64 %n, i32 %step, ptr %ptr) {
 ;
 ; CHECK-LABEL: @test_optimized_cast_induction_feeding_first_order_recurrence(
 ; CHECK-NEXT:  entry:
@@ -6406,8 +6298,8 @@ define void @test_optimized_cast_induction_feeding_first_order_recurrence(i64 %n
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 2
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[CAST_VTC:%.*]] = trunc i64 [[N_VEC]] to i32
-; CHECK-NEXT:    [[IND_END:%.*]] = mul i32 [[CAST_VTC]], [[STEP]]
+; CHECK-NEXT:    [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32
+; CHECK-NEXT:    [[IND_END:%.*]] = mul i32 [[DOTCAST]], [[STEP]]
 ; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[STEP]], i32 0
 ; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP17:%.*]] = mul <2 x i32> <i32 0, i32 1>, [[DOTSPLAT]]
@@ -6422,14 +6314,13 @@ define void @test_optimized_cast_induction_feeding_first_order_recurrence(i64 %n
 ; CHECK-NEXT:    [[VEC_IND]] = phi <2 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP19:%.*]] = add i64 [[INDEX]], 0
 ; CHECK-NEXT:    [[TMP20:%.*]] = shufflevector <2 x i32> [[VECTOR_RECUR]], <2 x i32> [[VEC_IND]], <2 x i32> <i32 1, i32 2>
-; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[PTR:%.*]], i64 [[TMP19]]
-; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i32, i32* [[TMP21]], i32 0
-; CHECK-NEXT:    [[TMP23:%.*]] = bitcast i32* [[TMP22]] to <2 x i32>*
-; CHECK-NEXT:    store <2 x i32> [[TMP20]], <2 x i32>* [[TMP23]], align 4
+; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i64 [[TMP19]]
+; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i32 0
+; CHECK-NEXT:    store <2 x i32> [[TMP20]], ptr [[TMP22]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], [[DOTSPLAT3]]
-; CHECK-NEXT:    [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP54:![0-9]+]]
+; CHECK-NEXT:    [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP54:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <2 x i32> [[VEC_IND]], i32 1
@@ -6446,8 +6337,8 @@ define void @test_optimized_cast_induction_feeding_first_order_recurrence(i64 %n
 ; CHECK-NEXT:    [[IV_2:%.*]] = phi i32 [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[IV_2_NEXT:%.*]], [[LOOP]] ]
 ; CHECK-NEXT:    [[IV_2_EXT:%.*]] = shl i32 [[IV_2]], 24
 ; CHECK-NEXT:    [[IV_2_CONV]] = ashr exact i32 [[IV_2_EXT]], 24
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i32, i32* [[PTR]], i64 [[IV_1]]
-; CHECK-NEXT:    store i32 [[SCALAR_RECUR]], i32* [[GEP]], align 4
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i32, ptr [[PTR]], i64 [[IV_1]]
+; CHECK-NEXT:    store i32 [[SCALAR_RECUR]], ptr [[GEP]], align 4
 ; CHECK-NEXT:    [[IV_2_NEXT]] = add nsw i32 [[IV_2_CONV]], [[STEP]]
 ; CHECK-NEXT:    [[IV_1_NEXT]] = add nuw nsw i64 [[IV_1]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_1_NEXT]], [[N]]
@@ -6482,8 +6373,8 @@ define void @test_optimized_cast_induction_feeding_first_order_recurrence(i64 %n
 ; IND-NEXT:    br i1 [[TMP15]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
 ; IND:       vector.ph:
 ; IND-NEXT:    [[N_VEC:%.*]] = and i64 [[N]], -2
-; IND-NEXT:    [[CAST_VTC:%.*]] = trunc i64 [[N_VEC]] to i32
-; IND-NEXT:    [[IND_END:%.*]] = mul i32 [[CAST_VTC]], [[STEP]]
+; IND-NEXT:    [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32
+; IND-NEXT:    [[IND_END:%.*]] = mul i32 [[DOTCAST]], [[STEP]]
 ; IND-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[STEP]], i64 0
 ; IND-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
 ; IND-NEXT:    [[TMP16:%.*]] = mul nuw <2 x i32> [[DOTSPLAT]], <i32 0, i32 1>
@@ -6496,13 +6387,12 @@ define void @test_optimized_cast_induction_feeding_first_order_recurrence(i64 %n
 ; IND-NEXT:    [[VECTOR_RECUR:%.*]] = phi <2 x i32> [ <i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[VEC_IND:%.*]], [[VECTOR_BODY]] ]
 ; IND-NEXT:    [[VEC_IND]] = phi <2 x i32> [ [[TMP16]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; IND-NEXT:    [[TMP18:%.*]] = shufflevector <2 x i32> [[VECTOR_RECUR]], <2 x i32> [[VEC_IND]], <2 x i32> <i32 1, i32 2>
-; IND-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i32, i32* [[PTR:%.*]], i64 [[INDEX]]
-; IND-NEXT:    [[TMP20:%.*]] = bitcast i32* [[TMP19]] to <2 x i32>*
-; IND-NEXT:    store <2 x i32> [[TMP18]], <2 x i32>* [[TMP20]], align 4
+; IND-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i64 [[INDEX]]
+; IND-NEXT:    store <2 x i32> [[TMP18]], ptr [[TMP19]], align 4
 ; IND-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; IND-NEXT:    [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], [[DOTSPLAT3]]
-; IND-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; IND-NEXT:    br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP54:![0-9]+]]
+; IND-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IND-NEXT:    br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP54:![0-9]+]]
 ; IND:       middle.block:
 ; IND-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[N]]
 ; IND-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <2 x i32> [[VEC_IND]], i64 1
@@ -6518,8 +6408,8 @@ define void @test_optimized_cast_induction_feeding_first_order_recurrence(i64 %n
 ; IND-NEXT:    [[IV_2:%.*]] = phi i32 [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[IV_2_NEXT:%.*]], [[LOOP]] ]
 ; IND-NEXT:    [[IV_2_EXT:%.*]] = shl i32 [[IV_2]], 24
 ; IND-NEXT:    [[IV_2_CONV]] = ashr exact i32 [[IV_2_EXT]], 24
-; IND-NEXT:    [[GEP:%.*]] = getelementptr inbounds i32, i32* [[PTR]], i64 [[IV_1]]
-; IND-NEXT:    store i32 [[SCALAR_RECUR]], i32* [[GEP]], align 4
+; IND-NEXT:    [[GEP:%.*]] = getelementptr inbounds i32, ptr [[PTR]], i64 [[IV_1]]
+; IND-NEXT:    store i32 [[SCALAR_RECUR]], ptr [[GEP]], align 4
 ; IND-NEXT:    [[IV_2_NEXT]] = add nsw i32 [[IV_2_CONV]], [[STEP]]
 ; IND-NEXT:    [[IV_1_NEXT]] = add nuw nsw i64 [[IV_1]], 1
 ; IND-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_1_NEXT]], [[N]]
@@ -6554,8 +6444,8 @@ define void @test_optimized_cast_induction_feeding_first_order_recurrence(i64 %n
 ; UNROLL-NEXT:    br i1 [[TMP15]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
 ; UNROLL:       vector.ph:
 ; UNROLL-NEXT:    [[N_VEC:%.*]] = and i64 [[N]], -4
-; UNROLL-NEXT:    [[CAST_VTC:%.*]] = trunc i64 [[N_VEC]] to i32
-; UNROLL-NEXT:    [[IND_END:%.*]] = mul i32 [[CAST_VTC]], [[STEP]]
+; UNROLL-NEXT:    [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32
+; UNROLL-NEXT:    [[IND_END:%.*]] = mul i32 [[DOTCAST]], [[STEP]]
 ; UNROLL-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[STEP]], i64 0
 ; UNROLL-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
 ; UNROLL-NEXT:    [[TMP16:%.*]] = mul nuw <2 x i32> [[DOTSPLAT]], <i32 0, i32 1>
@@ -6570,16 +6460,14 @@ define void @test_optimized_cast_induction_feeding_first_order_recurrence(i64 %n
 ; UNROLL-NEXT:    [[STEP_ADD]] = add <2 x i32> [[VEC_IND]], [[DOTSPLAT3]]
 ; UNROLL-NEXT:    [[TMP18:%.*]] = shufflevector <2 x i32> [[VECTOR_RECUR]], <2 x i32> [[VEC_IND]], <2 x i32> <i32 1, i32 2>
 ; UNROLL-NEXT:    [[TMP19:%.*]] = shufflevector <2 x i32> [[VEC_IND]], <2 x i32> [[STEP_ADD]], <2 x i32> <i32 1, i32 2>
-; UNROLL-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[PTR:%.*]], i64 [[INDEX]]
-; UNROLL-NEXT:    [[TMP21:%.*]] = bitcast i32* [[TMP20]] to <2 x i32>*
-; UNROLL-NEXT:    store <2 x i32> [[TMP18]], <2 x i32>* [[TMP21]], align 4
-; UNROLL-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i64 2
-; UNROLL-NEXT:    [[TMP23:%.*]] = bitcast i32* [[TMP22]] to <2 x i32>*
-; UNROLL-NEXT:    store <2 x i32> [[TMP19]], <2 x i32>* [[TMP23]], align 4
+; UNROLL-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i64 [[INDEX]]
+; UNROLL-NEXT:    store <2 x i32> [[TMP18]], ptr [[TMP20]], align 4
+; UNROLL-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[TMP20]], i64 2
+; UNROLL-NEXT:    store <2 x i32> [[TMP19]], ptr [[TMP21]], align 4
 ; UNROLL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; UNROLL-NEXT:    [[VEC_IND_NEXT]] = add <2 x i32> [[STEP_ADD]], [[DOTSPLAT3]]
-; UNROLL-NEXT:    [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; UNROLL-NEXT:    br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP54:![0-9]+]]
+; UNROLL-NEXT:    [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; UNROLL-NEXT:    br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP54:![0-9]+]]
 ; UNROLL:       middle.block:
 ; UNROLL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[N]]
 ; UNROLL-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <2 x i32> [[STEP_ADD]], i64 1
@@ -6595,8 +6483,8 @@ define void @test_optimized_cast_induction_feeding_first_order_recurrence(i64 %n
 ; UNROLL-NEXT:    [[IV_2:%.*]] = phi i32 [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[IV_2_NEXT:%.*]], [[LOOP]] ]
 ; UNROLL-NEXT:    [[IV_2_EXT:%.*]] = shl i32 [[IV_2]], 24
 ; UNROLL-NEXT:    [[IV_2_CONV]] = ashr exact i32 [[IV_2_EXT]], 24
-; UNROLL-NEXT:    [[GEP:%.*]] = getelementptr inbounds i32, i32* [[PTR]], i64 [[IV_1]]
-; UNROLL-NEXT:    store i32 [[SCALAR_RECUR]], i32* [[GEP]], align 4
+; UNROLL-NEXT:    [[GEP:%.*]] = getelementptr inbounds i32, ptr [[PTR]], i64 [[IV_1]]
+; UNROLL-NEXT:    store i32 [[SCALAR_RECUR]], ptr [[GEP]], align 4
 ; UNROLL-NEXT:    [[IV_2_NEXT]] = add nsw i32 [[IV_2_CONV]], [[STEP]]
 ; UNROLL-NEXT:    [[IV_1_NEXT]] = add nuw nsw i64 [[IV_1]], 1
 ; UNROLL-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_1_NEXT]], [[N]]
@@ -6634,8 +6522,8 @@ define void @test_optimized_cast_induction_feeding_first_order_recurrence(i64 %n
 ; UNROLL-NO-IC:       vector.ph:
 ; UNROLL-NO-IC-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
 ; UNROLL-NO-IC-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; UNROLL-NO-IC-NEXT:    [[CAST_VTC:%.*]] = trunc i64 [[N_VEC]] to i32
-; UNROLL-NO-IC-NEXT:    [[IND_END:%.*]] = mul i32 [[CAST_VTC]], [[STEP]]
+; UNROLL-NO-IC-NEXT:    [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32
+; UNROLL-NO-IC-NEXT:    [[IND_END:%.*]] = mul i32 [[DOTCAST]], [[STEP]]
 ; UNROLL-NO-IC-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[STEP]], i32 0
 ; UNROLL-NO-IC-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
 ; UNROLL-NO-IC-NEXT:    [[TMP17:%.*]] = mul <2 x i32> <i32 0, i32 1>, [[DOTSPLAT]]
@@ -6653,18 +6541,16 @@ define void @test_optimized_cast_induction_feeding_first_order_recurrence(i64 %n
 ; UNROLL-NO-IC-NEXT:    [[TMP20:%.*]] = add i64 [[INDEX]], 2
 ; UNROLL-NO-IC-NEXT:    [[TMP21:%.*]] = shufflevector <2 x i32> [[VECTOR_RECUR]], <2 x i32> [[VEC_IND]], <2 x i32> <i32 1, i32 2>
 ; UNROLL-NO-IC-NEXT:    [[TMP22:%.*]] = shufflevector <2 x i32> [[VEC_IND]], <2 x i32> [[STEP_ADD]], <2 x i32> <i32 1, i32 2>
-; UNROLL-NO-IC-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[PTR:%.*]], i64 [[TMP19]]
-; UNROLL-NO-IC-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i32, i32* [[PTR]], i64 [[TMP20]]
-; UNROLL-NO-IC-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[TMP23]], i32 0
-; UNROLL-NO-IC-NEXT:    [[TMP26:%.*]] = bitcast i32* [[TMP25]] to <2 x i32>*
-; UNROLL-NO-IC-NEXT:    store <2 x i32> [[TMP21]], <2 x i32>* [[TMP26]], align 4
-; UNROLL-NO-IC-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i32, i32* [[TMP23]], i32 2
-; UNROLL-NO-IC-NEXT:    [[TMP28:%.*]] = bitcast i32* [[TMP27]] to <2 x i32>*
-; UNROLL-NO-IC-NEXT:    store <2 x i32> [[TMP22]], <2 x i32>* [[TMP28]], align 4
+; UNROLL-NO-IC-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i64 [[TMP19]]
+; UNROLL-NO-IC-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[PTR]], i64 [[TMP20]]
+; UNROLL-NO-IC-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[TMP23]], i32 0
+; UNROLL-NO-IC-NEXT:    store <2 x i32> [[TMP21]], ptr [[TMP25]], align 4
+; UNROLL-NO-IC-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[TMP23]], i32 2
+; UNROLL-NO-IC-NEXT:    store <2 x i32> [[TMP22]], ptr [[TMP26]], align 4
 ; UNROLL-NO-IC-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; UNROLL-NO-IC-NEXT:    [[VEC_IND_NEXT]] = add <2 x i32> [[STEP_ADD]], [[DOTSPLAT3]]
-; UNROLL-NO-IC-NEXT:    [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; UNROLL-NO-IC-NEXT:    br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP54:![0-9]+]]
+; UNROLL-NO-IC-NEXT:    [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; UNROLL-NO-IC-NEXT:    br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP54:![0-9]+]]
 ; UNROLL-NO-IC:       middle.block:
 ; UNROLL-NO-IC-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; UNROLL-NO-IC-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <2 x i32> [[STEP_ADD]], i32 1
@@ -6681,8 +6567,8 @@ define void @test_optimized_cast_induction_feeding_first_order_recurrence(i64 %n
 ; UNROLL-NO-IC-NEXT:    [[IV_2:%.*]] = phi i32 [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[IV_2_NEXT:%.*]], [[LOOP]] ]
 ; UNROLL-NO-IC-NEXT:    [[IV_2_EXT:%.*]] = shl i32 [[IV_2]], 24
 ; UNROLL-NO-IC-NEXT:    [[IV_2_CONV]] = ashr exact i32 [[IV_2_EXT]], 24
-; UNROLL-NO-IC-NEXT:    [[GEP:%.*]] = getelementptr inbounds i32, i32* [[PTR]], i64 [[IV_1]]
-; UNROLL-NO-IC-NEXT:    store i32 [[SCALAR_RECUR]], i32* [[GEP]], align 4
+; UNROLL-NO-IC-NEXT:    [[GEP:%.*]] = getelementptr inbounds i32, ptr [[PTR]], i64 [[IV_1]]
+; UNROLL-NO-IC-NEXT:    store i32 [[SCALAR_RECUR]], ptr [[GEP]], align 4
 ; UNROLL-NO-IC-NEXT:    [[IV_2_NEXT]] = add nsw i32 [[IV_2_CONV]], [[STEP]]
 ; UNROLL-NO-IC-NEXT:    [[IV_1_NEXT]] = add nuw nsw i64 [[IV_1]], 1
 ; UNROLL-NO-IC-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_1_NEXT]], [[N]]
@@ -6717,8 +6603,8 @@ define void @test_optimized_cast_induction_feeding_first_order_recurrence(i64 %n
 ; INTERLEAVE-NEXT:    br i1 [[TMP15]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
 ; INTERLEAVE:       vector.ph:
 ; INTERLEAVE-NEXT:    [[N_VEC:%.*]] = and i64 [[N]], -8
-; INTERLEAVE-NEXT:    [[CAST_VTC:%.*]] = trunc i64 [[N_VEC]] to i32
-; INTERLEAVE-NEXT:    [[IND_END:%.*]] = mul i32 [[CAST_VTC]], [[STEP]]
+; INTERLEAVE-NEXT:    [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32
+; INTERLEAVE-NEXT:    [[IND_END:%.*]] = mul i32 [[DOTCAST]], [[STEP]]
 ; INTERLEAVE-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[STEP]], i64 0
 ; INTERLEAVE-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; INTERLEAVE-NEXT:    [[TMP16:%.*]] = mul <4 x i32> [[DOTSPLAT]], <i32 0, i32 1, i32 2, i32 3>
@@ -6733,16 +6619,14 @@ define void @test_optimized_cast_induction_feeding_first_order_recurrence(i64 %n
 ; INTERLEAVE-NEXT:    [[STEP_ADD]] = add <4 x i32> [[VEC_IND]], [[DOTSPLAT3]]
 ; INTERLEAVE-NEXT:    [[TMP18:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[VEC_IND]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
 ; INTERLEAVE-NEXT:    [[TMP19:%.*]] = shufflevector <4 x i32> [[VEC_IND]], <4 x i32> [[STEP_ADD]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-; INTERLEAVE-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[PTR:%.*]], i64 [[INDEX]]
-; INTERLEAVE-NEXT:    [[TMP21:%.*]] = bitcast i32* [[TMP20]] to <4 x i32>*
-; INTERLEAVE-NEXT:    store <4 x i32> [[TMP18]], <4 x i32>* [[TMP21]], align 4
-; INTERLEAVE-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i64 4
-; INTERLEAVE-NEXT:    [[TMP23:%.*]] = bitcast i32* [[TMP22]] to <4 x i32>*
-; INTERLEAVE-NEXT:    store <4 x i32> [[TMP19]], <4 x i32>* [[TMP23]], align 4
+; INTERLEAVE-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i64 [[INDEX]]
+; INTERLEAVE-NEXT:    store <4 x i32> [[TMP18]], ptr [[TMP20]], align 4
+; INTERLEAVE-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[TMP20]], i64 4
+; INTERLEAVE-NEXT:    store <4 x i32> [[TMP19]], ptr [[TMP21]], align 4
 ; INTERLEAVE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
 ; INTERLEAVE-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[STEP_ADD]], [[DOTSPLAT3]]
-; INTERLEAVE-NEXT:    [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; INTERLEAVE-NEXT:    br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP54:![0-9]+]]
+; INTERLEAVE-NEXT:    [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; INTERLEAVE-NEXT:    br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP54:![0-9]+]]
 ; INTERLEAVE:       middle.block:
 ; INTERLEAVE-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[N]]
 ; INTERLEAVE-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[STEP_ADD]], i64 3
@@ -6758,8 +6642,8 @@ define void @test_optimized_cast_induction_feeding_first_order_recurrence(i64 %n
 ; INTERLEAVE-NEXT:    [[IV_2:%.*]] = phi i32 [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[IV_2_NEXT:%.*]], [[LOOP]] ]
 ; INTERLEAVE-NEXT:    [[IV_2_EXT:%.*]] = shl i32 [[IV_2]], 24
 ; INTERLEAVE-NEXT:    [[IV_2_CONV]] = ashr exact i32 [[IV_2_EXT]], 24
-; INTERLEAVE-NEXT:    [[GEP:%.*]] = getelementptr inbounds i32, i32* [[PTR]], i64 [[IV_1]]
-; INTERLEAVE-NEXT:    store i32 [[SCALAR_RECUR]], i32* [[GEP]], align 4
+; INTERLEAVE-NEXT:    [[GEP:%.*]] = getelementptr inbounds i32, ptr [[PTR]], i64 [[IV_1]]
+; INTERLEAVE-NEXT:    store i32 [[SCALAR_RECUR]], ptr [[GEP]], align 4
 ; INTERLEAVE-NEXT:    [[IV_2_NEXT]] = add nsw i32 [[IV_2_CONV]], [[STEP]]
 ; INTERLEAVE-NEXT:    [[IV_1_NEXT]] = add nuw nsw i64 [[IV_1]], 1
 ; INTERLEAVE-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_1_NEXT]], [[N]]
@@ -6776,8 +6660,8 @@ loop:
   %iv.2 = phi i32 [ 0, %entry ], [ %iv.2.next, %loop ]
   %iv.2.ext = shl i32 %iv.2, 24
   %iv.2.conv = ashr exact i32 %iv.2.ext, 24
-  %gep = getelementptr inbounds i32, i32* %ptr, i64 %iv.1
-  store i32 %for, i32* %gep, align 4
+  %gep = getelementptr inbounds i32, ptr %ptr, i64 %iv.1
+  store i32 %for, ptr %gep, align 4
   %iv.2.next = add nsw i32 %iv.2.conv, %step
   %iv.1.next = add nuw nsw i64 %iv.1, 1
   %exitcond = icmp eq i64 %iv.1.next, %n

diff --git a/llvm/test/Transforms/LoopVectorize/instruction-only-used-outside-of-loop.ll b/llvm/test/Transforms/LoopVectorize/instruction-only-used-outside-of-loop.ll
index 70dece3dff4a..86b0ca2b3650 100644
--- a/llvm/test/Transforms/LoopVectorize/instruction-only-used-outside-of-loop.ll
+++ b/llvm/test/Transforms/LoopVectorize/instruction-only-used-outside-of-loop.ll
@@ -2,7 +2,7 @@
 ; RUN: opt -passes=loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -S %s | FileCheck %s
 
 ; Tests for PR54266.
-define i32 @one_direct_branch(i32* %src) {
+define i32 @one_direct_branch(ptr %src) {
 ; CHECK-LABEL: @one_direct_branch(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
@@ -11,16 +11,15 @@ define i32 @one_direct_branch(i32* %src) {
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i32 [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = xor <4 x i32> <i32 25500, i32 25500, i32 25500, i32 25500>, [[WIDE_LOAD]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i32 [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = xor <4 x i32> <i32 25500, i32 25500, i32 25500, i32 25500>, [[WIDE_LOAD]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000
+; CHECK-NEXT:    br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i32> [[TMP4]], i32 3
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 1000, 1000
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -28,8 +27,8 @@ define i32 @one_direct_branch(i32* %src) {
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
-; CHECK-NEXT:    [[SRC_GEP:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i32 [[IV]]
-; CHECK-NEXT:    [[LV:%.*]] = load i32, i32* [[SRC_GEP]], align 4
+; CHECK-NEXT:    [[SRC_GEP:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[IV]]
+; CHECK-NEXT:    [[LV:%.*]] = load i32, ptr [[SRC_GEP]], align 4
 ; CHECK-NEXT:    [[XOR:%.*]] = xor i32 25500, [[LV]]
 ; CHECK-NEXT:    br label [[LOOP_LATCH]]
 ; CHECK:       loop.latch:
@@ -38,7 +37,7 @@ define i32 @one_direct_branch(i32* %src) {
 ; CHECK-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i32 [[IV_NEXT]], 1000
 ; CHECK-NEXT:    br i1 [[TOBOOL_NOT]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP2:![0-9]+]]
 ; CHECK:       exit:
-; CHECK-NEXT:    [[XOR_LCSSA:%.*]] = phi i32 [ [[PHI_XOR]], [[LOOP_LATCH]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[XOR_LCSSA:%.*]] = phi i32 [ [[PHI_XOR]], [[LOOP_LATCH]] ], [ [[TMP5]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    ret i32 [[XOR_LCSSA]]
 ;
 entry:
@@ -46,8 +45,8 @@ entry:
 
 loop:
   %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ]
-  %src.gep = getelementptr inbounds i32, i32* %src, i32 %iv
-  %lv = load i32, i32* %src.gep
+  %src.gep = getelementptr inbounds i32, ptr %src, i32 %iv
+  %lv = load i32, ptr %src.gep
   %xor = xor i32 25500, %lv
   br label %loop.latch
 
@@ -62,7 +61,7 @@ exit:
   ret i32 %xor.lcssa
 }
 
-define i32 @two_direct_branch(i32* %src) {
+define i32 @two_direct_branch(ptr %src) {
 ; CHECK-LABEL: @two_direct_branch(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
@@ -71,16 +70,15 @@ define i32 @two_direct_branch(i32* %src) {
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i32 [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = xor <4 x i32> <i32 25500, i32 25500, i32 25500, i32 25500>, [[WIDE_LOAD]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i32 [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = xor <4 x i32> <i32 25500, i32 25500, i32 25500, i32 25500>, [[WIDE_LOAD]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000
+; CHECK-NEXT:    br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i32> [[TMP4]], i32 3
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 1000, 1000
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -88,8 +86,8 @@ define i32 @two_direct_branch(i32* %src) {
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
-; CHECK-NEXT:    [[SRC_GEP:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i32 [[IV]]
-; CHECK-NEXT:    [[LV:%.*]] = load i32, i32* [[SRC_GEP]], align 4
+; CHECK-NEXT:    [[SRC_GEP:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[IV]]
+; CHECK-NEXT:    [[LV:%.*]] = load i32, ptr [[SRC_GEP]], align 4
 ; CHECK-NEXT:    [[XOR:%.*]] = xor i32 25500, [[LV]]
 ; CHECK-NEXT:    br label [[BB:%.*]]
 ; CHECK:       bb:
@@ -101,7 +99,7 @@ define i32 @two_direct_branch(i32* %src) {
 ; CHECK-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i32 [[IV_NEXT]], 1000
 ; CHECK-NEXT:    br i1 [[TOBOOL_NOT]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
 ; CHECK:       exit:
-; CHECK-NEXT:    [[XOR_LCSSA:%.*]] = phi i32 [ [[PHI_XOR]], [[LOOP_LATCH]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[XOR_LCSSA:%.*]] = phi i32 [ [[PHI_XOR]], [[LOOP_LATCH]] ], [ [[TMP5]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    ret i32 [[XOR_LCSSA]]
 ;
 entry:
@@ -109,8 +107,8 @@ entry:
 
 loop:                              ; preds = %for.inc3, %entry
   %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ]
-  %src.gep = getelementptr inbounds i32, i32* %src, i32 %iv
-  %lv = load i32, i32* %src.gep
+  %src.gep = getelementptr inbounds i32, ptr %src, i32 %iv
+  %lv = load i32, ptr %src.gep
   %xor = xor i32 25500, %lv
   br label %bb
 
@@ -129,7 +127,7 @@ exit:
   ret i32 %xor.lcssa
 }
 
-define i32 @cond_branch(i32 %a, i32* %src) {
+define i32 @cond_branch(i32 %a, ptr %src) {
 ; CHECK-LABEL: @cond_branch(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
@@ -141,20 +139,19 @@ define i32 @cond_branch(i32 %a, i32* %src) {
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i32 [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = xor <4 x i32> <i32 25500, i32 25500, i32 25500, i32 25500>, [[WIDE_LOAD]]
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp ne <4 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT:    [[TMP6:%.*]] = xor <4 x i1> [[TMP5]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[TMP6]], <4 x i32> <i32 10, i32 10, i32 10, i32 10>, <4 x i32> [[TMP4]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i32 [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = xor <4 x i32> <i32 25500, i32 25500, i32 25500, i32 25500>, [[WIDE_LOAD]]
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp ne <4 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP5:%.*]] = xor <4 x i1> [[TMP4]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[TMP5]], <4 x i32> <i32 10, i32 10, i32 10, i32 10>, <4 x i32> [[TMP3]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000
+; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <4 x i32> [[PREDPHI]], i32 3
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x i32> [[PREDPHI]], i32 3
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 1000, 1000
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -162,8 +159,8 @@ define i32 @cond_branch(i32 %a, i32* %src) {
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
-; CHECK-NEXT:    [[SRC_GEP:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i32 [[IV]]
-; CHECK-NEXT:    [[LV:%.*]] = load i32, i32* [[SRC_GEP]], align 4
+; CHECK-NEXT:    [[SRC_GEP:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[IV]]
+; CHECK-NEXT:    [[LV:%.*]] = load i32, ptr [[SRC_GEP]], align 4
 ; CHECK-NEXT:    [[XOR:%.*]] = xor i32 25500, [[LV]]
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp ne i32 [[IV]], [[A]]
 ; CHECK-NEXT:    br i1 [[CMP]], label [[LOOP_LATCH]], label [[THEN:%.*]]
@@ -175,7 +172,7 @@ define i32 @cond_branch(i32 %a, i32* %src) {
 ; CHECK-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i32 [[IV_NEXT]], 1000
 ; CHECK-NEXT:    br i1 [[TOBOOL_NOT]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
 ; CHECK:       exit:
-; CHECK-NEXT:    [[XOR_LCSSA:%.*]] = phi i32 [ [[PHI_XOR]], [[LOOP_LATCH]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[XOR_LCSSA:%.*]] = phi i32 [ [[PHI_XOR]], [[LOOP_LATCH]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    ret i32 [[XOR_LCSSA]]
 ;
 entry:
@@ -183,8 +180,8 @@ entry:
 
 loop:
   %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ]
-  %src.gep = getelementptr inbounds i32, i32* %src, i32 %iv
-  %lv = load i32, i32* %src.gep
+  %src.gep = getelementptr inbounds i32, ptr %src, i32 %iv
+  %lv = load i32, ptr %src.gep
   %xor = xor i32 25500, %lv
   %cmp = icmp ne i32 %iv, %a
   br i1 %cmp, label %loop.latch, label %then
@@ -211,13 +208,13 @@ define i32 @optimizable_trunc_used_outside() {
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = trunc i64 [[INDEX]] to i32
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = trunc i64 [[OFFSET_IDX]] to i32
 ; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[TMP0]], 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = add i32 [[TMP0]], 1
 ; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[TMP0]], 2
 ; CHECK-NEXT:    [[TMP4:%.*]] = add i32 [[TMP0]], 3
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[OFFSET_IDX]], 4
 ; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
 ; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; CHECK:       middle.block:

diff --git a/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization-2.ll b/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization-2.ll
index 0fc560c5d129..b3569f387b68 100644
--- a/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization-2.ll
+++ b/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization-2.ll
@@ -17,54 +17,53 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
 ; Instcombine'd version of @inv_val_store_to_inv_address_conditional_diff_values.
 ; Now the store is no longer of invariant value.
 ; scalar store the value extracted from the last element of the vector value.
-define void @inv_val_store_to_inv_address_conditional_diff_values_ic(i32* %a, i64 %n, i32* %b, i32 %k) {
+define void @inv_val_store_to_inv_address_conditional_diff_values_ic(ptr %a, i64 %n, ptr %b, i32 %k) {
 ; CHECK-LABEL: @inv_val_store_to_inv_address_conditional_diff_values_ic(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[NTRUNC:%.*]] = trunc i64 [[N:%.*]] to i32
-; CHECK-NEXT:    [[SMAX6:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 1)
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX6]], 4
+; CHECK-NEXT:    [[SMAX2:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 1)
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX2]], 4
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
 ; CHECK:       vector.memcheck:
 ; CHECK-NEXT:    [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 1)
-; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i32, i32* [[B:%.*]], i64 [[SMAX]]
-; CHECK-NEXT:    [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 1
-; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ugt i32* [[SCEVGEP4]], [[B]]
-; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ugt i32* [[SCEVGEP]], [[A]]
+; CHECK-NEXT:    [[TMP0:%.*]] = shl i64 [[SMAX]], 2
+; CHECK-NEXT:    [[UGLYGEP:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[UGLYGEP1:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 4
+; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ugt ptr [[UGLYGEP1]], [[B]]
+; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ugt ptr [[UGLYGEP]], [[A]]
 ; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
 ; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[SMAX6]], 9223372036854775804
+; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[SMAX2]], 9223372036854775804
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[K:%.*]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <4 x i32> poison, i32 [[NTRUNC]], i64 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT8:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT7]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <4 x i32> poison, i32 [[NTRUNC]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT4:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT3]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 8, !alias.scope !0, !noalias !3
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 8, !alias.scope !0, !noalias !3
 ; CHECK-NEXT:    [[DOTNOT:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
-; CHECK-NEXT:    store <4 x i32> [[BROADCAST_SPLAT8]], <4 x i32>* [[TMP2]], align 4, !alias.scope !0, !noalias !3
-; CHECK-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[DOTNOT]], <4 x i32> [[BROADCAST_SPLAT8]], <4 x i32> [[BROADCAST_SPLAT]]
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x i32> [[PREDPHI]], i64 3
-; CHECK-NEXT:    store i32 [[TMP3]], i32* [[A]], align 4, !alias.scope !3
+; CHECK-NEXT:    store <4 x i32> [[BROADCAST_SPLAT4]], ptr [[TMP1]], align 4, !alias.scope !0, !noalias !3
+; CHECK-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[DOTNOT]], <4 x i32> [[BROADCAST_SPLAT4]], <4 x i32> [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32> [[PREDPHI]], i64 3
+; CHECK-NEXT:    store i32 [[TMP2]], ptr [[A]], align 4, !alias.scope !3
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[SMAX6]], [[N_VEC]]
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[SMAX2]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[LATCH:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[I1:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[I]]
-; CHECK-NEXT:    [[I2:%.*]] = load i32, i32* [[I1]], align 8
+; CHECK-NEXT:    [[I1:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[I]]
+; CHECK-NEXT:    [[I2:%.*]] = load i32, ptr [[I1]], align 8
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[I2]], [[K]]
-; CHECK-NEXT:    store i32 [[NTRUNC]], i32* [[I1]], align 4
+; CHECK-NEXT:    store i32 [[NTRUNC]], ptr [[I1]], align 4
 ; CHECK-NEXT:    br i1 [[CMP]], label [[COND_STORE:%.*]], label [[COND_STORE_K:%.*]]
 ; CHECK:       cond_store:
 ; CHECK-NEXT:    br label [[LATCH]]
@@ -72,7 +71,7 @@ define void @inv_val_store_to_inv_address_conditional_diff_values_ic(i32* %a, i6
 ; CHECK-NEXT:    br label [[LATCH]]
 ; CHECK:       latch:
 ; CHECK-NEXT:    [[STOREVAL:%.*]] = phi i32 [ [[NTRUNC]], [[COND_STORE]] ], [ [[K]], [[COND_STORE_K]] ]
-; CHECK-NEXT:    store i32 [[STOREVAL]], i32* [[A]], align 4
+; CHECK-NEXT:    store i32 [[STOREVAL]], ptr [[A]], align 4
 ; CHECK-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 1
 ; CHECK-NEXT:    [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
 ; CHECK-NEXT:    br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT:%.*]], !llvm.loop [[LOOP7:![0-9]+]]
@@ -87,10 +86,10 @@ entry:
 
 for.body:                                         ; preds = %for.body, %entry
   %i = phi i64 [ %i.next, %latch ], [ 0, %entry ]
-  %i1 = getelementptr inbounds i32, i32* %b, i64 %i
-  %i2 = load i32, i32* %i1, align 8
+  %i1 = getelementptr inbounds i32, ptr %b, i64 %i
+  %i2 = load i32, ptr %i1, align 8
   %cmp = icmp eq i32 %i2, %k
-  store i32 %ntrunc, i32* %i1
+  store i32 %ntrunc, ptr %i1
   br i1 %cmp, label %cond_store, label %cond_store_k
 
 cond_store:
@@ -101,7 +100,7 @@ cond_store_k:
 
 latch:
   %storeval = phi i32 [ %ntrunc, %cond_store ], [ %k, %cond_store_k ]
-  store i32 %storeval, i32* %a
+  store i32 %storeval, ptr %a
   %i.next = add nuw nsw i64 %i, 1
   %cond = icmp slt i64 %i.next, %n
   br i1 %cond, label %for.body, label %for.end
@@ -118,50 +117,50 @@ for.end:                                          ; preds = %for.body
 ; into the uniform address within the loop.
 ; Since the condition and the phi is loop invariant, they are LICM'ed after
 ; vectorization.
-define void @inv_val_store_to_inv_address_conditional_inv(i32* %a, i64 %n, i32* %b, i32 %k) {
+define void @inv_val_store_to_inv_address_conditional_inv(ptr %a, i64 %n, ptr %b, i32 %k) {
 ; CHECK-LABEL: @inv_val_store_to_inv_address_conditional_inv(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[NTRUNC:%.*]] = trunc i64 [[N:%.*]] to i32
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[NTRUNC]], [[K:%.*]]
-; CHECK-NEXT:    [[SMAX6:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 1)
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX6]], 4
+; CHECK-NEXT:    [[SMAX2:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 1)
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX2]], 4
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
 ; CHECK:       vector.memcheck:
 ; CHECK-NEXT:    [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 1)
-; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i32, i32* [[B:%.*]], i64 [[SMAX]]
-; CHECK-NEXT:    [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 1
-; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ugt i32* [[SCEVGEP4]], [[B]]
-; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ugt i32* [[SCEVGEP]], [[A]]
+; CHECK-NEXT:    [[TMP0:%.*]] = shl i64 [[SMAX]], 2
+; CHECK-NEXT:    [[UGLYGEP:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[UGLYGEP1:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 4
+; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ugt ptr [[UGLYGEP1]], [[B]]
+; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ugt ptr [[UGLYGEP]], [[A]]
 ; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
 ; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[SMAX6]], 9223372036854775804
+; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[SMAX2]], 9223372036854775804
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[NTRUNC]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x i1> undef, i1 [[CMP]], i64 3
-; CHECK-NEXT:    [[BROADCAST_SPLAT10:%.*]] = insertelement <4 x i32> poison, i32 [[K]], i64 3
-; CHECK-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[TMP0]], <4 x i32> [[BROADCAST_SPLAT]], <4 x i32> [[BROADCAST_SPLAT10]]
-; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i32> [[PREDPHI]], i64 3
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i1> undef, i1 [[CMP]], i64 3
+; CHECK-NEXT:    [[BROADCAST_SPLAT6:%.*]] = insertelement <4 x i32> poison, i32 [[K]], i64 3
+; CHECK-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[BROADCAST_SPLAT]], <4 x i32> [[BROADCAST_SPLAT6]]
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32> [[PREDPHI]], i64 3
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
-; CHECK-NEXT:    store <4 x i32> [[BROADCAST_SPLAT]], <4 x i32>* [[TMP3]], align 4, !alias.scope !8, !noalias !11
-; CHECK-NEXT:    store i32 [[TMP1]], i32* [[A]], align 4, !alias.scope !11
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]]
+; CHECK-NEXT:    store <4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP3]], align 4, !alias.scope !8, !noalias !11
+; CHECK-NEXT:    store i32 [[TMP2]], ptr [[A]], align 4, !alias.scope !11
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[SMAX6]], [[N_VEC]]
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[SMAX2]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[LATCH:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[I1:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[I]]
-; CHECK-NEXT:    store i32 [[NTRUNC]], i32* [[I1]], align 4
+; CHECK-NEXT:    [[I1:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[I]]
+; CHECK-NEXT:    store i32 [[NTRUNC]], ptr [[I1]], align 4
 ; CHECK-NEXT:    br i1 [[CMP]], label [[COND_STORE:%.*]], label [[COND_STORE_K:%.*]]
 ; CHECK:       cond_store:
 ; CHECK-NEXT:    br label [[LATCH]]
@@ -169,7 +168,7 @@ define void @inv_val_store_to_inv_address_conditional_inv(i32* %a, i64 %n, i32*
 ; CHECK-NEXT:    br label [[LATCH]]
 ; CHECK:       latch:
 ; CHECK-NEXT:    [[STOREVAL:%.*]] = phi i32 [ [[NTRUNC]], [[COND_STORE]] ], [ [[K]], [[COND_STORE_K]] ]
-; CHECK-NEXT:    store i32 [[STOREVAL]], i32* [[A]], align 4
+; CHECK-NEXT:    store i32 [[STOREVAL]], ptr [[A]], align 4
 ; CHECK-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 1
 ; CHECK-NEXT:    [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
 ; CHECK-NEXT:    br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT:%.*]], !llvm.loop [[LOOP14:![0-9]+]]
@@ -185,9 +184,9 @@ entry:
 
 for.body:                                         ; preds = %for.body, %entry
   %i = phi i64 [ %i.next, %latch ], [ 0, %entry ]
-  %i1 = getelementptr inbounds i32, i32* %b, i64 %i
-  %i2 = load i32, i32* %i1, align 8
-  store i32 %ntrunc, i32* %i1
+  %i1 = getelementptr inbounds i32, ptr %b, i64 %i
+  %i2 = load i32, ptr %i1, align 8
+  store i32 %ntrunc, ptr %i1
   br i1 %cmp, label %cond_store, label %cond_store_k
 
 cond_store:
@@ -198,7 +197,7 @@ cond_store_k:
 
 latch:
   %storeval = phi i32 [ %ntrunc, %cond_store ], [ %k, %cond_store_k ]
-  store i32 %storeval, i32* %a
+  store i32 %storeval, ptr %a
   %i.next = add nuw nsw i64 %i, 1
   %cond = icmp slt i64 %i.next, %n
   br i1 %cond, label %for.body, label %for.end
@@ -210,31 +209,31 @@ for.end:                                          ; preds = %for.body
 ; variant value stored to uniform address tests that the code gen extracts the
 ; last element from the variant vector and scalar stores it into the uniform
 ; address.
-define i32 @variant_val_store_to_inv_address(i32* %a, i64 %n, i32* %b, i32 %k) {
+define i32 @variant_val_store_to_inv_address(ptr %a, i64 %n, ptr %b, i32 %k) {
 ; CHECK-LABEL: @variant_val_store_to_inv_address(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[SMAX6:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1)
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX6]], 4
+; CHECK-NEXT:    [[SMAX2:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1)
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX2]], 4
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
 ; CHECK:       vector.memcheck:
-; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 1
+; CHECK-NEXT:    [[UGLYGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 4
 ; CHECK-NEXT:    [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 1)
-; CHECK-NEXT:    [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[B:%.*]], i64 [[SMAX]]
-; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ugt i32* [[SCEVGEP4]], [[A]]
-; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ugt i32* [[SCEVGEP]], [[B]]
+; CHECK-NEXT:    [[TMP0:%.*]] = shl i64 [[SMAX]], 2
+; CHECK-NEXT:    [[UGLYGEP1:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ugt ptr [[UGLYGEP1]], [[A]]
+; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ugt ptr [[UGLYGEP]], [[B]]
 ; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
 ; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[SMAX6]], 9223372036854775804
+; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[SMAX2]], 9223372036854775804
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 8, !alias.scope !15
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 8, !alias.scope !15
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i64 3
-; CHECK-NEXT:    store i32 [[TMP2]], i32* [[A]], align 4, !alias.scope !18, !noalias !15
+; CHECK-NEXT:    store i32 [[TMP2]], ptr [[A]], align 4, !alias.scope !18, !noalias !15
 ; CHECK-NEXT:    [[TMP3]] = add <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -242,7 +241,7 @@ define i32 @variant_val_store_to_inv_address(i32* %a, i64 %n, i32* %b, i32 %k) {
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[DOTLCSSA:%.*]] = phi <4 x i32> [ [[TMP3]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[DOTLCSSA]])
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[SMAX6]], [[N_VEC]]
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[SMAX2]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
@@ -251,9 +250,9 @@ define i32 @variant_val_store_to_inv_address(i32* %a, i64 %n, i32* %b, i32 %k) {
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; CHECK-NEXT:    [[I0:%.*]] = phi i32 [ [[I3:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[I1:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[I]]
-; CHECK-NEXT:    [[I2:%.*]] = load i32, i32* [[I1]], align 8
-; CHECK-NEXT:    store i32 [[I2]], i32* [[A]], align 4
+; CHECK-NEXT:    [[I1:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[I]]
+; CHECK-NEXT:    [[I2:%.*]] = load i32, ptr [[I1]], align 8
+; CHECK-NEXT:    store i32 [[I2]], ptr [[A]], align 4
 ; CHECK-NEXT:    [[I3]] = add i32 [[I0]], [[I2]]
 ; CHECK-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 1
 ; CHECK-NEXT:    [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
@@ -273,9 +272,9 @@ entry:
 for.body:                                         ; preds = %for.body, %entry
   %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
   %i0 = phi i32 [ %i3, %for.body ], [ 0, %entry ]
-  %i1 = getelementptr inbounds i32, i32* %b, i64 %i
-  %i2 = load i32, i32* %i1, align 8
-  store i32 %i2, i32* %a
+  %i1 = getelementptr inbounds i32, ptr %b, i64 %i
+  %i2 = load i32, ptr %i1, align 8
+  store i32 %i2, ptr %a
   %i3 = add i32 %i0, %i2
   %i.next = add nuw nsw i64 %i, 1
   %cond = icmp slt i64 %i.next, %n
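
The vector.memcheck hunks above show the other systematic difference: the
runtime alias bound that used to be computed with an element-typed GEP
(getelementptr i32, i32* %b, i64 %smax) is now emitted as a byte-based i8
GEP, hence the UGLYGEP names, with the element count scaled by the store
size first. A sketch of the new form under that assumption, with an
invented function name:

    define ptr @upper_bound(ptr %b, i64 %smax) {
      ; scale the i32 element count into bytes: %smax * 4
      %bytes = shl i64 %smax, 2
      ; the i8 GEP reaches the same one-past-the-end address as the old
      ; typed i32 GEP did
      %end = getelementptr i8, ptr %b, i64 %bytes
      ret ptr %end
    }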

diff --git a/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization.ll b/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization.ll
index f8e1931b84e2..0d82bea4c1fa 100644
--- a/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization.ll
+++ b/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization.ll
@@ -13,42 +13,42 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
 ; address.
 
 
-; memory check is found.conflict = b[max(n-1,1)] > a && (i8* a)+1 > (i8* b)
+; memory check is found.conflict = b[max(n-1,1)] > a && (ptr a)+1 > (ptr b)
 
 
-define i32 @inv_val_store_to_inv_address_with_reduction(i32* %a, i64 %n, i32* %b) {
+define i32 @inv_val_store_to_inv_address_with_reduction(ptr %a, i64 %n, ptr %b) {
 ; CHECK-LABEL: @inv_val_store_to_inv_address_with_reduction(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[NTRUNC:%.*]] = trunc i64 [[N:%.*]] to i32
-; CHECK-NEXT:    [[SMAX6:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 1)
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX6]], 4
+; CHECK-NEXT:    [[SMAX2:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 1)
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX2]], 4
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
 ; CHECK:       vector.memcheck:
-; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 1
+; CHECK-NEXT:    [[UGLYGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 4
 ; CHECK-NEXT:    [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 1)
-; CHECK-NEXT:    [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[B:%.*]], i64 [[SMAX]]
-; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ugt i32* [[SCEVGEP4]], [[A]]
-; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ugt i32* [[SCEVGEP]], [[B]]
+; CHECK-NEXT:    [[TMP0:%.*]] = shl i64 [[SMAX]], 2
+; CHECK-NEXT:    [[UGLYGEP1:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ugt ptr [[UGLYGEP1]], [[A]]
+; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ugt ptr [[UGLYGEP]], [[B]]
 ; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
 ; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[SMAX6]], 9223372036854775804
+; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[SMAX2]], 9223372036854775804
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 8, !alias.scope !0
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 8, !alias.scope !0
 ; CHECK-NEXT:    [[TMP2]] = add <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]]
-; CHECK-NEXT:    store i32 [[NTRUNC]], i32* [[A]], align 4, !alias.scope !3, !noalias !0
+; CHECK-NEXT:    store i32 [[NTRUNC]], ptr [[A]], align 4, !alias.scope !3, !noalias !0
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[DOTLCSSA:%.*]] = phi <4 x i32> [ [[TMP2]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[DOTLCSSA]])
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[SMAX6]], [[N_VEC]]
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[SMAX2]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
@@ -57,10 +57,10 @@ define i32 @inv_val_store_to_inv_address_with_reduction(i32* %a, i64 %n, i32* %b
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; CHECK-NEXT:    [[I0:%.*]] = phi i32 [ [[I3:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[I1:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[I]]
-; CHECK-NEXT:    [[I2:%.*]] = load i32, i32* [[I1]], align 8
+; CHECK-NEXT:    [[I1:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[I]]
+; CHECK-NEXT:    [[I2:%.*]] = load i32, ptr [[I1]], align 8
 ; CHECK-NEXT:    [[I3]] = add i32 [[I0]], [[I2]]
-; CHECK-NEXT:    store i32 [[NTRUNC]], i32* [[A]], align 4
+; CHECK-NEXT:    store i32 [[NTRUNC]], ptr [[A]], align 4
 ; CHECK-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 1
 ; CHECK-NEXT:    [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
 ; CHECK-NEXT:    br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT:%.*]], !llvm.loop [[LOOP7:![0-9]+]]
@@ -78,10 +78,10 @@ entry:
 for.body:                                         ; preds = %for.body, %entry
   %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
   %i0 = phi i32 [ %i3, %for.body ], [ 0, %entry ]
-  %i1 = getelementptr inbounds i32, i32* %b, i64 %i
-  %i2 = load i32, i32* %i1, align 8
+  %i1 = getelementptr inbounds i32, ptr %b, i64 %i
+  %i2 = load i32, ptr %i1, align 8
   %i3 = add i32 %i0, %i2
-  store i32 %ntrunc, i32* %a
+  store i32 %ntrunc, ptr %a
   %i.next = add nuw nsw i64 %i, 1
   %cond = icmp slt i64 %i.next, %n
   br i1 %cond, label %for.body, label %for.end
@@ -91,46 +91,46 @@ for.end:                                          ; preds = %for.body
   ret i32 %i4
 }
 
-define void @inv_val_store_to_inv_address(i32* %a, i64 %n, i32* %b) {
+define void @inv_val_store_to_inv_address(ptr %a, i64 %n, ptr %b) {
 ; CHECK-LABEL: @inv_val_store_to_inv_address(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[NTRUNC:%.*]] = trunc i64 [[N:%.*]] to i32
-; CHECK-NEXT:    [[SMAX6:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 1)
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX6]], 4
+; CHECK-NEXT:    [[SMAX2:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 1)
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX2]], 4
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
 ; CHECK:       vector.memcheck:
-; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 1
+; CHECK-NEXT:    [[UGLYGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 4
 ; CHECK-NEXT:    [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 1)
-; CHECK-NEXT:    [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[B:%.*]], i64 [[SMAX]]
-; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ugt i32* [[SCEVGEP4]], [[A]]
-; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ugt i32* [[SCEVGEP]], [[B]]
+; CHECK-NEXT:    [[TMP0:%.*]] = shl i64 [[SMAX]], 2
+; CHECK-NEXT:    [[UGLYGEP1:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ugt ptr [[UGLYGEP1]], [[A]]
+; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ugt ptr [[UGLYGEP]], [[B]]
 ; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
 ; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[SMAX6]], 9223372036854775804
+; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[SMAX2]], 9223372036854775804
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[NTRUNC]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]]
-; CHECK-NEXT:    store i32 [[NTRUNC]], i32* [[A]], align 4, !alias.scope !8, !noalias !11
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
-; CHECK-NEXT:    store <4 x i32> [[BROADCAST_SPLAT]], <4 x i32>* [[TMP1]], align 4, !alias.scope !11
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]]
+; CHECK-NEXT:    store i32 [[NTRUNC]], ptr [[A]], align 4, !alias.scope !8, !noalias !11
+; CHECK-NEXT:    store <4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP1]], align 4, !alias.scope !11
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[SMAX6]], [[N_VEC]]
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[SMAX2]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[I1:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[I]]
-; CHECK-NEXT:    store i32 [[NTRUNC]], i32* [[A]], align 4
-; CHECK-NEXT:    store i32 [[NTRUNC]], i32* [[I1]], align 4
+; CHECK-NEXT:    [[I1:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[I]]
+; CHECK-NEXT:    store i32 [[NTRUNC]], ptr [[A]], align 4
+; CHECK-NEXT:    store i32 [[NTRUNC]], ptr [[I1]], align 4
 ; CHECK-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 1
 ; CHECK-NEXT:    [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
 ; CHECK-NEXT:    br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT:%.*]], !llvm.loop [[LOOP14:![0-9]+]]
@@ -145,10 +145,10 @@ entry:
 
 for.body:                                         ; preds = %for.body, %entry
   %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
-  %i1 = getelementptr inbounds i32, i32* %b, i64 %i
-  %i2 = load i32, i32* %i1, align 8
-  store i32 %ntrunc, i32* %a
-  store i32 %ntrunc, i32* %i1
+  %i1 = getelementptr inbounds i32, ptr %b, i64 %i
+  %i2 = load i32, ptr %i1, align 8
+  store i32 %ntrunc, ptr %a
+  store i32 %ntrunc, ptr %i1
   %i.next = add nuw nsw i64 %i, 1
   %cond = icmp slt i64 %i.next, %n
   br i1 %cond, label %for.body, label %for.end
@@ -167,78 +167,77 @@ for.end:                                          ; preds = %for.body
 
 
 
-define void @inv_val_store_to_inv_address_conditional(i32* %a, i64 %n, i32* %b, i32 %k) {
+define void @inv_val_store_to_inv_address_conditional(ptr %a, i64 %n, ptr %b, i32 %k) {
 ; CHECK-LABEL: @inv_val_store_to_inv_address_conditional(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[NTRUNC:%.*]] = trunc i64 [[N:%.*]] to i32
-; CHECK-NEXT:    [[SMAX6:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 1)
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX6]], 4
+; CHECK-NEXT:    [[SMAX2:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 1)
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX2]], 4
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
 ; CHECK:       vector.memcheck:
 ; CHECK-NEXT:    [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 1)
-; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i32, i32* [[B:%.*]], i64 [[SMAX]]
-; CHECK-NEXT:    [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 1
-; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ugt i32* [[SCEVGEP4]], [[B]]
-; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ugt i32* [[SCEVGEP]], [[A]]
+; CHECK-NEXT:    [[TMP0:%.*]] = shl i64 [[SMAX]], 2
+; CHECK-NEXT:    [[UGLYGEP:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[UGLYGEP1:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 4
+; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ugt ptr [[UGLYGEP1]], [[B]]
+; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ugt ptr [[UGLYGEP]], [[A]]
 ; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
 ; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[SMAX6]], 9223372036854775804
+; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[SMAX2]], 9223372036854775804
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[K:%.*]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <4 x i32> poison, i32 [[NTRUNC]], i64 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT8:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT7]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <4 x i32> poison, i32 [[NTRUNC]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT4:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT3]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE14:%.*]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 8, !alias.scope !15, !noalias !18
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE10:%.*]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 8, !alias.scope !15, !noalias !18
 ; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
-; CHECK-NEXT:    store <4 x i32> [[BROADCAST_SPLAT8]], <4 x i32>* [[TMP3]], align 4, !alias.scope !15, !noalias !18
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x i1> [[TMP2]], i64 0
-; CHECK-NEXT:    br i1 [[TMP4]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; CHECK-NEXT:    store <4 x i32> [[BROADCAST_SPLAT4]], ptr [[TMP1]], align 4, !alias.scope !15, !noalias !18
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x i1> [[TMP2]], i64 0
+; CHECK-NEXT:    br i1 [[TMP3]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
 ; CHECK:       pred.store.if:
-; CHECK-NEXT:    store i32 [[NTRUNC]], i32* [[A]], align 4, !alias.scope !18
+; CHECK-NEXT:    store i32 [[NTRUNC]], ptr [[A]], align 4, !alias.scope !18
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE]]
 ; CHECK:       pred.store.continue:
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i1> [[TMP2]], i64 1
-; CHECK-NEXT:    br i1 [[TMP5]], label [[PRED_STORE_IF9:%.*]], label [[PRED_STORE_CONTINUE10:%.*]]
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x i1> [[TMP2]], i64 1
+; CHECK-NEXT:    br i1 [[TMP4]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6:%.*]]
+; CHECK:       pred.store.if5:
+; CHECK-NEXT:    store i32 [[NTRUNC]], ptr [[A]], align 4, !alias.scope !18
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE6]]
+; CHECK:       pred.store.continue6:
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i1> [[TMP2]], i64 2
+; CHECK-NEXT:    br i1 [[TMP5]], label [[PRED_STORE_IF7:%.*]], label [[PRED_STORE_CONTINUE8:%.*]]
+; CHECK:       pred.store.if7:
+; CHECK-NEXT:    store i32 [[NTRUNC]], ptr [[A]], align 4, !alias.scope !18
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE8]]
+; CHECK:       pred.store.continue8:
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i1> [[TMP2]], i64 3
+; CHECK-NEXT:    br i1 [[TMP6]], label [[PRED_STORE_IF9:%.*]], label [[PRED_STORE_CONTINUE10]]
 ; CHECK:       pred.store.if9:
-; CHECK-NEXT:    store i32 [[NTRUNC]], i32* [[A]], align 4, !alias.scope !18
+; CHECK-NEXT:    store i32 [[NTRUNC]], ptr [[A]], align 4, !alias.scope !18
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE10]]
 ; CHECK:       pred.store.continue10:
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i1> [[TMP2]], i64 2
-; CHECK-NEXT:    br i1 [[TMP6]], label [[PRED_STORE_IF11:%.*]], label [[PRED_STORE_CONTINUE12:%.*]]
-; CHECK:       pred.store.if11:
-; CHECK-NEXT:    store i32 [[NTRUNC]], i32* [[A]], align 4, !alias.scope !18
-; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE12]]
-; CHECK:       pred.store.continue12:
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x i1> [[TMP2]], i64 3
-; CHECK-NEXT:    br i1 [[TMP7]], label [[PRED_STORE_IF13:%.*]], label [[PRED_STORE_CONTINUE14]]
-; CHECK:       pred.store.if13:
-; CHECK-NEXT:    store i32 [[NTRUNC]], i32* [[A]], align 4, !alias.scope !18
-; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE14]]
-; CHECK:       pred.store.continue14:
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[SMAX6]], [[N_VEC]]
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[SMAX2]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[LATCH:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[I1:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[I]]
-; CHECK-NEXT:    [[I2:%.*]] = load i32, i32* [[I1]], align 8
+; CHECK-NEXT:    [[I1:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[I]]
+; CHECK-NEXT:    [[I2:%.*]] = load i32, ptr [[I1]], align 8
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[I2]], [[K]]
-; CHECK-NEXT:    store i32 [[NTRUNC]], i32* [[I1]], align 4
+; CHECK-NEXT:    store i32 [[NTRUNC]], ptr [[I1]], align 4
 ; CHECK-NEXT:    br i1 [[CMP]], label [[COND_STORE:%.*]], label [[LATCH]]
 ; CHECK:       cond_store:
-; CHECK-NEXT:    store i32 [[NTRUNC]], i32* [[A]], align 4
+; CHECK-NEXT:    store i32 [[NTRUNC]], ptr [[A]], align 4
 ; CHECK-NEXT:    br label [[LATCH]]
 ; CHECK:       latch:
 ; CHECK-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 1
@@ -255,14 +254,14 @@ entry:
 
 for.body:                                         ; preds = %for.body, %entry
   %i = phi i64 [ %i.next, %latch ], [ 0, %entry ]
-  %i1 = getelementptr inbounds i32, i32* %b, i64 %i
-  %i2 = load i32, i32* %i1, align 8
+  %i1 = getelementptr inbounds i32, ptr %b, i64 %i
+  %i2 = load i32, ptr %i1, align 8
   %cmp = icmp eq i32 %i2, %k
-  store i32 %ntrunc, i32* %i1
+  store i32 %ntrunc, ptr %i1
   br i1 %cmp, label %cond_store, label %latch
 
 cond_store:
-  store i32 %ntrunc, i32* %a
+  store i32 %ntrunc, ptr %a
   br label %latch
 
 latch:
@@ -279,17 +278,17 @@ for.end:                                          ; preds = %for.body
 ; else a = k;
 ; TODO: We could vectorize this once we support multiple uniform stores to the
 ; same address.
-define void @inv_val_store_to_inv_address_conditional_diff_values(i32* %a, i64 %n, i32* %b, i32 %k) {
+define void @inv_val_store_to_inv_address_conditional_diff_values(ptr %a, i64 %n, ptr %b, i32 %k) {
 ; CHECK-LABEL: @inv_val_store_to_inv_address_conditional_diff_values(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[NTRUNC:%.*]] = trunc i64 [[N:%.*]] to i32
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[LATCH:%.*]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[I1:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[I]]
-; CHECK-NEXT:    [[I2:%.*]] = load i32, i32* [[I1]], align 8
+; CHECK-NEXT:    [[I1:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[I]]
+; CHECK-NEXT:    [[I2:%.*]] = load i32, ptr [[I1]], align 8
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[I2]], [[K:%.*]]
-; CHECK-NEXT:    store i32 [[NTRUNC]], i32* [[I1]], align 4
+; CHECK-NEXT:    store i32 [[NTRUNC]], ptr [[I1]], align 4
 ; CHECK-NEXT:    br i1 [[CMP]], label [[COND_STORE:%.*]], label [[COND_STORE_K:%.*]]
 ; CHECK:       cond_store:
 ; CHECK-NEXT:    br label [[LATCH]]
@@ -297,7 +296,7 @@ define void @inv_val_store_to_inv_address_conditional_diff_values(i32* %a, i64 %
 ; CHECK-NEXT:    br label [[LATCH]]
 ; CHECK:       latch:
 ; CHECK-NEXT:    [[STOREMERGE:%.*]] = phi i32 [ [[K]], [[COND_STORE_K]] ], [ [[NTRUNC]], [[COND_STORE]] ]
-; CHECK-NEXT:    store i32 [[STOREMERGE]], i32* [[A:%.*]], align 4
+; CHECK-NEXT:    store i32 [[STOREMERGE]], ptr [[A:%.*]], align 4
 ; CHECK-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 1
 ; CHECK-NEXT:    [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
 ; CHECK-NEXT:    br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END:%.*]]
@@ -310,18 +309,18 @@ entry:
 
 for.body:                                         ; preds = %for.body, %entry
   %i = phi i64 [ %i.next, %latch ], [ 0, %entry ]
-  %i1 = getelementptr inbounds i32, i32* %b, i64 %i
-  %i2 = load i32, i32* %i1, align 8
+  %i1 = getelementptr inbounds i32, ptr %b, i64 %i
+  %i2 = load i32, ptr %i1, align 8
   %cmp = icmp eq i32 %i2, %k
-  store i32 %ntrunc, i32* %i1
+  store i32 %ntrunc, ptr %i1
   br i1 %cmp, label %cond_store, label %cond_store_k
 
 cond_store:
-  store i32 %ntrunc, i32* %a
+  store i32 %ntrunc, ptr %a
   br label %latch
 
 cond_store_k:
-  store i32 %k, i32 * %a
+  store i32 %k, ptr %a
   br label %latch
 
 latch:
@@ -342,7 +341,7 @@ for.end:                                          ; preds = %for.body
 ;    }
 ;  }
 
-define i32 @multiple_uniform_stores(i32* nocapture %var1, i32* nocapture readonly %var2, i32 %itr) #0 {
+define i32 @multiple_uniform_stores(ptr nocapture %var1, ptr nocapture readonly %var2, i32 %itr) #0 {
 ; CHECK-LABEL: @multiple_uniform_stores(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[CMP20:%.*]] = icmp eq i32 [[ITR:%.*]], 0
@@ -355,17 +354,17 @@ define i32 @multiple_uniform_stores(i32* nocapture %var1, i32* nocapture readonl
 ; CHECK-NEXT:    [[CMP218:%.*]] = icmp ult i32 [[J_022]], [[ITR]]
 ; CHECK-NEXT:    br i1 [[CMP218]], label [[FOR_BODY3_LR_PH:%.*]], label [[FOR_INC8]]
 ; CHECK:       for.body3.lr.ph:
-; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, i32* [[VAR1:%.*]], i64 [[INDVARS_IV23]]
+; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[VAR1:%.*]], i64 [[INDVARS_IV23]]
 ; CHECK-NEXT:    [[TMP0:%.*]] = zext i32 [[J_022]] to i64
 ; CHECK-NEXT:    br label [[FOR_BODY3:%.*]]
 ; CHECK:       for.body3:
 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[TMP0]], [[FOR_BODY3_LR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY3]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[VAR2:%.*]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX5]], align 4
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[VAR2:%.*]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
 ; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP2]], [[TMP1]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = add nsw i32 [[ADD]], 1
-; CHECK-NEXT:    store i32 [[TMP3]], i32* [[ARRAYIDX5]], align 4
+; CHECK-NEXT:    store i32 [[TMP3]], ptr [[ARRAYIDX5]], align 4
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT:    [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[ITR]]
@@ -394,20 +393,20 @@ for.cond1.preheader:                              ; preds = %entry, %for.inc8
   br i1 %cmp218, label %for.body3.lr.ph, label %for.inc8
 
 for.body3.lr.ph:                                  ; preds = %for.cond1.preheader
-  %arrayidx5 = getelementptr inbounds i32, i32* %var1, i64 %indvars.iv23
+  %arrayidx5 = getelementptr inbounds i32, ptr %var1, i64 %indvars.iv23
   %0 = zext i32 %j.022 to i64
   br label %for.body3
 
 for.body3:                                        ; preds = %for.body3, %for.body3.lr.ph
   %indvars.iv = phi i64 [ %0, %for.body3.lr.ph ], [ %indvars.iv.next, %for.body3 ]
-  %arrayidx = getelementptr inbounds i32, i32* %var2, i64 %indvars.iv
-  %1 = load i32, i32* %arrayidx, align 4
-  %2 = load i32, i32* %arrayidx5, align 4
+  %arrayidx = getelementptr inbounds i32, ptr %var2, i64 %indvars.iv
+  %1 = load i32, ptr %arrayidx, align 4
+  %2 = load i32, ptr %arrayidx5, align 4
   %add = add nsw i32 %2, %1
-  store i32 %add, i32* %arrayidx5, align 4
-  %3 = load i32, i32* %arrayidx5, align 4
+  store i32 %add, ptr %arrayidx5, align 4
+  %3 = load i32, ptr %arrayidx5, align 4
   %4 = add nsw i32 %3, 1
-  store i32 %4, i32* %arrayidx5, align 4
+  store i32 %4, ptr %arrayidx5, align 4
   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
   %lftr.wideiv = trunc i64 %indvars.iv.next to i32
   %exitcond = icmp eq i32 %lftr.wideiv, %itr
@@ -426,7 +425,7 @@ for.end10:                                        ; preds = %for.inc8, %entry
 
 ; second uniform store to the same address is conditional.
 ; we do not vectorize this.
-define i32 @multiple_uniform_stores_conditional(i32* nocapture %var1, i32* nocapture readonly %var2, i32 %itr) #0 {
+define i32 @multiple_uniform_stores_conditional(ptr nocapture %var1, ptr nocapture readonly %var2, i32 %itr) #0 {
 ; CHECK-LABEL: @multiple_uniform_stores_conditional(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[CMP20:%.*]] = icmp eq i32 [[ITR:%.*]], 0
@@ -439,14 +438,14 @@ define i32 @multiple_uniform_stores_conditional(i32* nocapture %var1, i32* nocap
 ; CHECK-NEXT:    [[CMP218:%.*]] = icmp ult i32 [[J_022]], [[ITR]]
 ; CHECK-NEXT:    br i1 [[CMP218]], label [[FOR_BODY3_LR_PH:%.*]], label [[FOR_INC8]]
 ; CHECK:       for.body3.lr.ph:
-; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, i32* [[VAR1:%.*]], i64 [[INDVARS_IV23]]
+; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[VAR1:%.*]], i64 [[INDVARS_IV23]]
 ; CHECK-NEXT:    [[TMP0:%.*]] = zext i32 [[J_022]] to i64
 ; CHECK-NEXT:    br label [[FOR_BODY3:%.*]]
 ; CHECK:       for.body3:
 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[TMP0]], [[FOR_BODY3_LR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[LATCH:%.*]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[VAR2:%.*]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX5]], align 4
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[VAR2:%.*]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
 ; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP2]], [[TMP1]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = icmp ugt i32 [[ADD]], 42
 ; CHECK-NEXT:    br i1 [[TMP3]], label [[COND_STORE:%.*]], label [[LATCH]]
@@ -455,7 +454,7 @@ define i32 @multiple_uniform_stores_conditional(i32* nocapture %var1, i32* nocap
 ; CHECK-NEXT:    br label [[LATCH]]
 ; CHECK:       latch:
 ; CHECK-NEXT:    [[STOREMERGE:%.*]] = phi i32 [ [[TMP4]], [[COND_STORE]] ], [ [[ADD]], [[FOR_BODY3]] ]
-; CHECK-NEXT:    store i32 [[STOREMERGE]], i32* [[ARRAYIDX5]], align 4
+; CHECK-NEXT:    store i32 [[STOREMERGE]], ptr [[ARRAYIDX5]], align 4
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT:    [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[ITR]]
@@ -484,24 +483,24 @@ for.cond1.preheader:                              ; preds = %entry, %for.inc8
   br i1 %cmp218, label %for.body3.lr.ph, label %for.inc8
 
 for.body3.lr.ph:                                  ; preds = %for.cond1.preheader
-  %arrayidx5 = getelementptr inbounds i32, i32* %var1, i64 %indvars.iv23
+  %arrayidx5 = getelementptr inbounds i32, ptr %var1, i64 %indvars.iv23
   %0 = zext i32 %j.022 to i64
   br label %for.body3
 
 for.body3:                                        ; preds = %for.body3, %for.body3.lr.ph
   %indvars.iv = phi i64 [ %0, %for.body3.lr.ph ], [ %indvars.iv.next, %latch ]
-  %arrayidx = getelementptr inbounds i32, i32* %var2, i64 %indvars.iv
-  %1 = load i32, i32* %arrayidx, align 4
-  %2 = load i32, i32* %arrayidx5, align 4
+  %arrayidx = getelementptr inbounds i32, ptr %var2, i64 %indvars.iv
+  %1 = load i32, ptr %arrayidx, align 4
+  %2 = load i32, ptr %arrayidx5, align 4
   %add = add nsw i32 %2, %1
-  store i32 %add, i32* %arrayidx5, align 4
-  %3 = load i32, i32* %arrayidx5, align 4
+  store i32 %add, ptr %arrayidx5, align 4
+  %3 = load i32, ptr %arrayidx5, align 4
   %4 = add nsw i32 %3, 1
   %5 = icmp ugt i32 %3, 42
   br i1 %5, label %cond_store, label %latch
 
 cond_store:
-  store i32 %4, i32* %arrayidx5, align 4
+  store i32 %4, ptr %arrayidx5, align 4
   br label %latch
 
 latch:
@@ -526,10 +525,10 @@ for.end10:                                        ; preds = %for.inc8, %entry
 ; PR39653
 ; Note: %i10 could be replaced by phi(%arg4, %i12), a potentially vectorizable
 ; 1st-order-recurrence
-define void @unsafe_dep_uniform_load_store(i32 %arg, i32 %arg1, i64 %arg2, i16* %arg3, i32 %arg4, i64 %arg5) {
+define void @unsafe_dep_uniform_load_store(i32 %arg, i32 %arg1, i64 %arg2, ptr %arg3, i32 %arg4, i64 %arg5) {
 ; CHECK-LABEL: @unsafe_dep_uniform_load_store(
 ; CHECK-NEXT:  bb:
-; CHECK-NEXT:    [[I6:%.*]] = getelementptr inbounds i16, i16* [[ARG3:%.*]], i64 [[ARG5:%.*]]
+; CHECK-NEXT:    [[I6:%.*]] = getelementptr inbounds i16, ptr [[ARG3:%.*]], i64 [[ARG5:%.*]]
 ; CHECK-NEXT:    br label [[BB7:%.*]]
 ; CHECK:       bb7:
 ; CHECK-NEXT:    [[I121:%.*]] = phi i32 [ [[ARG4:%.*]], [[BB:%.*]] ], [ [[I12:%.*]], [[BB7]] ]
@@ -542,13 +541,13 @@ define void @unsafe_dep_uniform_load_store(i32 %arg, i32 %arg1, i64 %arg2, i16*
 ; CHECK-NEXT:    [[I15:%.*]] = trunc i64 [[I8]] to i32
 ; CHECK-NEXT:    [[I16:%.*]] = add i32 [[I15]], [[ARG:%.*]]
 ; CHECK-NEXT:    [[I17:%.*]] = zext i32 [[I16]] to i64
-; CHECK-NEXT:    [[I18:%.*]] = getelementptr inbounds i16, i16* [[I6]], i64 [[I17]]
-; CHECK-NEXT:    store i16 [[I14]], i16* [[I18]], align 2
+; CHECK-NEXT:    [[I18:%.*]] = getelementptr inbounds i16, ptr [[I6]], i64 [[I17]]
+; CHECK-NEXT:    store i16 [[I14]], ptr [[I18]], align 2
 ; CHECK-NEXT:    [[I19:%.*]] = add i32 [[I13]], [[I9]]
 ; CHECK-NEXT:    [[I20:%.*]] = trunc i32 [[I19]] to i16
 ; CHECK-NEXT:    [[I21:%.*]] = and i16 [[I20]], 255
-; CHECK-NEXT:    [[I22:%.*]] = getelementptr inbounds i16, i16* [[ARG3]], i64 [[I17]]
-; CHECK-NEXT:    store i16 [[I21]], i16* [[I22]], align 2
+; CHECK-NEXT:    [[I22:%.*]] = getelementptr inbounds i16, ptr [[ARG3]], i64 [[I17]]
+; CHECK-NEXT:    store i16 [[I21]], ptr [[I22]], align 2
 ; CHECK-NEXT:    [[I23]] = add nsw i32 [[I9]], 1
 ; CHECK-NEXT:    [[I24]] = add nuw nsw i64 [[I8]], 1
 ; CHECK-NEXT:    [[I25:%.*]] = icmp eq i64 [[I24]], [[ARG2:%.*]]
@@ -558,14 +557,14 @@ define void @unsafe_dep_uniform_load_store(i32 %arg, i32 %arg1, i64 %arg2, i16*
 ;
 bb:
   %i = alloca i32
-  store i32 %arg4, i32* %i
-  %i6 = getelementptr inbounds i16, i16* %arg3, i64 %arg5
+  store i32 %arg4, ptr %i
+  %i6 = getelementptr inbounds i16, ptr %arg3, i64 %arg5
   br label %bb7
 
 bb7:
   %i8 = phi i64 [ 0, %bb ], [ %i24, %bb7 ]
   %i9 = phi i32 [ %arg1, %bb ], [ %i23, %bb7 ]
-  %i10 = load i32, i32* %i
+  %i10 = load i32, ptr %i
   %i11 = mul nsw i32 %i9, %i10
   %i12 = srem i32 %i11, 65536
   %i13 = add nsw i32 %i12, %i9
@@ -573,17 +572,17 @@ bb7:
   %i15 = trunc i64 %i8 to i32
   %i16 = add i32 %arg, %i15
   %i17 = zext i32 %i16 to i64
-  %i18 = getelementptr inbounds i16, i16* %i6, i64 %i17
-  store i16 %i14, i16* %i18, align 2
+  %i18 = getelementptr inbounds i16, ptr %i6, i64 %i17
+  store i16 %i14, ptr %i18, align 2
   %i19 = add i32 %i13, %i9
   %i20 = trunc i32 %i19 to i16
   %i21 = and i16 %i20, 255
-  %i22 = getelementptr inbounds i16, i16* %arg3, i64 %i17
-  store i16 %i21, i16* %i22, align 2
+  %i22 = getelementptr inbounds i16, ptr %arg3, i64 %i17
+  store i16 %i21, ptr %i22, align 2
   %i23 = add nsw i32 %i9, 1
   %i24 = add nuw nsw i64 %i8, 1
   %i25 = icmp eq i64 %i24, %arg2
-  store i32 %i12, i32* %i
+  store i32 %i12, ptr %i
   br i1 %i25, label %bb26, label %bb7
 
 bb26:
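
In @inv_val_store_to_inv_address_conditional above, the conditional store
to the invariant address stays scalarized into the pred.store.if /
pred.store.continue chain: each mask lane is extracted and guards its own
scalar store. Only the block and value numbering changed, again because
the dropped bitcasts renumber everything after them. One lane of that
pattern, sketched as a standalone function with an invented name:

    define void @pred_store_lane0(<4 x i1> %mask, i32 %val, ptr %a) {
    entry:
      ; test lane 0 of the mask and store only when it is set
      %bit = extractelement <4 x i1> %mask, i64 0
      br i1 %bit, label %pred.store.if, label %pred.store.continue

    pred.store.if:
      store i32 %val, ptr %a, align 4
      br label %pred.store.continue

    pred.store.continue:
      ret void
    }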

diff --git a/llvm/test/Transforms/LoopVectorize/loop-form.ll b/llvm/test/Transforms/LoopVectorize/loop-form.ll
index fe60ed251c52..e1816b8d4d0f 100644
--- a/llvm/test/Transforms/LoopVectorize/loop-form.ll
+++ b/llvm/test/Transforms/LoopVectorize/loop-form.ll
@@ -4,7 +4,7 @@
 
 target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
 
-define void @bottom_tested(i16* %p, i32 %n) {
+define void @bottom_tested(ptr %p, i32 %n) {
 ; CHECK-LABEL: @bottom_tested(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[SMAX:%.*]] = call i32 @llvm.smax.i32(i32 [[N:%.*]], i32 0)
@@ -19,13 +19,12 @@ define void @bottom_tested(i16* %p, i32 %n) {
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[INDEX]], 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = sext i32 [[TMP1]] to i64
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i16, i16* [[P:%.*]], i64 [[TMP2]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i16, i16* [[TMP3]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i16* [[TMP4]] to <2 x i16>*
-; CHECK-NEXT:    store <2 x i16> zeroinitializer, <2 x i16>* [[TMP5]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i16, ptr [[P:%.*]], i64 [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i16, ptr [[TMP3]], i32 0
+; CHECK-NEXT:    store <2 x i16> zeroinitializer, ptr [[TMP4]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[TMP0]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[IF_END:%.*]], label [[SCALAR_PH]]
@@ -35,8 +34,8 @@ define void @bottom_tested(i16* %p, i32 %n) {
 ; CHECK:       for.cond:
 ; CHECK-NEXT:    [[I:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_COND]] ]
 ; CHECK-NEXT:    [[IPROM:%.*]] = sext i32 [[I]] to i64
-; CHECK-NEXT:    [[B:%.*]] = getelementptr inbounds i16, i16* [[P]], i64 [[IPROM]]
-; CHECK-NEXT:    store i16 0, i16* [[B]], align 4
+; CHECK-NEXT:    [[B:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[IPROM]]
+; CHECK-NEXT:    store i16 0, ptr [[B]], align 4
 ; CHECK-NEXT:    [[INC]] = add nsw i32 [[I]], 1
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[I]], [[N]]
 ; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_COND]], label [[IF_END]], !llvm.loop [[LOOP2:![0-9]+]]
@@ -65,16 +64,16 @@ define void @bottom_tested(i16* %p, i32 %n) {
 ; TAILFOLD-NEXT:    br i1 [[TMP3]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
 ; TAILFOLD:       pred.store.if:
 ; TAILFOLD-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP2]], i32 0
-; TAILFOLD-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i16, i16* [[P:%.*]], i64 [[TMP4]]
-; TAILFOLD-NEXT:    store i16 0, i16* [[TMP5]], align 4
+; TAILFOLD-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i16, ptr [[P:%.*]], i64 [[TMP4]]
+; TAILFOLD-NEXT:    store i16 0, ptr [[TMP5]], align 4
 ; TAILFOLD-NEXT:    br label [[PRED_STORE_CONTINUE]]
 ; TAILFOLD:       pred.store.continue:
 ; TAILFOLD-NEXT:    [[TMP6:%.*]] = extractelement <2 x i1> [[TMP1]], i32 1
 ; TAILFOLD-NEXT:    br i1 [[TMP6]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2]]
 ; TAILFOLD:       pred.store.if1:
 ; TAILFOLD-NEXT:    [[TMP7:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1
-; TAILFOLD-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i16, i16* [[P]], i64 [[TMP7]]
-; TAILFOLD-NEXT:    store i16 0, i16* [[TMP8]], align 4
+; TAILFOLD-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[TMP7]]
+; TAILFOLD-NEXT:    store i16 0, ptr [[TMP8]], align 4
 ; TAILFOLD-NEXT:    br label [[PRED_STORE_CONTINUE2]]
 ; TAILFOLD:       pred.store.continue2:
 ; TAILFOLD-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 2
@@ -89,8 +88,8 @@ define void @bottom_tested(i16* %p, i32 %n) {
 ; TAILFOLD:       for.cond:
 ; TAILFOLD-NEXT:    [[I:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_COND]] ]
 ; TAILFOLD-NEXT:    [[IPROM:%.*]] = sext i32 [[I]] to i64
-; TAILFOLD-NEXT:    [[B:%.*]] = getelementptr inbounds i16, i16* [[P]], i64 [[IPROM]]
-; TAILFOLD-NEXT:    store i16 0, i16* [[B]], align 4
+; TAILFOLD-NEXT:    [[B:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[IPROM]]
+; TAILFOLD-NEXT:    store i16 0, ptr [[B]], align 4
 ; TAILFOLD-NEXT:    [[INC]] = add nsw i32 [[I]], 1
 ; TAILFOLD-NEXT:    [[CMP:%.*]] = icmp slt i32 [[I]], [[N]]
 ; TAILFOLD-NEXT:    br i1 [[CMP]], label [[FOR_COND]], label [[IF_END]], !llvm.loop [[LOOP2:![0-9]+]]
@@ -103,8 +102,8 @@ entry:
 for.cond:
   %i = phi i32 [ 0, %entry ], [ %inc, %for.cond ]
   %iprom = sext i32 %i to i64
-  %b = getelementptr inbounds i16, i16* %p, i64 %iprom
-  store i16 0, i16* %b, align 4
+  %b = getelementptr inbounds i16, ptr %p, i64 %iprom
+  store i16 0, ptr %b, align 4
   %inc = add nsw i32 %i, 1
   %cmp = icmp slt i32 %i, %n
   br i1 %cmp, label %for.cond, label %if.end
@@ -113,7 +112,7 @@ if.end:
   ret void
 }
 
-define void @early_exit(i16* %p, i32 %n) {
+define void @early_exit(ptr %p, i32 %n) {
 ; CHECK-LABEL: @early_exit(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[SMAX:%.*]] = call i32 @llvm.smax.i32(i32 [[N:%.*]], i32 0)
@@ -130,13 +129,12 @@ define void @early_exit(i16* %p, i32 %n) {
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[INDEX]], 0
 ; CHECK-NEXT:    [[TMP4:%.*]] = sext i32 [[TMP3]] to i64
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i16, i16* [[P:%.*]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i16, i16* [[TMP5]], i32 0
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i16* [[TMP6]] to <2 x i16>*
-; CHECK-NEXT:    store <2 x i16> zeroinitializer, <2 x i16>* [[TMP7]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i16, ptr [[P:%.*]], i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[TMP5]], i32 0
+; CHECK-NEXT:    store <2 x i16> zeroinitializer, ptr [[TMP6]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -148,8 +146,8 @@ define void @early_exit(i16* %p, i32 %n) {
 ; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[IF_END:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[IPROM:%.*]] = sext i32 [[I]] to i64
-; CHECK-NEXT:    [[B:%.*]] = getelementptr inbounds i16, i16* [[P]], i64 [[IPROM]]
-; CHECK-NEXT:    store i16 0, i16* [[B]], align 4
+; CHECK-NEXT:    [[B:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[IPROM]]
+; CHECK-NEXT:    store i16 0, ptr [[B]], align 4
 ; CHECK-NEXT:    [[INC]] = add nsw i32 [[I]], 1
 ; CHECK-NEXT:    br label [[FOR_COND]], !llvm.loop [[LOOP5:![0-9]+]]
 ; CHECK:       if.end:
@@ -164,8 +162,8 @@ define void @early_exit(i16* %p, i32 %n) {
 ; TAILFOLD-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[IF_END:%.*]]
 ; TAILFOLD:       for.body:
 ; TAILFOLD-NEXT:    [[IPROM:%.*]] = sext i32 [[I]] to i64
-; TAILFOLD-NEXT:    [[B:%.*]] = getelementptr inbounds i16, i16* [[P:%.*]], i64 [[IPROM]]
-; TAILFOLD-NEXT:    store i16 0, i16* [[B]], align 4
+; TAILFOLD-NEXT:    [[B:%.*]] = getelementptr inbounds i16, ptr [[P:%.*]], i64 [[IPROM]]
+; TAILFOLD-NEXT:    store i16 0, ptr [[B]], align 4
 ; TAILFOLD-NEXT:    [[INC]] = add nsw i32 [[I]], 1
 ; TAILFOLD-NEXT:    br label [[FOR_COND]]
 ; TAILFOLD:       if.end:
@@ -181,8 +179,8 @@ for.cond:
 
 for.body:
   %iprom = sext i32 %i to i64
-  %b = getelementptr inbounds i16, i16* %p, i64 %iprom
-  store i16 0, i16* %b, align 4
+  %b = getelementptr inbounds i16, ptr %p, i64 %iprom
+  store i16 0, ptr %b, align 4
   %inc = add nsw i32 %i, 1
   br label %for.cond
 
@@ -190,7 +188,7 @@ if.end:
   ret void
 }
 
-define i32 @early_exit_with_live_out(i32* %ptr) {
+define i32 @early_exit_with_live_out(ptr %ptr) {
 ; CHECK-LABEL: @early_exit_with_live_out(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
@@ -199,15 +197,13 @@ define i32 @early_exit_with_live_out(i32* %ptr) {
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i32, i32* [[PTR:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i32, i32* [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <2 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i32>, <2 x i32>* [[TMP3]], align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32* [[TMP2]] to <2 x i32>*
-; CHECK-NEXT:    store <2 x i32> <i32 10, i32 10>, <2 x i32>* [[TMP4]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i32, ptr [[TMP1]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP2]], align 4
+; CHECK-NEXT:    store <2 x i32> <i32 10, i32 10>, ptr [[TMP2]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 998
-; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 998
+; CHECK-NEXT:    br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -215,13 +211,13 @@ define i32 @early_exit_with_live_out(i32* %ptr) {
 ; CHECK-NEXT:    br label [[LOOP_HEADER:%.*]]
 ; CHECK:       loop.header:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i32, i32* [[PTR]], i64 [[IV]]
-; CHECK-NEXT:    [[L:%.*]] = load i32, i32* [[GEP]], align 4
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i32, ptr [[PTR]], i64 [[IV]]
+; CHECK-NEXT:    [[L:%.*]] = load i32, ptr [[GEP]], align 4
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
 ; CHECK-NEXT:    br i1 [[EC]], label [[EXIT:%.*]], label [[LOOP_LATCH]]
 ; CHECK:       loop.latch:
-; CHECK-NEXT:    store i32 10, i32* [[GEP]], align 4
+; CHECK-NEXT:    store i32 10, ptr [[GEP]], align 4
 ; CHECK-NEXT:    br label [[LOOP_HEADER]], !llvm.loop [[LOOP7:![0-9]+]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    [[RES_LCSSA:%.*]] = phi i32 [ [[L]], [[LOOP_HEADER]] ]
@@ -232,13 +228,13 @@ define i32 @early_exit_with_live_out(i32* %ptr) {
 ; TAILFOLD-NEXT:    br label [[LOOP_HEADER:%.*]]
 ; TAILFOLD:       loop.header:
 ; TAILFOLD-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
-; TAILFOLD-NEXT:    [[GEP:%.*]] = getelementptr i32, i32* [[PTR:%.*]], i64 [[IV]]
-; TAILFOLD-NEXT:    [[L:%.*]] = load i32, i32* [[GEP]], align 4
+; TAILFOLD-NEXT:    [[GEP:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[IV]]
+; TAILFOLD-NEXT:    [[L:%.*]] = load i32, ptr [[GEP]], align 4
 ; TAILFOLD-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; TAILFOLD-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
 ; TAILFOLD-NEXT:    br i1 [[EC]], label [[EXIT:%.*]], label [[LOOP_LATCH]]
 ; TAILFOLD:       loop.latch:
-; TAILFOLD-NEXT:    store i32 10, i32* [[GEP]], align 4
+; TAILFOLD-NEXT:    store i32 10, ptr [[GEP]], align 4
 ; TAILFOLD-NEXT:    br label [[LOOP_HEADER]]
 ; TAILFOLD:       exit:
 ; TAILFOLD-NEXT:    [[RES_LCSSA:%.*]] = phi i32 [ [[L]], [[LOOP_HEADER]] ]
@@ -249,14 +245,14 @@ entry:
 
 loop.header:
   %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ]
-  %gep = getelementptr i32, i32* %ptr, i64 %iv
-  %l = load i32, i32* %gep
+  %gep = getelementptr i32, ptr %ptr, i64 %iv
+  %l = load i32, ptr %gep
   %iv.next = add nuw nsw i64 %iv, 1
   %ec = icmp eq i64 %iv.next, 1000
   br i1 %ec, label %exit, label %loop.latch
 
 loop.latch:
-  store i32 10, i32* %gep
+  store i32 10, ptr %gep
   br label %loop.header
 
 exit:
@@ -266,7 +262,7 @@ exit:
 
 ; Same as early_exit, but with optsize to prevent the use of
 ; a scalar epilogue -- we can't vectorize this in either case.
-define void @optsize(i16* %p, i32 %n) optsize {
+define void @optsize(ptr %p, i32 %n) optsize {
 ; CHECK-LABEL: @optsize(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br label [[FOR_COND:%.*]]
@@ -276,8 +272,8 @@ define void @optsize(i16* %p, i32 %n) optsize {
 ; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[IF_END:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[IPROM:%.*]] = sext i32 [[I]] to i64
-; CHECK-NEXT:    [[B:%.*]] = getelementptr inbounds i16, i16* [[P:%.*]], i64 [[IPROM]]
-; CHECK-NEXT:    store i16 0, i16* [[B]], align 4
+; CHECK-NEXT:    [[B:%.*]] = getelementptr inbounds i16, ptr [[P:%.*]], i64 [[IPROM]]
+; CHECK-NEXT:    store i16 0, ptr [[B]], align 4
 ; CHECK-NEXT:    [[INC]] = add nsw i32 [[I]], 1
 ; CHECK-NEXT:    br label [[FOR_COND]]
 ; CHECK:       if.end:
@@ -292,8 +288,8 @@ define void @optsize(i16* %p, i32 %n) optsize {
 ; TAILFOLD-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[IF_END:%.*]]
 ; TAILFOLD:       for.body:
 ; TAILFOLD-NEXT:    [[IPROM:%.*]] = sext i32 [[I]] to i64
-; TAILFOLD-NEXT:    [[B:%.*]] = getelementptr inbounds i16, i16* [[P:%.*]], i64 [[IPROM]]
-; TAILFOLD-NEXT:    store i16 0, i16* [[B]], align 4
+; TAILFOLD-NEXT:    [[B:%.*]] = getelementptr inbounds i16, ptr [[P:%.*]], i64 [[IPROM]]
+; TAILFOLD-NEXT:    store i16 0, ptr [[B]], align 4
 ; TAILFOLD-NEXT:    [[INC]] = add nsw i32 [[I]], 1
 ; TAILFOLD-NEXT:    br label [[FOR_COND]]
 ; TAILFOLD:       if.end:
@@ -309,8 +305,8 @@ for.cond:
 
 for.body:
   %iprom = sext i32 %i to i64
-  %b = getelementptr inbounds i16, i16* %p, i64 %iprom
-  store i16 0, i16* %b, align 4
+  %b = getelementptr inbounds i16, ptr %p, i64 %iprom
+  store i16 0, ptr %b, align 4
   %inc = add nsw i32 %i, 1
   br label %for.cond
 
@@ -320,7 +316,7 @@ if.end:
 
 
 ; multiple exit - no values inside the loop used outside
-define void @multiple_unique_exit(i16* %p, i32 %n) {
+define void @multiple_unique_exit(ptr %p, i32 %n) {
 ; CHECK-LABEL: @multiple_unique_exit(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[SMAX:%.*]] = call i32 @llvm.smax.i32(i32 [[N:%.*]], i32 0)
@@ -338,13 +334,12 @@ define void @multiple_unique_exit(i16* %p, i32 %n) {
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[INDEX]], 0
 ; CHECK-NEXT:    [[TMP4:%.*]] = sext i32 [[TMP3]] to i64
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i16, i16* [[P:%.*]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i16, i16* [[TMP5]], i32 0
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i16* [[TMP6]] to <2 x i16>*
-; CHECK-NEXT:    store <2 x i16> zeroinitializer, <2 x i16>* [[TMP7]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i16, ptr [[P:%.*]], i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[TMP5]], i32 0
+; CHECK-NEXT:    store <2 x i16> zeroinitializer, ptr [[TMP6]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -356,8 +351,8 @@ define void @multiple_unique_exit(i16* %p, i32 %n) {
 ; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[IF_END:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[IPROM:%.*]] = sext i32 [[I]] to i64
-; CHECK-NEXT:    [[B:%.*]] = getelementptr inbounds i16, i16* [[P]], i64 [[IPROM]]
-; CHECK-NEXT:    store i16 0, i16* [[B]], align 4
+; CHECK-NEXT:    [[B:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[IPROM]]
+; CHECK-NEXT:    store i16 0, ptr [[B]], align 4
 ; CHECK-NEXT:    [[INC]] = add nsw i32 [[I]], 1
 ; CHECK-NEXT:    [[CMP2:%.*]] = icmp slt i32 [[I]], 2096
 ; CHECK-NEXT:    br i1 [[CMP2]], label [[FOR_COND]], label [[IF_END]], !llvm.loop [[LOOP9:![0-9]+]]
@@ -373,8 +368,8 @@ define void @multiple_unique_exit(i16* %p, i32 %n) {
 ; TAILFOLD-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[IF_END:%.*]]
 ; TAILFOLD:       for.body:
 ; TAILFOLD-NEXT:    [[IPROM:%.*]] = sext i32 [[I]] to i64
-; TAILFOLD-NEXT:    [[B:%.*]] = getelementptr inbounds i16, i16* [[P:%.*]], i64 [[IPROM]]
-; TAILFOLD-NEXT:    store i16 0, i16* [[B]], align 4
+; TAILFOLD-NEXT:    [[B:%.*]] = getelementptr inbounds i16, ptr [[P:%.*]], i64 [[IPROM]]
+; TAILFOLD-NEXT:    store i16 0, ptr [[B]], align 4
 ; TAILFOLD-NEXT:    [[INC]] = add nsw i32 [[I]], 1
 ; TAILFOLD-NEXT:    [[CMP2:%.*]] = icmp slt i32 [[I]], 2096
 ; TAILFOLD-NEXT:    br i1 [[CMP2]], label [[FOR_COND]], label [[IF_END]]
@@ -391,8 +386,8 @@ for.cond:
 
 for.body:
   %iprom = sext i32 %i to i64
-  %b = getelementptr inbounds i16, i16* %p, i64 %iprom
-  store i16 0, i16* %b, align 4
+  %b = getelementptr inbounds i16, ptr %p, i64 %iprom
+  store i16 0, ptr %b, align 4
   %inc = add nsw i32 %i, 1
   %cmp2 = icmp slt i32 %i, 2096
   br i1 %cmp2, label %for.cond, label %if.end
@@ -402,7 +397,7 @@ if.end:
 }
 
 ; multiple exit - with an lcssa phi
-define i32 @multiple_unique_exit2(i16* %p, i32 %n) {
+define i32 @multiple_unique_exit2(ptr %p, i32 %n) {
 ; CHECK-LABEL: @multiple_unique_exit2(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[SMAX:%.*]] = call i32 @llvm.smax.i32(i32 [[N:%.*]], i32 0)
@@ -420,13 +415,12 @@ define i32 @multiple_unique_exit2(i16* %p, i32 %n) {
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[INDEX]], 0
 ; CHECK-NEXT:    [[TMP4:%.*]] = sext i32 [[TMP3]] to i64
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i16, i16* [[P:%.*]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i16, i16* [[TMP5]], i32 0
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i16* [[TMP6]] to <2 x i16>*
-; CHECK-NEXT:    store <2 x i16> zeroinitializer, <2 x i16>* [[TMP7]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i16, ptr [[P:%.*]], i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[TMP5]], i32 0
+; CHECK-NEXT:    store <2 x i16> zeroinitializer, ptr [[TMP6]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -438,8 +432,8 @@ define i32 @multiple_unique_exit2(i16* %p, i32 %n) {
 ; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[IF_END:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[IPROM:%.*]] = sext i32 [[I]] to i64
-; CHECK-NEXT:    [[B:%.*]] = getelementptr inbounds i16, i16* [[P]], i64 [[IPROM]]
-; CHECK-NEXT:    store i16 0, i16* [[B]], align 4
+; CHECK-NEXT:    [[B:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[IPROM]]
+; CHECK-NEXT:    store i16 0, ptr [[B]], align 4
 ; CHECK-NEXT:    [[INC]] = add nsw i32 [[I]], 1
 ; CHECK-NEXT:    [[CMP2:%.*]] = icmp slt i32 [[I]], 2096
 ; CHECK-NEXT:    br i1 [[CMP2]], label [[FOR_COND]], label [[IF_END]], !llvm.loop [[LOOP11:![0-9]+]]
@@ -456,8 +450,8 @@ define i32 @multiple_unique_exit2(i16* %p, i32 %n) {
 ; TAILFOLD-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[IF_END:%.*]]
 ; TAILFOLD:       for.body:
 ; TAILFOLD-NEXT:    [[IPROM:%.*]] = sext i32 [[I]] to i64
-; TAILFOLD-NEXT:    [[B:%.*]] = getelementptr inbounds i16, i16* [[P:%.*]], i64 [[IPROM]]
-; TAILFOLD-NEXT:    store i16 0, i16* [[B]], align 4
+; TAILFOLD-NEXT:    [[B:%.*]] = getelementptr inbounds i16, ptr [[P:%.*]], i64 [[IPROM]]
+; TAILFOLD-NEXT:    store i16 0, ptr [[B]], align 4
 ; TAILFOLD-NEXT:    [[INC]] = add nsw i32 [[I]], 1
 ; TAILFOLD-NEXT:    [[CMP2:%.*]] = icmp slt i32 [[I]], 2096
 ; TAILFOLD-NEXT:    br i1 [[CMP2]], label [[FOR_COND]], label [[IF_END]]
@@ -475,8 +469,8 @@ for.cond:
 
 for.body:
   %iprom = sext i32 %i to i64
-  %b = getelementptr inbounds i16, i16* %p, i64 %iprom
-  store i16 0, i16* %b, align 4
+  %b = getelementptr inbounds i16, ptr %p, i64 %iprom
+  store i16 0, ptr %b, align 4
   %inc = add nsw i32 %i, 1
   %cmp2 = icmp slt i32 %i, 2096
   br i1 %cmp2, label %for.cond, label %if.end
@@ -486,7 +480,7 @@ if.end:
 }
 
 ; multiple exit w/ a non-lcssa phi
-define i32 @multiple_unique_exit3(i16* %p, i32 %n) {
+define i32 @multiple_unique_exit3(ptr %p, i32 %n) {
 ; CHECK-LABEL: @multiple_unique_exit3(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[SMAX:%.*]] = call i32 @llvm.smax.i32(i32 [[N:%.*]], i32 0)
@@ -504,13 +498,12 @@ define i32 @multiple_unique_exit3(i16* %p, i32 %n) {
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[INDEX]], 0
 ; CHECK-NEXT:    [[TMP4:%.*]] = sext i32 [[TMP3]] to i64
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i16, i16* [[P:%.*]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i16, i16* [[TMP5]], i32 0
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i16* [[TMP6]] to <2 x i16>*
-; CHECK-NEXT:    store <2 x i16> zeroinitializer, <2 x i16>* [[TMP7]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i16, ptr [[P:%.*]], i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[TMP5]], i32 0
+; CHECK-NEXT:    store <2 x i16> zeroinitializer, ptr [[TMP6]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -522,8 +515,8 @@ define i32 @multiple_unique_exit3(i16* %p, i32 %n) {
 ; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[IF_END:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[IPROM:%.*]] = sext i32 [[I]] to i64
-; CHECK-NEXT:    [[B:%.*]] = getelementptr inbounds i16, i16* [[P]], i64 [[IPROM]]
-; CHECK-NEXT:    store i16 0, i16* [[B]], align 4
+; CHECK-NEXT:    [[B:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[IPROM]]
+; CHECK-NEXT:    store i16 0, ptr [[B]], align 4
 ; CHECK-NEXT:    [[INC]] = add nsw i32 [[I]], 1
 ; CHECK-NEXT:    [[CMP2:%.*]] = icmp slt i32 [[I]], 2096
 ; CHECK-NEXT:    br i1 [[CMP2]], label [[FOR_COND]], label [[IF_END]], !llvm.loop [[LOOP13:![0-9]+]]
@@ -540,8 +533,8 @@ define i32 @multiple_unique_exit3(i16* %p, i32 %n) {
 ; TAILFOLD-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[IF_END:%.*]]
 ; TAILFOLD:       for.body:
 ; TAILFOLD-NEXT:    [[IPROM:%.*]] = sext i32 [[I]] to i64
-; TAILFOLD-NEXT:    [[B:%.*]] = getelementptr inbounds i16, i16* [[P:%.*]], i64 [[IPROM]]
-; TAILFOLD-NEXT:    store i16 0, i16* [[B]], align 4
+; TAILFOLD-NEXT:    [[B:%.*]] = getelementptr inbounds i16, ptr [[P:%.*]], i64 [[IPROM]]
+; TAILFOLD-NEXT:    store i16 0, ptr [[B]], align 4
 ; TAILFOLD-NEXT:    [[INC]] = add nsw i32 [[I]], 1
 ; TAILFOLD-NEXT:    [[CMP2:%.*]] = icmp slt i32 [[I]], 2096
 ; TAILFOLD-NEXT:    br i1 [[CMP2]], label [[FOR_COND]], label [[IF_END]]
@@ -559,8 +552,8 @@ for.cond:
 
 for.body:
   %iprom = sext i32 %i to i64
-  %b = getelementptr inbounds i16, i16* %p, i64 %iprom
-  store i16 0, i16* %b, align 4
+  %b = getelementptr inbounds i16, ptr %p, i64 %iprom
+  store i16 0, ptr %b, align 4
   %inc = add nsw i32 %i, 1
   %cmp2 = icmp slt i32 %i, 2096
   br i1 %cmp2, label %for.cond, label %if.end
@@ -571,7 +564,7 @@ if.end:
 }
 
 ; multiple exits w/distinct target blocks
-define i32 @multiple_exit_blocks(i16* %p, i32 %n) {
+define i32 @multiple_exit_blocks(ptr %p, i32 %n) {
 ; CHECK-LABEL: @multiple_exit_blocks(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[SMAX:%.*]] = call i32 @llvm.smax.i32(i32 [[N:%.*]], i32 0)
@@ -589,13 +582,12 @@ define i32 @multiple_exit_blocks(i16* %p, i32 %n) {
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[INDEX]], 0
 ; CHECK-NEXT:    [[TMP4:%.*]] = sext i32 [[TMP3]] to i64
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i16, i16* [[P:%.*]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i16, i16* [[TMP5]], i32 0
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i16* [[TMP6]] to <2 x i16>*
-; CHECK-NEXT:    store <2 x i16> zeroinitializer, <2 x i16>* [[TMP7]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i16, ptr [[P:%.*]], i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[TMP5]], i32 0
+; CHECK-NEXT:    store <2 x i16> zeroinitializer, ptr [[TMP6]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -607,8 +599,8 @@ define i32 @multiple_exit_blocks(i16* %p, i32 %n) {
 ; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[IF_END:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[IPROM:%.*]] = sext i32 [[I]] to i64
-; CHECK-NEXT:    [[B:%.*]] = getelementptr inbounds i16, i16* [[P]], i64 [[IPROM]]
-; CHECK-NEXT:    store i16 0, i16* [[B]], align 4
+; CHECK-NEXT:    [[B:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[IPROM]]
+; CHECK-NEXT:    store i16 0, ptr [[B]], align 4
 ; CHECK-NEXT:    [[INC]] = add nsw i32 [[I]], 1
 ; CHECK-NEXT:    [[CMP2:%.*]] = icmp slt i32 [[I]], 2096
 ; CHECK-NEXT:    br i1 [[CMP2]], label [[FOR_COND]], label [[IF_END2:%.*]], !llvm.loop [[LOOP15:![0-9]+]]
@@ -626,8 +618,8 @@ define i32 @multiple_exit_blocks(i16* %p, i32 %n) {
 ; TAILFOLD-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[IF_END:%.*]]
 ; TAILFOLD:       for.body:
 ; TAILFOLD-NEXT:    [[IPROM:%.*]] = sext i32 [[I]] to i64
-; TAILFOLD-NEXT:    [[B:%.*]] = getelementptr inbounds i16, i16* [[P:%.*]], i64 [[IPROM]]
-; TAILFOLD-NEXT:    store i16 0, i16* [[B]], align 4
+; TAILFOLD-NEXT:    [[B:%.*]] = getelementptr inbounds i16, ptr [[P:%.*]], i64 [[IPROM]]
+; TAILFOLD-NEXT:    store i16 0, ptr [[B]], align 4
 ; TAILFOLD-NEXT:    [[INC]] = add nsw i32 [[I]], 1
 ; TAILFOLD-NEXT:    [[CMP2:%.*]] = icmp slt i32 [[I]], 2096
 ; TAILFOLD-NEXT:    br i1 [[CMP2]], label [[FOR_COND]], label [[IF_END2:%.*]]
@@ -646,8 +638,8 @@ for.cond:
 
 for.body:
   %iprom = sext i32 %i to i64
-  %b = getelementptr inbounds i16, i16* %p, i64 %iprom
-  store i16 0, i16* %b, align 4
+  %b = getelementptr inbounds i16, ptr %p, i64 %iprom
+  store i16 0, ptr %b, align 4
   %inc = add nsw i32 %i, 1
   %cmp2 = icmp slt i32 %i, 2096
   br i1 %cmp2, label %for.cond, label %if.end2
@@ -660,7 +652,7 @@ if.end2:
 }
 
 ; LCSSA, common value each exit
-define i32 @multiple_exit_blocks2(i16* %p, i32 %n) {
+define i32 @multiple_exit_blocks2(ptr %p, i32 %n) {
 ; CHECK-LABEL: @multiple_exit_blocks2(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[SMAX:%.*]] = call i32 @llvm.smax.i32(i32 [[N:%.*]], i32 0)
@@ -678,13 +670,12 @@ define i32 @multiple_exit_blocks2(i16* %p, i32 %n) {
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[INDEX]], 0
 ; CHECK-NEXT:    [[TMP4:%.*]] = sext i32 [[TMP3]] to i64
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i16, i16* [[P:%.*]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i16, i16* [[TMP5]], i32 0
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i16* [[TMP6]] to <2 x i16>*
-; CHECK-NEXT:    store <2 x i16> zeroinitializer, <2 x i16>* [[TMP7]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i16, ptr [[P:%.*]], i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[TMP5]], i32 0
+; CHECK-NEXT:    store <2 x i16> zeroinitializer, ptr [[TMP6]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -696,8 +687,8 @@ define i32 @multiple_exit_blocks2(i16* %p, i32 %n) {
 ; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[IF_END:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[IPROM:%.*]] = sext i32 [[I]] to i64
-; CHECK-NEXT:    [[B:%.*]] = getelementptr inbounds i16, i16* [[P]], i64 [[IPROM]]
-; CHECK-NEXT:    store i16 0, i16* [[B]], align 4
+; CHECK-NEXT:    [[B:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[IPROM]]
+; CHECK-NEXT:    store i16 0, ptr [[B]], align 4
 ; CHECK-NEXT:    [[INC]] = add nsw i32 [[I]], 1
 ; CHECK-NEXT:    [[CMP2:%.*]] = icmp slt i32 [[I]], 2096
 ; CHECK-NEXT:    br i1 [[CMP2]], label [[FOR_COND]], label [[IF_END2:%.*]], !llvm.loop [[LOOP17:![0-9]+]]
@@ -717,8 +708,8 @@ define i32 @multiple_exit_blocks2(i16* %p, i32 %n) {
 ; TAILFOLD-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[IF_END:%.*]]
 ; TAILFOLD:       for.body:
 ; TAILFOLD-NEXT:    [[IPROM:%.*]] = sext i32 [[I]] to i64
-; TAILFOLD-NEXT:    [[B:%.*]] = getelementptr inbounds i16, i16* [[P:%.*]], i64 [[IPROM]]
-; TAILFOLD-NEXT:    store i16 0, i16* [[B]], align 4
+; TAILFOLD-NEXT:    [[B:%.*]] = getelementptr inbounds i16, ptr [[P:%.*]], i64 [[IPROM]]
+; TAILFOLD-NEXT:    store i16 0, ptr [[B]], align 4
 ; TAILFOLD-NEXT:    [[INC]] = add nsw i32 [[I]], 1
 ; TAILFOLD-NEXT:    [[CMP2:%.*]] = icmp slt i32 [[I]], 2096
 ; TAILFOLD-NEXT:    br i1 [[CMP2]], label [[FOR_COND]], label [[IF_END2:%.*]]
@@ -739,8 +730,8 @@ for.cond:
 
 for.body:
   %iprom = sext i32 %i to i64
-  %b = getelementptr inbounds i16, i16* %p, i64 %iprom
-  store i16 0, i16* %b, align 4
+  %b = getelementptr inbounds i16, ptr %p, i64 %iprom
+  store i16 0, ptr %b, align 4
   %inc = add nsw i32 %i, 1
   %cmp2 = icmp slt i32 %i, 2096
   br i1 %cmp2, label %for.cond, label %if.end2
@@ -753,7 +744,7 @@ if.end2:
 }
 
 ; LCSSA, distinct value each exit
-define i32 @multiple_exit_blocks3(i16* %p, i32 %n) {
+define i32 @multiple_exit_blocks3(ptr %p, i32 %n) {
 ; CHECK-LABEL: @multiple_exit_blocks3(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[SMAX:%.*]] = call i32 @llvm.smax.i32(i32 [[N:%.*]], i32 0)
@@ -771,13 +762,12 @@ define i32 @multiple_exit_blocks3(i16* %p, i32 %n) {
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[INDEX]], 0
 ; CHECK-NEXT:    [[TMP4:%.*]] = sext i32 [[TMP3]] to i64
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i16, i16* [[P:%.*]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i16, i16* [[TMP5]], i32 0
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i16* [[TMP6]] to <2 x i16>*
-; CHECK-NEXT:    store <2 x i16> zeroinitializer, <2 x i16>* [[TMP7]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i16, ptr [[P:%.*]], i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[TMP5]], i32 0
+; CHECK-NEXT:    store <2 x i16> zeroinitializer, ptr [[TMP6]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -789,8 +779,8 @@ define i32 @multiple_exit_blocks3(i16* %p, i32 %n) {
 ; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[IF_END:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[IPROM:%.*]] = sext i32 [[I]] to i64
-; CHECK-NEXT:    [[B:%.*]] = getelementptr inbounds i16, i16* [[P]], i64 [[IPROM]]
-; CHECK-NEXT:    store i16 0, i16* [[B]], align 4
+; CHECK-NEXT:    [[B:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[IPROM]]
+; CHECK-NEXT:    store i16 0, ptr [[B]], align 4
 ; CHECK-NEXT:    [[INC]] = add nsw i32 [[I]], 1
 ; CHECK-NEXT:    [[CMP2:%.*]] = icmp slt i32 [[I]], 2096
 ; CHECK-NEXT:    br i1 [[CMP2]], label [[FOR_COND]], label [[IF_END2:%.*]], !llvm.loop [[LOOP19:![0-9]+]]
@@ -810,8 +800,8 @@ define i32 @multiple_exit_blocks3(i16* %p, i32 %n) {
 ; TAILFOLD-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[IF_END:%.*]]
 ; TAILFOLD:       for.body:
 ; TAILFOLD-NEXT:    [[IPROM:%.*]] = sext i32 [[I]] to i64
-; TAILFOLD-NEXT:    [[B:%.*]] = getelementptr inbounds i16, i16* [[P:%.*]], i64 [[IPROM]]
-; TAILFOLD-NEXT:    store i16 0, i16* [[B]], align 4
+; TAILFOLD-NEXT:    [[B:%.*]] = getelementptr inbounds i16, ptr [[P:%.*]], i64 [[IPROM]]
+; TAILFOLD-NEXT:    store i16 0, ptr [[B]], align 4
 ; TAILFOLD-NEXT:    [[INC]] = add nsw i32 [[I]], 1
 ; TAILFOLD-NEXT:    [[CMP2:%.*]] = icmp slt i32 [[I]], 2096
 ; TAILFOLD-NEXT:    br i1 [[CMP2]], label [[FOR_COND]], label [[IF_END2:%.*]]
@@ -832,8 +822,8 @@ for.cond:
 
 for.body:
   %iprom = sext i32 %i to i64
-  %b = getelementptr inbounds i16, i16* %p, i64 %iprom
-  store i16 0, i16* %b, align 4
+  %b = getelementptr inbounds i16, ptr %p, i64 %iprom
+  store i16 0, ptr %b, align 4
   %inc = add nsw i32 %i, 1
   %cmp2 = icmp slt i32 %i, 2096
   br i1 %cmp2, label %for.cond, label %if.end2
@@ -847,15 +837,15 @@ if.end2:
 
 ; unique exit case but with a switch, as two edges between the same pair of
 ; blocks are an often missed edge case
-define i32 @multiple_exit_switch(i16* %p, i32 %n) {
+define i32 @multiple_exit_switch(ptr %p, i32 %n) {
 ; CHECK-LABEL: @multiple_exit_switch(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br label [[FOR_COND:%.*]]
 ; CHECK:       for.cond:
 ; CHECK-NEXT:    [[I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_COND]] ]
 ; CHECK-NEXT:    [[IPROM:%.*]] = sext i32 [[I]] to i64
-; CHECK-NEXT:    [[B:%.*]] = getelementptr inbounds i16, i16* [[P:%.*]], i64 [[IPROM]]
-; CHECK-NEXT:    store i16 0, i16* [[B]], align 4
+; CHECK-NEXT:    [[B:%.*]] = getelementptr inbounds i16, ptr [[P:%.*]], i64 [[IPROM]]
+; CHECK-NEXT:    store i16 0, ptr [[B]], align 4
 ; CHECK-NEXT:    [[INC]] = add nsw i32 [[I]], 1
 ; CHECK-NEXT:    switch i32 [[I]], label [[FOR_COND]] [
 ; CHECK-NEXT:    i32 2096, label [[IF_END:%.*]]
@@ -871,8 +861,8 @@ define i32 @multiple_exit_switch(i16* %p, i32 %n) {
 ; TAILFOLD:       for.cond:
 ; TAILFOLD-NEXT:    [[I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_COND]] ]
 ; TAILFOLD-NEXT:    [[IPROM:%.*]] = sext i32 [[I]] to i64
-; TAILFOLD-NEXT:    [[B:%.*]] = getelementptr inbounds i16, i16* [[P:%.*]], i64 [[IPROM]]
-; TAILFOLD-NEXT:    store i16 0, i16* [[B]], align 4
+; TAILFOLD-NEXT:    [[B:%.*]] = getelementptr inbounds i16, ptr [[P:%.*]], i64 [[IPROM]]
+; TAILFOLD-NEXT:    store i16 0, ptr [[B]], align 4
 ; TAILFOLD-NEXT:    [[INC]] = add nsw i32 [[I]], 1
 ; TAILFOLD-NEXT:    switch i32 [[I]], label [[FOR_COND]] [
 ; TAILFOLD-NEXT:    i32 2096, label [[IF_END:%.*]]
@@ -888,8 +878,8 @@ entry:
 for.cond:
   %i = phi i32 [ 0, %entry ], [ %inc, %for.cond ]
   %iprom = sext i32 %i to i64
-  %b = getelementptr inbounds i16, i16* %p, i64 %iprom
-  store i16 0, i16* %b, align 4
+  %b = getelementptr inbounds i16, ptr %p, i64 %iprom
+  store i16 0, ptr %b, align 4
   %inc = add nsw i32 %i, 1
   switch i32 %i, label %for.cond [
   i32 2096, label %if.end
@@ -902,15 +892,15 @@ if.end:
 
 ; multiple exit case but with a switch, as multiple exiting edges from
 ; a single block are a commonly missed edge case
-define i32 @multiple_exit_switch2(i16* %p, i32 %n) {
+define i32 @multiple_exit_switch2(ptr %p, i32 %n) {
 ; CHECK-LABEL: @multiple_exit_switch2(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br label [[FOR_COND:%.*]]
 ; CHECK:       for.cond:
 ; CHECK-NEXT:    [[I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_COND]] ]
 ; CHECK-NEXT:    [[IPROM:%.*]] = sext i32 [[I]] to i64
-; CHECK-NEXT:    [[B:%.*]] = getelementptr inbounds i16, i16* [[P:%.*]], i64 [[IPROM]]
-; CHECK-NEXT:    store i16 0, i16* [[B]], align 4
+; CHECK-NEXT:    [[B:%.*]] = getelementptr inbounds i16, ptr [[P:%.*]], i64 [[IPROM]]
+; CHECK-NEXT:    store i16 0, ptr [[B]], align 4
 ; CHECK-NEXT:    [[INC]] = add nsw i32 [[I]], 1
 ; CHECK-NEXT:    switch i32 [[I]], label [[FOR_COND]] [
 ; CHECK-NEXT:    i32 2096, label [[IF_END:%.*]]
@@ -927,8 +917,8 @@ define i32 @multiple_exit_switch2(i16* %p, i32 %n) {
 ; TAILFOLD:       for.cond:
 ; TAILFOLD-NEXT:    [[I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_COND]] ]
 ; TAILFOLD-NEXT:    [[IPROM:%.*]] = sext i32 [[I]] to i64
-; TAILFOLD-NEXT:    [[B:%.*]] = getelementptr inbounds i16, i16* [[P:%.*]], i64 [[IPROM]]
-; TAILFOLD-NEXT:    store i16 0, i16* [[B]], align 4
+; TAILFOLD-NEXT:    [[B:%.*]] = getelementptr inbounds i16, ptr [[P:%.*]], i64 [[IPROM]]
+; TAILFOLD-NEXT:    store i16 0, ptr [[B]], align 4
 ; TAILFOLD-NEXT:    [[INC]] = add nsw i32 [[I]], 1
 ; TAILFOLD-NEXT:    switch i32 [[I]], label [[FOR_COND]] [
 ; TAILFOLD-NEXT:    i32 2096, label [[IF_END:%.*]]
@@ -945,8 +935,8 @@ entry:
 for.cond:
   %i = phi i32 [ 0, %entry ], [ %inc, %for.cond ]
   %iprom = sext i32 %i to i64
-  %b = getelementptr inbounds i16, i16* %p, i64 %iprom
-  store i16 0, i16* %b, align 4
+  %b = getelementptr inbounds i16, ptr %p, i64 %iprom
+  store i16 0, ptr %b, align 4
   %inc = add nsw i32 %i, 1
   switch i32 %i, label %for.cond [
   i32 2096, label %if.end
@@ -960,7 +950,7 @@ if.end2:
   ret i32 1
 }
 
-define i32 @multiple_latch1(i16* %p) {
+define i32 @multiple_latch1(ptr %p) {
 ; CHECK-LABEL: @multiple_latch1(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
@@ -971,8 +961,8 @@ define i32 @multiple_latch1(i16* %p) {
 ; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY_BACKEDGE]], label [[FOR_SECOND:%.*]]
 ; CHECK:       for.second:
 ; CHECK-NEXT:    [[IPROM:%.*]] = sext i32 [[I_02]] to i64
-; CHECK-NEXT:    [[B:%.*]] = getelementptr inbounds i16, i16* [[P:%.*]], i64 [[IPROM]]
-; CHECK-NEXT:    store i16 0, i16* [[B]], align 4
+; CHECK-NEXT:    [[B:%.*]] = getelementptr inbounds i16, ptr [[P:%.*]], i64 [[IPROM]]
+; CHECK-NEXT:    store i16 0, ptr [[B]], align 4
 ; CHECK-NEXT:    [[CMPS:%.*]] = icmp sgt i32 [[INC]], 16
 ; CHECK-NEXT:    br i1 [[CMPS]], label [[FOR_BODY_BACKEDGE]], label [[FOR_END:%.*]]
 ; CHECK:       for.body.backedge:
@@ -990,8 +980,8 @@ define i32 @multiple_latch1(i16* %p) {
 ; TAILFOLD-NEXT:    br i1 [[CMP]], label [[FOR_BODY_BACKEDGE]], label [[FOR_SECOND:%.*]]
 ; TAILFOLD:       for.second:
 ; TAILFOLD-NEXT:    [[IPROM:%.*]] = sext i32 [[I_02]] to i64
-; TAILFOLD-NEXT:    [[B:%.*]] = getelementptr inbounds i16, i16* [[P:%.*]], i64 [[IPROM]]
-; TAILFOLD-NEXT:    store i16 0, i16* [[B]], align 4
+; TAILFOLD-NEXT:    [[B:%.*]] = getelementptr inbounds i16, ptr [[P:%.*]], i64 [[IPROM]]
+; TAILFOLD-NEXT:    store i16 0, ptr [[B]], align 4
 ; TAILFOLD-NEXT:    [[CMPS:%.*]] = icmp sgt i32 [[INC]], 16
 ; TAILFOLD-NEXT:    br i1 [[CMPS]], label [[FOR_BODY_BACKEDGE]], label [[FOR_END:%.*]]
 ; TAILFOLD:       for.body.backedge:
@@ -1010,8 +1000,8 @@ for.body:
 
 for.second:
   %iprom = sext i32 %i.02 to i64
-  %b = getelementptr inbounds i16, i16* %p, i64 %iprom
-  store i16 0, i16* %b, align 4
+  %b = getelementptr inbounds i16, ptr %p, i64 %iprom
+  store i16 0, ptr %b, align 4
   %cmps = icmp sgt i32 %inc, 16
   br i1 %cmps, label %for.body.backedge, label %for.end
 
@@ -1025,7 +1015,7 @@ for.end:
 
 ; two back branches - loop simplify will convert this to the same form
 ; as the previous case before the vectorizer sees it, but show it anyway.
-define i32 @multiple_latch2(i16* %p) {
+define i32 @multiple_latch2(ptr %p) {
 ; CHECK-LABEL: @multiple_latch2(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
@@ -1038,8 +1028,8 @@ define i32 @multiple_latch2(i16* %p) {
 ; CHECK-NEXT:    br label [[FOR_BODY]]
 ; CHECK:       for.second:
 ; CHECK-NEXT:    [[IPROM:%.*]] = sext i32 [[I_02]] to i64
-; CHECK-NEXT:    [[B:%.*]] = getelementptr inbounds i16, i16* [[P:%.*]], i64 [[IPROM]]
-; CHECK-NEXT:    store i16 0, i16* [[B]], align 4
+; CHECK-NEXT:    [[B:%.*]] = getelementptr inbounds i16, ptr [[P:%.*]], i64 [[IPROM]]
+; CHECK-NEXT:    store i16 0, ptr [[B]], align 4
 ; CHECK-NEXT:    [[CMPS:%.*]] = icmp sgt i32 [[INC]], 16
 ; CHECK-NEXT:    br i1 [[CMPS]], label [[FOR_BODY_BACKEDGE]], label [[FOR_END:%.*]]
 ; CHECK:       for.end:
@@ -1057,8 +1047,8 @@ define i32 @multiple_latch2(i16* %p) {
 ; TAILFOLD-NEXT:    br label [[FOR_BODY]]
 ; TAILFOLD:       for.second:
 ; TAILFOLD-NEXT:    [[IPROM:%.*]] = sext i32 [[I_02]] to i64
-; TAILFOLD-NEXT:    [[B:%.*]] = getelementptr inbounds i16, i16* [[P:%.*]], i64 [[IPROM]]
-; TAILFOLD-NEXT:    store i16 0, i16* [[B]], align 4
+; TAILFOLD-NEXT:    [[B:%.*]] = getelementptr inbounds i16, ptr [[P:%.*]], i64 [[IPROM]]
+; TAILFOLD-NEXT:    store i16 0, ptr [[B]], align 4
 ; TAILFOLD-NEXT:    [[CMPS:%.*]] = icmp sgt i32 [[INC]], 16
 ; TAILFOLD-NEXT:    br i1 [[CMPS]], label [[FOR_BODY_BACKEDGE]], label [[FOR_END:%.*]]
 ; TAILFOLD:       for.end:
@@ -1075,8 +1065,8 @@ for.body:
 
 for.second:
   %iprom = sext i32 %i.02 to i64
-  %b = getelementptr inbounds i16, i16* %p, i64 %iprom
-  store i16 0, i16* %b, align 4
+  %b = getelementptr inbounds i16, ptr %p, i64 %iprom
+  store i16 0, ptr %b, align 4
   %cmps = icmp sgt i32 %inc, 16
   br i1 %cmps, label %for.body, label %for.end
 
@@ -1088,7 +1078,7 @@ for.end:
 ; Check interaction between block predication and early exits.  We need the
 ; condition on the early exit to remain dead (i.e. not be used when forming
 ; the predicate mask).
-define void @scalar_predication(float* %addr) {
+define void @scalar_predication(ptr %addr) {
 ; CHECK-LABEL: @scalar_predication(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
@@ -1097,30 +1087,29 @@ define void @scalar_predication(float* %addr) {
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE2:%.*]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr float, float* [[ADDR:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr float, float* [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float* [[TMP2]] to <2 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x float>, <2 x float>* [[TMP3]], align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = fcmp oeq <2 x float> [[WIDE_LOAD]], zeroinitializer
-; CHECK-NEXT:    [[TMP5:%.*]] = xor <2 x i1> [[TMP4]], <i1 true, i1 true>
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i1> [[TMP5]], i32 0
-; CHECK-NEXT:    br i1 [[TMP6]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr float, ptr [[ADDR:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr float, ptr [[TMP1]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x float>, ptr [[TMP2]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = fcmp oeq <2 x float> [[WIDE_LOAD]], zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = xor <2 x i1> [[TMP3]], <i1 true, i1 true>
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0
+; CHECK-NEXT:    br i1 [[TMP5]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
 ; CHECK:       pred.store.if:
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr float, float* [[ADDR]], i64 [[TMP0]]
-; CHECK-NEXT:    store float 1.000000e+01, float* [[TMP7]], align 4
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr float, ptr [[ADDR]], i64 [[TMP0]]
+; CHECK-NEXT:    store float 1.000000e+01, ptr [[TMP6]], align 4
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE]]
 ; CHECK:       pred.store.continue:
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x i1> [[TMP5]], i32 1
-; CHECK-NEXT:    br i1 [[TMP8]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2]]
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x i1> [[TMP4]], i32 1
+; CHECK-NEXT:    br i1 [[TMP7]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2]]
 ; CHECK:       pred.store.if1:
-; CHECK-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX]], 1
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr float, float* [[ADDR]], i64 [[TMP9]]
-; CHECK-NEXT:    store float 1.000000e+01, float* [[TMP10]], align 4
+; CHECK-NEXT:    [[TMP8:%.*]] = add i64 [[INDEX]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr float, ptr [[ADDR]], i64 [[TMP8]]
+; CHECK-NEXT:    store float 1.000000e+01, ptr [[TMP9]], align 4
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE2]]
 ; CHECK:       pred.store.continue2:
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 200
-; CHECK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 200
+; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -1128,15 +1117,15 @@ define void @scalar_predication(float* %addr) {
 ; CHECK-NEXT:    br label [[LOOP_HEADER:%.*]]
 ; CHECK:       loop.header:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr float, float* [[ADDR]], i64 [[IV]]
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr float, ptr [[ADDR]], i64 [[IV]]
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV]], 200
 ; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT:%.*]], label [[LOOP_BODY:%.*]]
 ; CHECK:       loop.body:
-; CHECK-NEXT:    [[TMP12:%.*]] = load float, float* [[GEP]], align 4
-; CHECK-NEXT:    [[PRED:%.*]] = fcmp oeq float [[TMP12]], 0.000000e+00
+; CHECK-NEXT:    [[TMP11:%.*]] = load float, ptr [[GEP]], align 4
+; CHECK-NEXT:    [[PRED:%.*]] = fcmp oeq float [[TMP11]], 0.000000e+00
 ; CHECK-NEXT:    br i1 [[PRED]], label [[LOOP_LATCH]], label [[THEN:%.*]]
 ; CHECK:       then:
-; CHECK-NEXT:    store float 1.000000e+01, float* [[GEP]], align 4
+; CHECK-NEXT:    store float 1.000000e+01, ptr [[GEP]], align 4
 ; CHECK-NEXT:    br label [[LOOP_LATCH]]
 ; CHECK:       loop.latch:
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
@@ -1149,15 +1138,15 @@ define void @scalar_predication(float* %addr) {
 ; TAILFOLD-NEXT:    br label [[LOOP_HEADER:%.*]]
 ; TAILFOLD:       loop.header:
 ; TAILFOLD-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
-; TAILFOLD-NEXT:    [[GEP:%.*]] = getelementptr float, float* [[ADDR:%.*]], i64 [[IV]]
+; TAILFOLD-NEXT:    [[GEP:%.*]] = getelementptr float, ptr [[ADDR:%.*]], i64 [[IV]]
 ; TAILFOLD-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV]], 200
 ; TAILFOLD-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT:%.*]], label [[LOOP_BODY:%.*]]
 ; TAILFOLD:       loop.body:
-; TAILFOLD-NEXT:    [[TMP0:%.*]] = load float, float* [[GEP]], align 4
+; TAILFOLD-NEXT:    [[TMP0:%.*]] = load float, ptr [[GEP]], align 4
 ; TAILFOLD-NEXT:    [[PRED:%.*]] = fcmp oeq float [[TMP0]], 0.000000e+00
 ; TAILFOLD-NEXT:    br i1 [[PRED]], label [[LOOP_LATCH]], label [[THEN:%.*]]
 ; TAILFOLD:       then:
-; TAILFOLD-NEXT:    store float 1.000000e+01, float* [[GEP]], align 4
+; TAILFOLD-NEXT:    store float 1.000000e+01, ptr [[GEP]], align 4
 ; TAILFOLD-NEXT:    br label [[LOOP_LATCH]]
 ; TAILFOLD:       loop.latch:
 ; TAILFOLD-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
@@ -1170,17 +1159,17 @@ entry:
 
 loop.header:
   %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ]
-  %gep = getelementptr float, float* %addr, i64 %iv
+  %gep = getelementptr float, ptr %addr, i64 %iv
   %exitcond.not = icmp eq i64 %iv, 200
   br i1 %exitcond.not, label %exit, label %loop.body
 
 loop.body:
-  %0 = load float, float* %gep, align 4
+  %0 = load float, ptr %gep, align 4
   %pred = fcmp oeq float %0, 0.0
   br i1 %pred, label %loop.latch, label %then
 
 then:
-  store float 10.0, float* %gep, align 4
+  store float 10.0, ptr %gep, align 4
   br label %loop.latch
 
 loop.latch:
@@ -1191,7 +1180,7 @@ exit:
   ret void
 }
 
-define i32 @me_reduction(i32* %addr) {
+define i32 @me_reduction(ptr %addr) {
 ; CHECK-LABEL: @me_reduction(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
@@ -1199,32 +1188,31 @@ define i32 @me_reduction(i32* %addr) {
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i32, i32* [[ADDR:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i32, i32* [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <2 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i32>, <2 x i32>* [[TMP3]], align 4
-; CHECK-NEXT:    [[TMP4]] = add <2 x i32> [[VEC_PHI]], [[WIDE_LOAD]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i32, ptr [[ADDR:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i32, ptr [[TMP1]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP2]], align 4
+; CHECK-NEXT:    [[TMP3]] = add <2 x i32> [[VEC_PHI]], [[WIDE_LOAD]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 200
-; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 200
+; CHECK-NEXT:    br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[TMP4]])
+; CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[TMP3]])
 ; CHECK-NEXT:    br label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 200, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP5]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[LOOP_HEADER:%.*]]
 ; CHECK:       loop.header:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
 ; CHECK-NEXT:    [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[LOOP_LATCH]] ]
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i32, i32* [[ADDR]], i64 [[IV]]
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i32, ptr [[ADDR]], i64 [[IV]]
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV]], 200
 ; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT:%.*]], label [[LOOP_LATCH]]
 ; CHECK:       loop.latch:
-; CHECK-NEXT:    [[TMP7:%.*]] = load i32, i32* [[GEP]], align 4
-; CHECK-NEXT:    [[ACCUM_NEXT]] = add i32 [[ACCUM]], [[TMP7]]
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr [[GEP]], align 4
+; CHECK-NEXT:    [[ACCUM_NEXT]] = add i32 [[ACCUM]], [[TMP6]]
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT:    [[EXITCOND2_NOT:%.*]] = icmp eq i64 [[IV]], 400
 ; CHECK-NEXT:    br i1 [[EXITCOND2_NOT]], label [[EXIT]], label [[LOOP_HEADER]], !llvm.loop [[LOOP23:![0-9]+]]
@@ -1238,11 +1226,11 @@ define i32 @me_reduction(i32* %addr) {
 ; TAILFOLD:       loop.header:
 ; TAILFOLD-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
 ; TAILFOLD-NEXT:    [[ACCUM:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[ACCUM_NEXT:%.*]], [[LOOP_LATCH]] ]
-; TAILFOLD-NEXT:    [[GEP:%.*]] = getelementptr i32, i32* [[ADDR:%.*]], i64 [[IV]]
+; TAILFOLD-NEXT:    [[GEP:%.*]] = getelementptr i32, ptr [[ADDR:%.*]], i64 [[IV]]
 ; TAILFOLD-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV]], 200
 ; TAILFOLD-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT:%.*]], label [[LOOP_LATCH]]
 ; TAILFOLD:       loop.latch:
-; TAILFOLD-NEXT:    [[TMP0:%.*]] = load i32, i32* [[GEP]], align 4
+; TAILFOLD-NEXT:    [[TMP0:%.*]] = load i32, ptr [[GEP]], align 4
 ; TAILFOLD-NEXT:    [[ACCUM_NEXT]] = add i32 [[ACCUM]], [[TMP0]]
 ; TAILFOLD-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; TAILFOLD-NEXT:    [[EXITCOND2_NOT:%.*]] = icmp eq i64 [[IV]], 400
@@ -1257,12 +1245,12 @@ entry:
 loop.header:
   %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ]
   %accum = phi i32 [0, %entry], [%accum.next, %loop.latch]
-  %gep = getelementptr i32, i32* %addr, i64 %iv
+  %gep = getelementptr i32, ptr %addr, i64 %iv
   %exitcond.not = icmp eq i64 %iv, 200
   br i1 %exitcond.not, label %exit, label %loop.latch
 
 loop.latch:
-  %0 = load i32, i32* %gep, align 4
+  %0 = load i32, ptr %gep, align 4
   %accum.next = add i32 %accum, %0
   %iv.next = add nuw nsw i64 %iv, 1
   %exitcond2.not = icmp eq i64 %iv, 400
@@ -1277,18 +1265,18 @@ exit:
 ; this.  There's an analogous single exit case where we extract the N-1
 ; value of the reduction that we can also handle.  If we fix the latter, the
 ; multiple exit case probably falls out.
-define i32 @me_reduction2(i32* %addr) {
+define i32 @me_reduction2(ptr %addr) {
 ; CHECK-LABEL: @me_reduction2(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br label [[LOOP_HEADER:%.*]]
 ; CHECK:       loop.header:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
 ; CHECK-NEXT:    [[ACCUM:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[ACCUM_NEXT:%.*]], [[LOOP_LATCH]] ]
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i32, i32* [[ADDR:%.*]], i64 [[IV]]
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i32, ptr [[ADDR:%.*]], i64 [[IV]]
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV]], 200
 ; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT:%.*]], label [[LOOP_LATCH]]
 ; CHECK:       loop.latch:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[GEP]], align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[GEP]], align 4
 ; CHECK-NEXT:    [[ACCUM_NEXT]] = add i32 [[ACCUM]], [[TMP0]]
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT:    br label [[LOOP_HEADER]]
@@ -1302,11 +1290,11 @@ define i32 @me_reduction2(i32* %addr) {
 ; TAILFOLD:       loop.header:
 ; TAILFOLD-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
 ; TAILFOLD-NEXT:    [[ACCUM:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[ACCUM_NEXT:%.*]], [[LOOP_LATCH]] ]
-; TAILFOLD-NEXT:    [[GEP:%.*]] = getelementptr i32, i32* [[ADDR:%.*]], i64 [[IV]]
+; TAILFOLD-NEXT:    [[GEP:%.*]] = getelementptr i32, ptr [[ADDR:%.*]], i64 [[IV]]
 ; TAILFOLD-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV]], 200
 ; TAILFOLD-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT:%.*]], label [[LOOP_LATCH]]
 ; TAILFOLD:       loop.latch:
-; TAILFOLD-NEXT:    [[TMP0:%.*]] = load i32, i32* [[GEP]], align 4
+; TAILFOLD-NEXT:    [[TMP0:%.*]] = load i32, ptr [[GEP]], align 4
 ; TAILFOLD-NEXT:    [[ACCUM_NEXT]] = add i32 [[ACCUM]], [[TMP0]]
 ; TAILFOLD-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; TAILFOLD-NEXT:    br label [[LOOP_HEADER]]
@@ -1320,12 +1308,12 @@ entry:
 loop.header:
   %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ]
   %accum = phi i32 [0, %entry], [%accum.next, %loop.latch]
-  %gep = getelementptr i32, i32* %addr, i64 %iv
+  %gep = getelementptr i32, ptr %addr, i64 %iv
   %exitcond.not = icmp eq i64 %iv, 200
   br i1 %exitcond.not, label %exit, label %loop.latch
 
 loop.latch:
-  %0 = load i32, i32* %gep, align 4
+  %0 = load i32, ptr %gep, align 4
   %accum.next = add i32 %accum, %0
   %iv.next = add nuw nsw i64 %iv, 1
   br label %loop.header

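For reference, the shape of the conversion applied throughout the hunks
above: under typed pointers a vector memory access through an element
pointer needed an explicit bitcast to the vector pointer type, while
opaque pointers make that bitcast unnecessary -- which is also why the
regenerated CHECK lines renumber the anonymous [[TMP...]] values. A
minimal before/after sketch (hypothetical function names, not taken from
the patch; a module must be uniformly typed or opaque, so the two forms
are shown together only for illustration):

; Typed-pointer form: the store needs a bitcast to <2 x i16>*.
define void @sketch_typed(i16* %p) {
  %gep = getelementptr inbounds i16, i16* %p, i64 0
  %vp = bitcast i16* %gep to <2 x i16>*
  store <2 x i16> zeroinitializer, <2 x i16>* %vp, align 4
  ret void
}

; Opaque-pointer form: the same store takes the ptr directly.
define void @sketch_opaque(ptr %p) {
  %gep = getelementptr inbounds i16, ptr %p, i64 0
  store <2 x i16> zeroinitializer, ptr %gep, align 4
  ret void
}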
diff --git a/llvm/test/Transforms/LoopVectorize/multiple-strides-vectorization.ll b/llvm/test/Transforms/LoopVectorize/multiple-strides-vectorization.ll
index 41680c933a7f..f5a174c2e01a 100644
--- a/llvm/test/Transforms/LoopVectorize/multiple-strides-vectorization.ll
+++ b/llvm/test/Transforms/LoopVectorize/multiple-strides-vectorization.ll
@@ -27,34 +27,35 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 %struct.s = type { [32 x i32], [32 x i32], [32 x [32 x i32]] }
 
-define void @Test(%struct.s* nocapture %obj, i64 %z) #0 {
+define void @Test(ptr nocapture %obj, i64 %z) #0 {
 ; CHECK-LABEL: @Test(
-; CHECK-NEXT:    [[OBJ4:%.*]] = bitcast %struct.s* [[OBJ:%.*]] to i8*
-; CHECK-NEXT:    [[SCEVGEP5:%.*]] = getelementptr [[STRUCT_S:%.*]], %struct.s* [[OBJ]], i64 0, i32 0, i64 [[Z:%.*]]
-; CHECK-NEXT:    [[SCEVGEP56:%.*]] = bitcast i32* [[SCEVGEP5]] to i8*
+; CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[Z:%.*]], 2
+; CHECK-NEXT:    [[TMP2:%.*]] = add i64 [[TMP1]], 256
+; CHECK-NEXT:    [[UGLYGEP2:%.*]] = getelementptr i8, ptr [[OBJ:%.*]], i64 [[TMP1]]
 ; CHECK-NEXT:    br label [[DOTOUTER_PREHEADER:%.*]]
 ; CHECK:       .outer.preheader:
 ; CHECK-NEXT:    [[I:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[I_NEXT:%.*]], [[DOTOUTER:%.*]] ]
-; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr [[STRUCT_S]], %struct.s* [[OBJ]], i64 0, i32 2, i64 [[I]], i64 0
-; CHECK-NEXT:    [[SCEVGEP1:%.*]] = bitcast i32* [[SCEVGEP]] to i8*
-; CHECK-NEXT:    [[SCEVGEP2:%.*]] = getelementptr [[STRUCT_S]], %struct.s* [[OBJ]], i64 0, i32 2, i64 [[I]], i64 [[Z]]
-; CHECK-NEXT:    [[SCEVGEP23:%.*]] = bitcast i32* [[SCEVGEP2]] to i8*
-; CHECK-NEXT:    [[SCEVGEP7:%.*]] = getelementptr [[STRUCT_S]], %struct.s* [[OBJ]], i64 0, i32 1, i64 [[I]]
-; CHECK-NEXT:    [[SCEVGEP78:%.*]] = bitcast i32* [[SCEVGEP7]] to i8*
-; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[I]], 1
-; CHECK-NEXT:    [[SCEVGEP9:%.*]] = getelementptr [[STRUCT_S]], %struct.s* [[OBJ]], i64 0, i32 1, i64 [[TMP1]]
-; CHECK-NEXT:    [[SCEVGEP910:%.*]] = bitcast i32* [[SCEVGEP9]] to i8*
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.s* [[OBJ]], i64 0, i32 1, i64 [[I]]
+; CHECK-NEXT:    [[TMP3:%.*]] = shl nuw nsw i64 [[I]], 7
+; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[TMP3]], 256
+; CHECK-NEXT:    [[UGLYGEP:%.*]] = getelementptr i8, ptr [[OBJ]], i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP5:%.*]] = add i64 [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    [[UGLYGEP1:%.*]] = getelementptr i8, ptr [[OBJ]], i64 [[TMP5]]
+; CHECK-NEXT:    [[TMP6:%.*]] = shl nuw nsw i64 [[I]], 2
+; CHECK-NEXT:    [[TMP7:%.*]] = add i64 [[TMP6]], 128
+; CHECK-NEXT:    [[UGLYGEP3:%.*]] = getelementptr i8, ptr [[OBJ]], i64 [[TMP7]]
+; CHECK-NEXT:    [[TMP8:%.*]] = add i64 [[TMP6]], 132
+; CHECK-NEXT:    [[UGLYGEP4:%.*]] = getelementptr i8, ptr [[OBJ]], i64 [[TMP8]]
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], ptr [[OBJ]], i64 0, i32 1, i64 [[I]]
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[Z]], 4
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
 ; CHECK:       vector.memcheck:
-; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ult i8* [[SCEVGEP1]], [[SCEVGEP56]]
-; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ult i8* [[OBJ4]], [[SCEVGEP23]]
+; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ult ptr [[UGLYGEP]], [[UGLYGEP2]]
+; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ult ptr [[OBJ]], [[UGLYGEP1]]
 ; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
-; CHECK-NEXT:    [[BOUND011:%.*]] = icmp ult i8* [[SCEVGEP1]], [[SCEVGEP910]]
-; CHECK-NEXT:    [[BOUND112:%.*]] = icmp ult i8* [[SCEVGEP78]], [[SCEVGEP23]]
-; CHECK-NEXT:    [[FOUND_CONFLICT13:%.*]] = and i1 [[BOUND011]], [[BOUND112]]
-; CHECK-NEXT:    [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT13]]
+; CHECK-NEXT:    [[BOUND05:%.*]] = icmp ult ptr [[UGLYGEP]], [[UGLYGEP4]]
+; CHECK-NEXT:    [[BOUND16:%.*]] = icmp ult ptr [[UGLYGEP3]], [[UGLYGEP1]]
+; CHECK-NEXT:    [[FOUND_CONFLICT7:%.*]] = and i1 [[BOUND05]], [[BOUND16]]
+; CHECK-NEXT:    [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT7]]
 ; CHECK-NEXT:    br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[Z]], 4
@@ -62,25 +63,22 @@ define void @Test(%struct.s* nocapture %obj, i64 %z) #0 {
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.s* [[OBJ]], i64 0, i32 0, i64 [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP6]], align 4, !alias.scope !0
-; CHECK-NEXT:    [[TMP7:%.*]] = load i32, i32* [[TMP2]], align 4, !alias.scope !3
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP7]], i32 0
+; CHECK-NEXT:    [[TMP10:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[OBJ]], i64 0, i32 0, i64 [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP12]], align 4, !alias.scope !0
+; CHECK-NEXT:    [[TMP13:%.*]] = load i32, ptr [[TMP9]], align 4, !alias.scope !3
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP13]], i32 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP8:%.*]] = add nsw <4 x i32> [[BROADCAST_SPLAT]], [[WIDE_LOAD]]
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.s* [[OBJ]], i64 0, i32 2, i64 [[I]], i64 [[TMP3]]
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP9]], i32 0
-; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD14:%.*]] = load <4 x i32>, <4 x i32>* [[TMP11]], align 4, !alias.scope !5, !noalias !7
-; CHECK-NEXT:    [[TMP12:%.*]] = add nsw <4 x i32> [[TMP8]], [[WIDE_LOAD14]]
-; CHECK-NEXT:    [[TMP13:%.*]] = bitcast i32* [[TMP10]] to <4 x i32>*
-; CHECK-NEXT:    store <4 x i32> [[TMP12]], <4 x i32>* [[TMP13]], align 4, !alias.scope !5, !noalias !7
+; CHECK-NEXT:    [[TMP14:%.*]] = add nsw <4 x i32> [[BROADCAST_SPLAT]], [[WIDE_LOAD]]
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[OBJ]], i64 0, i32 2, i64 [[I]], i64 [[TMP10]]
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD8:%.*]] = load <4 x i32>, ptr [[TMP16]], align 4, !alias.scope !5, !noalias !7
+; CHECK-NEXT:    [[TMP17:%.*]] = add nsw <4 x i32> [[TMP14]], [[WIDE_LOAD8]]
+; CHECK-NEXT:    store <4 x i32> [[TMP17]], ptr [[TMP16]], align 4, !alias.scope !5, !noalias !7
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[Z]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[DOTOUTER]], label [[SCALAR_PH]]
@@ -95,14 +93,14 @@ define void @Test(%struct.s* nocapture %obj, i64 %z) #0 {
 ; CHECK-NEXT:    br i1 [[EXITCOND_OUTER]], label [[DOTEXIT:%.*]], label [[DOTOUTER_PREHEADER]]
 ; CHECK:       .inner:
 ; CHECK-NEXT:    [[J:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[J_NEXT:%.*]], [[DOTINNER]] ]
-; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.s* [[OBJ]], i64 0, i32 0, i64 [[J]]
-; CHECK-NEXT:    [[TMP16:%.*]] = load i32, i32* [[TMP15]], align 4
-; CHECK-NEXT:    [[TMP17:%.*]] = load i32, i32* [[TMP2]], align 4
-; CHECK-NEXT:    [[TMP18:%.*]] = add nsw i32 [[TMP17]], [[TMP16]]
-; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.s* [[OBJ]], i64 0, i32 2, i64 [[I]], i64 [[J]]
-; CHECK-NEXT:    [[TMP20:%.*]] = load i32, i32* [[TMP19]], align 4
-; CHECK-NEXT:    [[TMP21:%.*]] = add nsw i32 [[TMP18]], [[TMP20]]
-; CHECK-NEXT:    store i32 [[TMP21]], i32* [[TMP19]], align 4
+; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[OBJ]], i64 0, i32 0, i64 [[J]]
+; CHECK-NEXT:    [[TMP20:%.*]] = load i32, ptr [[TMP19]], align 4
+; CHECK-NEXT:    [[TMP21:%.*]] = load i32, ptr [[TMP9]], align 4
+; CHECK-NEXT:    [[TMP22:%.*]] = add nsw i32 [[TMP21]], [[TMP20]]
+; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[OBJ]], i64 0, i32 2, i64 [[I]], i64 [[J]]
+; CHECK-NEXT:    [[TMP24:%.*]] = load i32, ptr [[TMP23]], align 4
+; CHECK-NEXT:    [[TMP25:%.*]] = add nsw i32 [[TMP22]], [[TMP24]]
+; CHECK-NEXT:    store i32 [[TMP25]], ptr [[TMP23]], align 4
 ; CHECK-NEXT:    [[J_NEXT]] = add nuw nsw i64 [[J]], 1
 ; CHECK-NEXT:    [[EXITCOND_INNER:%.*]] = icmp eq i64 [[J_NEXT]], [[Z]]
 ; CHECK-NEXT:    br i1 [[EXITCOND_INNER]], label [[DOTOUTER]], label [[DOTINNER]], !llvm.loop [[LOOP10:![0-9]+]]
@@ -112,7 +110,7 @@ define void @Test(%struct.s* nocapture %obj, i64 %z) #0 {
 
 .outer.preheader:
   %i = phi i64 [ 0, %0 ], [ %i.next, %.outer ]
-  %1 = getelementptr inbounds %struct.s, %struct.s* %obj, i64 0, i32 1, i64 %i
+  %1 = getelementptr inbounds %struct.s, ptr %obj, i64 0, i32 1, i64 %i
   br label %.inner
 
 .exit:
@@ -125,14 +123,14 @@ define void @Test(%struct.s* nocapture %obj, i64 %z) #0 {
 
 .inner:
   %j = phi i64 [ 0, %.outer.preheader ], [ %j.next, %.inner ]
-  %2 = getelementptr inbounds %struct.s, %struct.s* %obj, i64 0, i32 0, i64 %j
-  %3 = load i32, i32* %2
-  %4 = load i32, i32* %1
+  %2 = getelementptr inbounds %struct.s, ptr %obj, i64 0, i32 0, i64 %j
+  %3 = load i32, ptr %2
+  %4 = load i32, ptr %1
   %5 = add nsw i32 %4, %3
-  %6 = getelementptr inbounds %struct.s, %struct.s* %obj, i64 0, i32 2, i64 %i, i64 %j
-  %7 = load i32, i32* %6
+  %6 = getelementptr inbounds %struct.s, ptr %obj, i64 0, i32 2, i64 %i, i64 %j
+  %7 = load i32, ptr %6
   %8 = add nsw i32 %5, %7
-  store i32 %8, i32* %6
+  store i32 %8, ptr %6
   %j.next = add nuw nsw i64 %j, 1
   %exitcond.inner = icmp eq i64 %j.next, %z
   br i1 %exitcond.inner, label %.outer, label %.inner

diff --git a/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization-liveout.ll b/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization-liveout.ll
index 708cf790f8c8..4a8fda99ef48 100644
--- a/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization-liveout.ll
+++ b/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization-liveout.ll
@@ -10,7 +10,7 @@
 target datalayout = "e-m:e-i64:64-n32:64"
 
 ; Some limited forms of live-outs (non-reduction, non-recurrences) are supported.
-define signext i32 @f1(i32* noalias %A, i32* noalias %B, i32 signext %n) {
+define signext i32 @f1(ptr noalias %A, ptr noalias %B, i32 signext %n) {
 ; VF-TWO-CHECK-LABEL: @f1(
 ; VF-TWO-CHECK-NEXT:  entry:
 ; VF-TWO-CHECK-NEXT:    [[CMP1:%.*]] = icmp sgt i32 [[N:%.*]], 0
@@ -29,20 +29,18 @@ define signext i32 @f1(i32* noalias %A, i32* noalias %B, i32 signext %n) {
 ; VF-TWO-CHECK:       vector.body:
 ; VF-TWO-CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; VF-TWO-CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; VF-TWO-CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP0]]
-; VF-TWO-CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0
-; VF-TWO-CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <2 x i32>*
-; VF-TWO-CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i32>, <2 x i32>* [[TMP3]], align 4
-; VF-TWO-CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[TMP0]]
-; VF-TWO-CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0
-; VF-TWO-CHECK-NEXT:    [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <2 x i32>*
-; VF-TWO-CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP6]], align 4
-; VF-TWO-CHECK-NEXT:    [[TMP7:%.*]] = add nsw <2 x i32> [[WIDE_LOAD]], [[WIDE_LOAD2]]
+; VF-TWO-CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP0]]
+; VF-TWO-CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
+; VF-TWO-CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP2]], align 4
+; VF-TWO-CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP0]]
+; VF-TWO-CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 0
+; VF-TWO-CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <2 x i32>, ptr [[TMP4]], align 4
+; VF-TWO-CHECK-NEXT:    [[TMP5:%.*]] = add nsw <2 x i32> [[WIDE_LOAD]], [[WIDE_LOAD2]]
 ; VF-TWO-CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; VF-TWO-CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; VF-TWO-CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; VF-TWO-CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; VF-TWO-CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; VF-TWO-CHECK:       middle.block:
-; VF-TWO-CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x i32> [[TMP7]], i32 1
+; VF-TWO-CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x i32> [[TMP5]], i32 1
 ; VF-TWO-CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
 ; VF-TWO-CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
 ; VF-TWO-CHECK:       vec.epilog.iter.check:
@@ -55,22 +53,20 @@ define signext i32 @f1(i32* noalias %A, i32* noalias %B, i32 signext %n) {
 ; VF-TWO-CHECK-NEXT:    [[N_VEC4:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF3]]
 ; VF-TWO-CHECK-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
 ; VF-TWO-CHECK:       vec.epilog.vector.body:
-; VF-TWO-CHECK-NEXT:    [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT9:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; VF-TWO-CHECK-NEXT:    [[TMP10:%.*]] = add i64 [[OFFSET_IDX]], 0
-; VF-TWO-CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP10]]
-; VF-TWO-CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP11]], i32 0
-; VF-TWO-CHECK-NEXT:    [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <2 x i32>*
-; VF-TWO-CHECK-NEXT:    [[WIDE_LOAD7:%.*]] = load <2 x i32>, <2 x i32>* [[TMP13]], align 4
-; VF-TWO-CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP10]]
-; VF-TWO-CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TMP14]], i32 0
-; VF-TWO-CHECK-NEXT:    [[TMP16:%.*]] = bitcast i32* [[TMP15]] to <2 x i32>*
-; VF-TWO-CHECK-NEXT:    [[WIDE_LOAD8:%.*]] = load <2 x i32>, <2 x i32>* [[TMP16]], align 4
-; VF-TWO-CHECK-NEXT:    [[TMP17:%.*]] = add nsw <2 x i32> [[WIDE_LOAD7]], [[WIDE_LOAD8]]
-; VF-TWO-CHECK-NEXT:    [[INDEX_NEXT9]] = add nuw i64 [[OFFSET_IDX]], 2
-; VF-TWO-CHECK-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT9]], [[N_VEC4]]
-; VF-TWO-CHECK-NEXT:    br i1 [[TMP18]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
+; VF-TWO-CHECK-NEXT:    [[INDEX6:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT9:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; VF-TWO-CHECK-NEXT:    [[TMP8:%.*]] = add i64 [[INDEX6]], 0
+; VF-TWO-CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP8]]
+; VF-TWO-CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 0
+; VF-TWO-CHECK-NEXT:    [[WIDE_LOAD7:%.*]] = load <2 x i32>, ptr [[TMP10]], align 4
+; VF-TWO-CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP8]]
+; VF-TWO-CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i32 0
+; VF-TWO-CHECK-NEXT:    [[WIDE_LOAD8:%.*]] = load <2 x i32>, ptr [[TMP12]], align 4
+; VF-TWO-CHECK-NEXT:    [[TMP13:%.*]] = add nsw <2 x i32> [[WIDE_LOAD7]], [[WIDE_LOAD8]]
+; VF-TWO-CHECK-NEXT:    [[INDEX_NEXT9]] = add nuw i64 [[INDEX6]], 2
+; VF-TWO-CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT9]], [[N_VEC4]]
+; VF-TWO-CHECK-NEXT:    br i1 [[TMP14]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
 ; VF-TWO-CHECK:       vec.epilog.middle.block:
-; VF-TWO-CHECK-NEXT:    [[TMP19:%.*]] = extractelement <2 x i32> [[TMP17]], i32 1
+; VF-TWO-CHECK-NEXT:    [[TMP15:%.*]] = extractelement <2 x i32> [[TMP13]], i32 1
 ; VF-TWO-CHECK-NEXT:    [[CMP_N5:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC4]]
 ; VF-TWO-CHECK-NEXT:    br i1 [[CMP_N5]], label [[FOR_END_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]]
 ; VF-TWO-CHECK:       vec.epilog.scalar.ph:
@@ -78,16 +74,16 @@ define signext i32 @f1(i32* noalias %A, i32* noalias %B, i32 signext %n) {
 ; VF-TWO-CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; VF-TWO-CHECK:       for.body:
 ; VF-TWO-CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; VF-TWO-CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
-; VF-TWO-CHECK-NEXT:    [[TMP20:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; VF-TWO-CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]]
-; VF-TWO-CHECK-NEXT:    [[TMP21:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4
-; VF-TWO-CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP20]], [[TMP21]]
+; VF-TWO-CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]]
+; VF-TWO-CHECK-NEXT:    [[TMP16:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; VF-TWO-CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]]
+; VF-TWO-CHECK-NEXT:    [[TMP17:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; VF-TWO-CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP16]], [[TMP17]]
 ; VF-TWO-CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; VF-TWO-CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
 ; VF-TWO-CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT]], !llvm.loop [[LOOP4:![0-9]+]]
 ; VF-TWO-CHECK:       for.end.loopexit:
-; VF-TWO-CHECK-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ], [ [[TMP19]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
+; VF-TWO-CHECK-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ], [ [[TMP15]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
 ; VF-TWO-CHECK-NEXT:    br label [[FOR_END]]
 ; VF-TWO-CHECK:       for.end:
 ; VF-TWO-CHECK-NEXT:    [[RES_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_END_LOOPEXIT]] ]
@@ -103,10 +99,10 @@ for.body.preheader:                               ; preds = %entry
 
 for.body:                                         ; preds = %for.body.preheader, %for.body
   %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
-  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
-  %0 = load i32, i32* %arrayidx, align 4
-  %arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
-  %1 = load i32, i32* %arrayidx2, align 4
+  %arrayidx = getelementptr inbounds i32, ptr %A, i64 %indvars.iv
+  %0 = load i32, ptr %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, ptr %B, i64 %indvars.iv
+  %1 = load i32, ptr %arrayidx2, align 4
   %add = add nsw i32 %0, %1
   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
   %exitcond = icmp ne i64 %indvars.iv.next, %wide.trip.count

diff --git a/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization.ll b/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization.ll
index 3f965d7eb09e..93cf55b3cb00 100644
--- a/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization.ll
+++ b/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization.ll
@@ -12,7 +12,7 @@
 
 target datalayout = "e-m:e-i64:64-n32:64-v128:128:128"
 
-define dso_local void @f1(float* noalias %aa, float* noalias %bb, float* noalias %cc, i32 signext %N) {
+define dso_local void @f1(ptr noalias %aa, ptr noalias %bb, ptr noalias %cc, i32 signext %N) {
 ; CHECK-LABEL: @f1(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[CMP1:%.*]] = icmp sgt i32 [[N:%.*]], 0
@@ -31,22 +31,19 @@ define dso_local void @f1(float* noalias %aa, float* noalias %bb, float* noalias
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds float, float* [[BB:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, float* [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float* [[TMP2]] to <4 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP3]], align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[CC:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast float* [[TMP5]] to <4 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x float>, <4 x float>* [[TMP6]], align 4
-; CHECK-NEXT:    [[TMP7:%.*]] = fadd fast <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD2]]
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, float* [[AA:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds float, float* [[TMP8]], i32 0
-; CHECK-NEXT:    [[TMP10:%.*]] = bitcast float* [[TMP9]] to <4 x float>*
-; CHECK-NEXT:    store <4 x float> [[TMP7]], <4 x float>* [[TMP10]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds float, ptr [[BB:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP2]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, ptr [[CC:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP3]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x float>, ptr [[TMP4]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = fadd fast <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD2]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, ptr [[AA:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i32 0
+; CHECK-NEXT:    store <4 x float> [[TMP5]], ptr [[TMP7]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
@@ -60,24 +57,21 @@ define dso_local void @f1(float* noalias %aa, float* noalias %bb, float* noalias
 ; CHECK-NEXT:    [[N_VEC4:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF3]]
 ; CHECK-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
 ; CHECK:       vec.epilog.vector.body:
-; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT9:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP12:%.*]] = add i64 [[OFFSET_IDX]], 0
-; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds float, float* [[BB]], i64 [[TMP12]]
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP13]], i32 0
-; CHECK-NEXT:    [[TMP15:%.*]] = bitcast float* [[TMP14]] to <4 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD7:%.*]] = load <4 x float>, <4 x float>* [[TMP15]], align 4
-; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds float, float* [[CC]], i64 [[TMP12]]
-; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds float, float* [[TMP16]], i32 0
-; CHECK-NEXT:    [[TMP18:%.*]] = bitcast float* [[TMP17]] to <4 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD8:%.*]] = load <4 x float>, <4 x float>* [[TMP18]], align 4
-; CHECK-NEXT:    [[TMP19:%.*]] = fadd fast <4 x float> [[WIDE_LOAD7]], [[WIDE_LOAD8]]
-; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds float, float* [[AA]], i64 [[TMP12]]
-; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds float, float* [[TMP20]], i32 0
-; CHECK-NEXT:    [[TMP22:%.*]] = bitcast float* [[TMP21]] to <4 x float>*
-; CHECK-NEXT:    store <4 x float> [[TMP19]], <4 x float>* [[TMP22]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT9]] = add nuw i64 [[OFFSET_IDX]], 4
-; CHECK-NEXT:    [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT9]], [[N_VEC4]]
-; CHECK-NEXT:    br i1 [[TMP23]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK-NEXT:    [[INDEX6:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT9:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX6]], 0
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, ptr [[BB]], i64 [[TMP9]]
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds float, ptr [[TMP10]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD7:%.*]] = load <4 x float>, ptr [[TMP11]], align 4
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds float, ptr [[CC]], i64 [[TMP9]]
+; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD8:%.*]] = load <4 x float>, ptr [[TMP13]], align 4
+; CHECK-NEXT:    [[TMP14:%.*]] = fadd fast <4 x float> [[WIDE_LOAD7]], [[WIDE_LOAD8]]
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds float, ptr [[AA]], i64 [[TMP9]]
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds float, ptr [[TMP15]], i32 0
+; CHECK-NEXT:    store <4 x float> [[TMP14]], ptr [[TMP16]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT9]] = add nuw i64 [[INDEX6]], 4
+; CHECK-NEXT:    [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT9]], [[N_VEC4]]
+; CHECK-NEXT:    br i1 [[TMP17]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
 ; CHECK:       vec.epilog.middle.block:
 ; CHECK-NEXT:    [[CMP_N5:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC4]]
 ; CHECK-NEXT:    br i1 [[CMP_N5]], label [[FOR_END_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]]
@@ -86,13 +80,13 @@ define dso_local void @f1(float* noalias %aa, float* noalias %bb, float* noalias
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[BB]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP24:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[CC]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP25:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-NEXT:    [[ADD:%.*]] = fadd fast float [[TMP24]], [[TMP25]]
-; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[AA]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    store float [[ADD]], float* [[ARRAYIDX4]], align 4
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[BB]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP18:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[CC]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP19:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
+; CHECK-NEXT:    [[ADD:%.*]] = fadd fast float [[TMP18]], [[TMP19]]
+; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[AA]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    store float [[ADD]], ptr [[ARRAYIDX4]], align 4
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT]], !llvm.loop [[LOOP4:![0-9]+]]
@@ -110,13 +104,13 @@ define dso_local void @f1(float* noalias %aa, float* noalias %bb, float* noalias
 ; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK-PROFITABLE-BY-DEFAULT:       for.body:
 ; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[BB:%.*]], i64 [[INDVARS_IV]]
-; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[CC:%.*]], i64 [[INDVARS_IV]]
-; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[TMP1:%.*]] = load float, float* [[ARRAYIDX2]], align 4
+; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[BB:%.*]], i64 [[INDVARS_IV]]
+; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[CC:%.*]], i64 [[INDVARS_IV]]
+; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[TMP1:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
 ; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[ADD:%.*]] = fadd fast float [[TMP0]], [[TMP1]]
-; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[AA:%.*]], i64 [[INDVARS_IV]]
-; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    store float [[ADD]], float* [[ARRAYIDX4]], align 4
+; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[AA:%.*]], i64 [[INDVARS_IV]]
+; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    store float [[ADD]], ptr [[ARRAYIDX4]], align 4
 ; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
 ; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT:%.*]]
@@ -135,13 +129,13 @@ for.body.preheader:                               ; preds = %entry
 
 for.body:                                         ; preds = %for.body.preheader, %for.body
   %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
-  %arrayidx = getelementptr inbounds float, float* %bb, i64 %indvars.iv
-  %0 = load float, float* %arrayidx, align 4
-  %arrayidx2 = getelementptr inbounds float, float* %cc, i64 %indvars.iv
-  %1 = load float, float* %arrayidx2, align 4
+  %arrayidx = getelementptr inbounds float, ptr %bb, i64 %indvars.iv
+  %0 = load float, ptr %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds float, ptr %cc, i64 %indvars.iv
+  %1 = load float, ptr %arrayidx2, align 4
   %add = fadd fast float %0, %1
-  %arrayidx4 = getelementptr inbounds float, float* %aa, i64 %indvars.iv
-  store float %add, float* %arrayidx4, align 4
+  %arrayidx4 = getelementptr inbounds float, ptr %aa, i64 %indvars.iv
+  store float %add, ptr %arrayidx4, align 4
   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
   %exitcond = icmp ne i64 %indvars.iv.next, %wide.trip.count
   br i1 %exitcond, label %for.body, label %for.end.loopexit
@@ -153,7 +147,7 @@ for.end:                                          ; preds = %for.end.loopexit, %
   ret void
 }
 
-define dso_local signext i32 @f2(float* noalias %A, float* noalias %B, i32 signext %n) {
+define dso_local signext i32 @f2(ptr noalias %A, ptr noalias %B, i32 signext %n) {
 ; CHECK-LABEL: @f2(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[CMP1:%.*]] = icmp sgt i32 [[N:%.*]], 1
@@ -190,25 +184,23 @@ define dso_local signext i32 @f2(float* noalias %A, float* noalias %B, i32 signe
 ; CHECK-NEXT:    [[TMP10:%.*]] = xor i32 [[TMP8]], -1
 ; CHECK-NEXT:    [[TMP11:%.*]] = add i32 [[TMP10]], [[N]]
 ; CHECK-NEXT:    [[TMP12:%.*]] = sext i32 [[TMP11]] to i64
-; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP12]]
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP13]], i32 0
-; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds float, float* [[TMP14]], i32 -3
-; CHECK-NEXT:    [[TMP16:%.*]] = bitcast float* [[TMP15]] to <4 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP16]], align 4
+; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 [[TMP12]]
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds float, ptr [[TMP13]], i32 0
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i32 -3
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP15]], align 4
 ; CHECK-NEXT:    [[REVERSE:%.*]] = shufflevector <4 x float> [[WIDE_LOAD]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:    [[TMP17:%.*]] = fadd fast <4 x float> [[REVERSE]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
-; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP9]]
-; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds float, float* [[TMP18]], i32 0
-; CHECK-NEXT:    [[TMP20:%.*]] = bitcast float* [[TMP19]] to <4 x float>*
-; CHECK-NEXT:    store <4 x float> [[TMP17]], <4 x float>* [[TMP20]], align 4
+; CHECK-NEXT:    [[TMP16:%.*]] = fadd fast <4 x float> [[REVERSE]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[TMP9]]
+; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i32 0
+; CHECK-NEXT:    store <4 x float> [[TMP16]], ptr [[TMP18]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT:    [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
 ; CHECK:       vec.epilog.iter.check:
-; CHECK-NEXT:    [[IND_END5:%.*]] = trunc i64 [[N_VEC]] to i32
+; CHECK-NEXT:    [[IND_END4:%.*]] = trunc i64 [[N_VEC]] to i32
 ; CHECK-NEXT:    [[N_VEC_REMAINING:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
 ; CHECK-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4
 ; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
@@ -219,45 +211,43 @@ define dso_local signext i32 @f2(float* noalias %A, float* noalias %B, i32 signe
 ; CHECK-NEXT:    [[IND_END:%.*]] = trunc i64 [[N_VEC3]] to i32
 ; CHECK-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
 ; CHECK:       vec.epilog.vector.body:
-; CHECK-NEXT:    [[OFFSET_IDX9:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT12:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT:    [[OFFSET_IDX8:%.*]] = trunc i64 [[OFFSET_IDX9]] to i32
-; CHECK-NEXT:    [[TMP22:%.*]] = add i32 [[OFFSET_IDX8]], 0
-; CHECK-NEXT:    [[TMP23:%.*]] = add i64 [[OFFSET_IDX9]], 0
-; CHECK-NEXT:    [[TMP24:%.*]] = xor i32 [[TMP22]], -1
-; CHECK-NEXT:    [[TMP25:%.*]] = add i32 [[TMP24]], [[N]]
-; CHECK-NEXT:    [[TMP26:%.*]] = sext i32 [[TMP25]] to i64
-; CHECK-NEXT:    [[TMP27:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP26]]
-; CHECK-NEXT:    [[TMP28:%.*]] = getelementptr inbounds float, float* [[TMP27]], i32 0
-; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr inbounds float, float* [[TMP28]], i32 -3
-; CHECK-NEXT:    [[TMP30:%.*]] = bitcast float* [[TMP29]] to <4 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD10:%.*]] = load <4 x float>, <4 x float>* [[TMP30]], align 4
-; CHECK-NEXT:    [[REVERSE11:%.*]] = shufflevector <4 x float> [[WIDE_LOAD10]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:    [[TMP31:%.*]] = fadd fast <4 x float> [[REVERSE11]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
-; CHECK-NEXT:    [[TMP32:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP23]]
-; CHECK-NEXT:    [[TMP33:%.*]] = getelementptr inbounds float, float* [[TMP32]], i32 0
-; CHECK-NEXT:    [[TMP34:%.*]] = bitcast float* [[TMP33]] to <4 x float>*
-; CHECK-NEXT:    store <4 x float> [[TMP31]], <4 x float>* [[TMP34]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT12]] = add nuw i64 [[OFFSET_IDX9]], 4
-; CHECK-NEXT:    [[TMP35:%.*]] = icmp eq i64 [[INDEX_NEXT12]], [[N_VEC3]]
-; CHECK-NEXT:    br i1 [[TMP35]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT:    [[INDEX7:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT11:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT:    [[OFFSET_IDX8:%.*]] = trunc i64 [[INDEX7]] to i32
+; CHECK-NEXT:    [[TMP20:%.*]] = add i32 [[OFFSET_IDX8]], 0
+; CHECK-NEXT:    [[TMP21:%.*]] = add i64 [[INDEX7]], 0
+; CHECK-NEXT:    [[TMP22:%.*]] = xor i32 [[TMP20]], -1
+; CHECK-NEXT:    [[TMP23:%.*]] = add i32 [[TMP22]], [[N]]
+; CHECK-NEXT:    [[TMP24:%.*]] = sext i32 [[TMP23]] to i64
+; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP24]]
+; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr inbounds float, ptr [[TMP25]], i32 0
+; CHECK-NEXT:    [[TMP27:%.*]] = getelementptr inbounds float, ptr [[TMP26]], i32 -3
+; CHECK-NEXT:    [[WIDE_LOAD9:%.*]] = load <4 x float>, ptr [[TMP27]], align 4
+; CHECK-NEXT:    [[REVERSE10:%.*]] = shufflevector <4 x float> [[WIDE_LOAD9]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[TMP28:%.*]] = fadd fast <4 x float> [[REVERSE10]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP21]]
+; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr inbounds float, ptr [[TMP29]], i32 0
+; CHECK-NEXT:    store <4 x float> [[TMP28]], ptr [[TMP30]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT11]] = add nuw i64 [[INDEX7]], 4
+; CHECK-NEXT:    [[TMP31:%.*]] = icmp eq i64 [[INDEX_NEXT11]], [[N_VEC3]]
+; CHECK-NEXT:    br i1 [[TMP31]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK:       vec.epilog.middle.block:
 ; CHECK-NEXT:    [[CMP_N6:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC3]]
 ; CHECK-NEXT:    br i1 [[CMP_N6]], label [[FOR_END_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]]
 ; CHECK:       vec.epilog.scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC3]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ITER_CHECK]] ]
-; CHECK-NEXT:    [[BC_RESUME_VAL4:%.*]] = phi i32 [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END5]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ITER_CHECK]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL5:%.*]] = phi i32 [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END4]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ITER_CHECK]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[I_014:%.*]] = phi i32 [ [[BC_RESUME_VAL4]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP36:%.*]] = xor i32 [[I_014]], -1
-; CHECK-NEXT:    [[SUB2:%.*]] = add i32 [[TMP36]], [[N]]
+; CHECK-NEXT:    [[I_014:%.*]] = phi i32 [ [[BC_RESUME_VAL5]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[TMP32:%.*]] = xor i32 [[I_014]], -1
+; CHECK-NEXT:    [[SUB2:%.*]] = add i32 [[TMP32]], [[N]]
 ; CHECK-NEXT:    [[IDXPROM:%.*]] = sext i32 [[SUB2]] to i64
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[IDXPROM]]
-; CHECK-NEXT:    [[TMP37:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[CONV3:%.*]] = fadd fast float [[TMP37]], 1.000000e+00
-; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    store float [[CONV3]], float* [[ARRAYIDX5]], align 4
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IDXPROM]]
+; CHECK-NEXT:    [[TMP33:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[CONV3:%.*]] = fadd fast float [[TMP33]], 1.000000e+00
+; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    store float [[CONV3]], ptr [[ARRAYIDX5]], align 4
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[I_014]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
@@ -281,11 +271,11 @@ define dso_local signext i32 @f2(float* noalias %A, float* noalias %B, i32 signe
 ; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[TMP1:%.*]] = xor i32 [[I_014]], -1
 ; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[SUB2:%.*]] = add i32 [[TMP1]], [[N]]
 ; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[IDXPROM:%.*]] = sext i32 [[SUB2]] to i64
-; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[IDXPROM]]
-; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[TMP2:%.*]] = load float, float* [[ARRAYIDX]], align 4
+; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 [[IDXPROM]]
+; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[TMP2:%.*]] = load float, ptr [[ARRAYIDX]], align 4
 ; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[CONV3:%.*]] = fadd fast float [[TMP2]], 1.000000e+00
-; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[INDVARS_IV]]
-; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    store float [[CONV3]], float* [[ARRAYIDX5]], align 4
+; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDVARS_IV]]
+; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    store float [[CONV3]], ptr [[ARRAYIDX5]], align 4
 ; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[INC]] = add nuw nsw i32 [[I_014]], 1
 ; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
@@ -310,11 +300,11 @@ for.body:                                         ; preds = %for.body.preheader,
   %1 = xor i32 %i.014, -1
   %sub2 = add i32 %1, %n
   %idxprom = sext i32 %sub2 to i64
-  %arrayidx = getelementptr inbounds float, float* %B, i64 %idxprom
-  %2 = load float, float* %arrayidx, align 4
+  %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom
+  %2 = load float, ptr %arrayidx, align 4
   %conv3 = fadd fast float %2, 1.000000e+00
-  %arrayidx5 = getelementptr inbounds float, float* %A, i64 %indvars.iv
-  store float %conv3, float* %arrayidx5, align 4
+  %arrayidx5 = getelementptr inbounds float, ptr %A, i64 %indvars.iv
+  store float %conv3, ptr %arrayidx5, align 4
   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
   %inc = add nuw nsw i32 %i.014, 1
   %exitcond = icmp ne i64 %indvars.iv.next, %wide.trip.count
@@ -327,7 +317,7 @@ for.end:                                          ; preds = %for.end.loopexit, %
   ret i32 0
 }
 
-define void @f3(i8* noalias %A, i64 %n) {
+define void @f3(ptr noalias %A, i64 %n) {
 ; CHECK-LABEL: @f3(
 ; CHECK-NEXT:  iter.check:
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4
@@ -342,13 +332,12 @@ define void @f3(i8* noalias %A, i64 %n) {
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, i8* [[A:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <4 x i8>*
-; CHECK-NEXT:    store <4 x i8> <i8 1, i8 1, i8 1, i8 1>, <4 x i8>* [[TMP3]], align 1
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
+; CHECK-NEXT:    store <4 x i8> <i8 1, i8 1, i8 1, i8 1>, ptr [[TMP2]], align 1
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
@@ -362,15 +351,14 @@ define void @f3(i8* noalias %A, i64 %n) {
 ; CHECK-NEXT:    [[N_VEC3:%.*]] = sub i64 [[N]], [[N_MOD_VF2]]
 ; CHECK-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
 ; CHECK:       vec.epilog.vector.body:
-; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT6:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 0
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[A]], i64 [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[TMP6]], i32 0
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i8* [[TMP7]] to <4 x i8>*
-; CHECK-NEXT:    store <4 x i8> <i8 1, i8 1, i8 1, i8 1>, <4 x i8>* [[TMP8]], align 1
-; CHECK-NEXT:    [[INDEX_NEXT6]] = add nuw i64 [[OFFSET_IDX]], 4
-; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT6]], [[N_VEC3]]
-; CHECK-NEXT:    br i1 [[TMP9]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK-NEXT:    [[INDEX5:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT6:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX5]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 0
+; CHECK-NEXT:    store <4 x i8> <i8 1, i8 1, i8 1, i8 1>, ptr [[TMP6]], align 1
+; CHECK-NEXT:    [[INDEX_NEXT6]] = add nuw i64 [[INDEX5]], 4
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT6]], [[N_VEC3]]
+; CHECK-NEXT:    br i1 [[TMP7]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
 ; CHECK:       vec.epilog.middle.block:
 ; CHECK-NEXT:    [[CMP_N4:%.*]] = icmp eq i64 [[N]], [[N_VEC3]]
 ; CHECK-NEXT:    br i1 [[CMP_N4]], label [[FOR_END_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]]
@@ -379,8 +367,8 @@ define void @f3(i8* noalias %A, i64 %n) {
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[A]], i64 [[IV]]
-; CHECK-NEXT:    store i8 1, i8* [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV]]
+; CHECK-NEXT:    store i8 1, ptr [[ARRAYIDX]], align 1
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[IV_NEXT]], [[N]]
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT]], !llvm.loop [[LOOP10:![0-9]+]]
@@ -403,13 +391,12 @@ define void @f3(i8* noalias %A, i64 %n) {
 ; CHECK-PROFITABLE-BY-DEFAULT:       vector.body:
 ; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, i8* [[A:%.*]], i64 [[TMP0]]
-; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i32 0
-; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <4 x i8>*
-; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    store <4 x i8> <i8 1, i8 1, i8 1, i8 1>, <4 x i8>* [[TMP3]], align 1
+; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[TMP0]]
+; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
+; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    store <4 x i8> <i8 1, i8 1, i8 1, i8 1>, ptr [[TMP2]], align 1
 ; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK-PROFITABLE-BY-DEFAULT:       middle.block:
 ; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
@@ -423,15 +410,14 @@ define void @f3(i8* noalias %A, i64 %n) {
 ; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[N_VEC3:%.*]] = sub i64 [[N]], [[N_MOD_VF2]]
 ; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
 ; CHECK-PROFITABLE-BY-DEFAULT:       vec.epilog.vector.body:
-; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT6:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 0
-; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[A]], i64 [[TMP5]]
-; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[TMP6]], i32 0
-; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[TMP8:%.*]] = bitcast i8* [[TMP7]] to <2 x i8>*
-; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    store <2 x i8> <i8 1, i8 1>, <2 x i8>* [[TMP8]], align 1
-; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[INDEX_NEXT6]] = add nuw i64 [[OFFSET_IDX]], 2
-; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT6]], [[N_VEC3]]
-; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    br i1 [[TMP9]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[INDEX5:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT6:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX5]], 0
+; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP4]]
+; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 0
+; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    store <2 x i8> <i8 1, i8 1>, ptr [[TMP6]], align 1
+; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[INDEX_NEXT6]] = add nuw i64 [[INDEX5]], 2
+; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT6]], [[N_VEC3]]
+; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    br i1 [[TMP7]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
 ; CHECK-PROFITABLE-BY-DEFAULT:       vec.epilog.middle.block:
 ; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[CMP_N4:%.*]] = icmp eq i64 [[N]], [[N_VEC3]]
 ; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    br i1 [[CMP_N4]], label [[FOR_END_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]]
@@ -440,8 +426,8 @@ define void @f3(i8* noalias %A, i64 %n) {
 ; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK-PROFITABLE-BY-DEFAULT:       for.body:
 ; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[A]], i64 [[IV]]
-; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    store i8 1, i8* [[ARRAYIDX]], align 1
+; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV]]
+; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    store i8 1, ptr [[ARRAYIDX]], align 1
 ; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[IV_NEXT]], [[N]]
 ; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT]], !llvm.loop [[LOOP4:![0-9]+]]
@@ -455,8 +441,8 @@ entry:
 
 for.body:                                         ; preds = %for.body.preheader, %for.body
   %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
-  %arrayidx = getelementptr inbounds i8, i8* %A, i64 %iv
-  store i8 1, i8* %arrayidx, align 1
+  %arrayidx = getelementptr inbounds i8, ptr %A, i64 %iv
+  store i8 1, ptr %arrayidx, align 1
   %iv.next = add nuw nsw i64 %iv, 1
   %exitcond = icmp ne i64 %iv.next, %n
   br i1 %exitcond, label %for.body, label %for.end.loopexit
@@ -469,7 +455,7 @@ for.end:                                          ; preds = %for.end.loopexit, %
 }
 
 ; Test case for PR54745.
-define void @induction_resume_value_requires_non_trivial_scev_expansion(i8* %dst) {
+define void @induction_resume_value_requires_non_trivial_scev_expansion(ptr %dst) {
 ; CHECK-LABEL: @induction_resume_value_requires_non_trivial_scev_expansion(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br label [[OUTER_HEADER:%.*]]
@@ -499,14 +485,13 @@ define void @induction_resume_value_requires_non_trivial_scev_expansion(i8* %dst
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i8> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = add i64 1, [[INDEX]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[OFFSET_IDX]], 0
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[DST:%.*]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[TMP5]], i32 0
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i8* [[TMP6]] to <4 x i8>*
-; CHECK-NEXT:    store <4 x i8> [[VEC_IND]], <4 x i8>* [[TMP7]], align 1
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 0
+; CHECK-NEXT:    store <4 x i8> [[VEC_IND]], ptr [[TMP6]], align 1
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i8> [[VEC_IND]], [[DOTSPLAT3]]
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 84
-; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 84
+; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 84, 84
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[OUTER_LATCH]], label [[SCALAR_PH]]
@@ -518,8 +503,8 @@ define void @induction_resume_value_requires_non_trivial_scev_expansion(i8* %dst
 ; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[INNER]] ]
 ; CHECK-NEXT:    [[IV_2:%.*]] = phi i8 [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[IV_2_NEXT:%.*]], [[INNER]] ]
 ; CHECK-NEXT:    [[IV_2_NEXT]] = sub i8 [[IV_2]], [[TRUNC_ADD]]
-; CHECK-NEXT:    [[GEP_DST:%.*]] = getelementptr inbounds i8, i8* [[DST]], i64 [[IV]]
-; CHECK-NEXT:    store i8 [[IV_2]], i8* [[GEP_DST]], align 1
+; CHECK-NEXT:    [[GEP_DST:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[IV]]
+; CHECK-NEXT:    store i8 [[IV_2]], ptr [[GEP_DST]], align 1
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT:    [[EC:%.*]] = icmp ugt i64 [[IV]], 83
 ; CHECK-NEXT:    br i1 [[EC]], label [[OUTER_LATCH]], label [[INNER]], !llvm.loop [[LOOP12:![0-9]+]]
@@ -558,14 +543,13 @@ define void @induction_resume_value_requires_non_trivial_scev_expansion(i8* %dst
 ; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[VEC_IND:%.*]] = phi <4 x i8> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[OFFSET_IDX:%.*]] = add i64 1, [[INDEX]]
 ; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[TMP4:%.*]] = add i64 [[OFFSET_IDX]], 0
-; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[DST:%.*]], i64 [[TMP4]]
-; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[TMP5]], i32 0
-; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[TMP7:%.*]] = bitcast i8* [[TMP6]] to <4 x i8>*
-; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    store <4 x i8> [[VEC_IND]], <4 x i8>* [[TMP7]], align 1
+; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[TMP4]]
+; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 0
+; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    store <4 x i8> [[VEC_IND]], ptr [[TMP6]], align 1
 ; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[VEC_IND_NEXT]] = add <4 x i8> [[VEC_IND]], [[DOTSPLAT3]]
-; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 84
-; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 84
+; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; CHECK-PROFITABLE-BY-DEFAULT:       middle.block:
 ; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[CMP_N:%.*]] = icmp eq i64 84, 84
 ; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    br i1 [[CMP_N]], label [[OUTER_LATCH]], label [[SCALAR_PH]]
@@ -577,8 +561,8 @@ define void @induction_resume_value_requires_non_trivial_scev_expansion(i8* %dst
 ; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[INNER]] ]
 ; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[IV_2:%.*]] = phi i8 [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[IV_2_NEXT:%.*]], [[INNER]] ]
 ; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[IV_2_NEXT]] = sub i8 [[IV_2]], [[TRUNC_ADD]]
-; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[GEP_DST:%.*]] = getelementptr inbounds i8, i8* [[DST]], i64 [[IV]]
-; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    store i8 [[IV_2]], i8* [[GEP_DST]], align 1
+; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[GEP_DST:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[IV]]
+; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    store i8 [[IV_2]], ptr [[GEP_DST]], align 1
 ; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[EC:%.*]] = icmp ugt i64 [[IV]], 83
 ; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    br i1 [[EC]], label [[OUTER_LATCH]], label [[INNER]], !llvm.loop [[LOOP6:![0-9]+]]
@@ -603,8 +587,8 @@ inner:
   %iv = phi i64 [ 1, %outer.header ], [ %iv.next, %inner ]
   %iv.2 = phi i8 [ 0, %outer.header ], [ %iv.2.next, %inner ]
   %iv.2.next = sub i8 %iv.2, %trunc.add
-  %gep.dst = getelementptr inbounds i8, i8* %dst, i64 %iv
-  store i8 %iv.2, i8* %gep.dst
+  %gep.dst = getelementptr inbounds i8, ptr %dst, i64 %iv
+  store i8 %iv.2, ptr %gep.dst
   %iv.next = add nuw nsw i64 %iv, 1
   %ec = icmp ugt i64 %iv, 83
   br i1 %ec, label %outer.latch, label %inner

diff --git a/llvm/test/Transforms/LoopVectorize/pr30654-phiscev-sext-trunc.ll b/llvm/test/Transforms/LoopVectorize/pr30654-phiscev-sext-trunc.ll
index d7a2b62e7748..91feee419622 100644
--- a/llvm/test/Transforms/LoopVectorize/pr30654-phiscev-sext-trunc.ll
+++ b/llvm/test/Transforms/LoopVectorize/pr30654-phiscev-sext-trunc.ll
@@ -51,44 +51,43 @@ define void @doit1(i32 %n, i32 %step) local_unnamed_addr {
 ; CHECK-NEXT:    [[MUL:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[TMP4]], i8 [[TMP5]])
 ; CHECK-NEXT:    [[MUL_RESULT:%.*]] = extractvalue { i8, i1 } [[MUL]], 0
 ; CHECK-NEXT:    [[MUL_OVERFLOW:%.*]] = extractvalue { i8, i1 } [[MUL]], 1
-; CHECK-NEXT:    [[TMP7:%.*]] = sub i8 0, [[MUL_RESULT]]
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp slt i8 [[MUL_RESULT]], 0
-; CHECK-NEXT:    [[TMP9:%.*]] = icmp sgt i8 [[TMP7]], 0
-; CHECK-NEXT:    [[TMP10:%.*]] = select i1 [[TMP3]], i1 [[TMP9]], i1 [[TMP8]]
-; CHECK-NEXT:    [[TMP14:%.*]] = or i1 [[TMP10]], [[MUL_OVERFLOW]]
+; CHECK-NEXT:    [[TMP6:%.*]] = sub i8 0, [[MUL_RESULT]]
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp slt i8 [[MUL_RESULT]], 0
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp sgt i8 [[TMP6]], 0
+; CHECK-NEXT:    [[TMP9:%.*]] = select i1 [[TMP3]], i1 [[TMP8]], i1 [[TMP7]]
+; CHECK-NEXT:    [[TMP10:%.*]] = or i1 [[TMP9]], [[MUL_OVERFLOW]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = icmp ugt i64 [[TMP0]], 255
 ; CHECK-NEXT:    [[TMP12:%.*]] = icmp ne i8 [[TMP1]], 0
 ; CHECK-NEXT:    [[TMP13:%.*]] = and i1 [[TMP11]], [[TMP12]]
-; CHECK-NEXT:    [[TMP15:%.*]] = or i1 [[TMP14]], [[TMP13]]
-; CHECK-NEXT:    [[TMP17:%.*]] = sext i8 [[TMP1]] to i32
-; CHECK-NEXT:    [[IDENT_CHECK:%.*]] = icmp ne i32 [[STEP]], [[TMP17]]
-; CHECK-NEXT:    [[TMP18:%.*]] = or i1 [[TMP15]], [[IDENT_CHECK]]
-; CHECK-NEXT:    br i1 [[TMP18]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT:    [[TMP14:%.*]] = or i1 [[TMP10]], [[TMP13]]
+; CHECK-NEXT:    [[TMP15:%.*]] = sext i8 [[TMP1]] to i32
+; CHECK-NEXT:    [[IDENT_CHECK:%.*]] = icmp ne i32 [[STEP]], [[TMP15]]
+; CHECK-NEXT:    [[TMP16:%.*]] = or i1 [[TMP14]], [[IDENT_CHECK]]
+; CHECK-NEXT:    br i1 [[TMP16]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[CAST_CRD:%.*]] = trunc i64 [[N_VEC]] to i32
-; CHECK-NEXT:    [[IND_END:%.*]] = mul i32 [[CAST_CRD]], [[STEP]]
+; CHECK-NEXT:    [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32
+; CHECK-NEXT:    [[IND_END:%.*]] = mul i32 [[DOTCAST]], [[STEP]]
 ; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[STEP]], i32 0
 ; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP19:%.*]] = mul <4 x i32> <i32 0, i32 1, i32 2, i32 3>, [[DOTSPLAT]]
-; CHECK-NEXT:    [[INDUCTION:%.*]] = add <4 x i32> zeroinitializer, [[TMP19]]
-; CHECK-NEXT:    [[TMP20:%.*]] = mul i32 [[STEP]], 4
-; CHECK-NEXT:    [[DOTSPLATINSERT2:%.*]] = insertelement <4 x i32> poison, i32 [[TMP20]], i32 0
+; CHECK-NEXT:    [[TMP17:%.*]] = mul <4 x i32> <i32 0, i32 1, i32 2, i32 3>, [[DOTSPLAT]]
+; CHECK-NEXT:    [[INDUCTION:%.*]] = add <4 x i32> zeroinitializer, [[TMP17]]
+; CHECK-NEXT:    [[TMP18:%.*]] = mul i32 [[STEP]], 4
+; CHECK-NEXT:    [[DOTSPLATINSERT2:%.*]] = insertelement <4 x i32> poison, i32 [[TMP18]], i32 0
 ; CHECK-NEXT:    [[DOTSPLAT3:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT2]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP21:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [250 x i32], [250 x i32]* @a, i64 0, i64 [[TMP21]]
-; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[TMP22]], i32 0
-; CHECK-NEXT:    [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <4 x i32>*
-; CHECK-NEXT:    store <4 x i32> [[VEC_IND]], <4 x i32>* [[TMP24]], align 4
+; CHECK-NEXT:    [[TMP19:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [250 x i32], ptr @a, i64 0, i64 [[TMP19]]
+; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[TMP20]], i32 0
+; CHECK-NEXT:    store <4 x i32> [[VEC_IND]], ptr [[TMP21]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], [[DOTSPLAT3]]
-; CHECK-NEXT:    [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
@@ -101,8 +100,8 @@ define void @doit1(i32 %n, i32 %step) local_unnamed_addr {
 ; CHECK-NEXT:    [[P_09:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ]
 ; CHECK-NEXT:    [[SEXT:%.*]] = shl i32 [[P_09]], 24
 ; CHECK-NEXT:    [[CONV:%.*]] = ashr exact i32 [[SEXT]], 24
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [250 x i32], [250 x i32]* @a, i64 0, i64 [[INDVARS_IV]]
-; CHECK-NEXT:    store i32 [[CONV]], i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [250 x i32], ptr @a, i64 0, i64 [[INDVARS_IV]]
+; CHECK-NEXT:    store i32 [[CONV]], ptr [[ARRAYIDX]], align 4
 ; CHECK-NEXT:    [[ADD]] = add nsw i32 [[CONV]], [[STEP]]
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
@@ -125,8 +124,8 @@ for.body:
   %p.09 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ]
   %sext = shl i32 %p.09, 24
   %conv = ashr exact i32 %sext, 24
-  %arrayidx = getelementptr inbounds [250 x i32], [250 x i32]* @a, i64 0, i64 %indvars.iv
-  store i32 %conv, i32* %arrayidx, align 4
+  %arrayidx = getelementptr inbounds [250 x i32], ptr @a, i64 0, i64 %indvars.iv
+  store i32 %conv, ptr %arrayidx, align 4
   %add = add nsw i32 %conv, %step
   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
   %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
@@ -176,43 +175,42 @@ define void @doit2(i32 %n, i32 %step) local_unnamed_addr  {
 ; CHECK-NEXT:    [[MUL:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[TMP4]], i8 [[TMP5]])
 ; CHECK-NEXT:    [[MUL_RESULT:%.*]] = extractvalue { i8, i1 } [[MUL]], 0
 ; CHECK-NEXT:    [[MUL_OVERFLOW:%.*]] = extractvalue { i8, i1 } [[MUL]], 1
-; CHECK-NEXT:    [[TMP7:%.*]] = sub i8 0, [[MUL_RESULT]]
-; CHECK-NEXT:    [[TMP9:%.*]] = icmp ugt i8 [[TMP7]], 0
-; CHECK-NEXT:    [[TMP10:%.*]] = select i1 [[TMP3]], i1 [[TMP9]], i1 false
-; CHECK-NEXT:    [[TMP14:%.*]] = or i1 [[TMP10]], [[MUL_OVERFLOW]]
-; CHECK-NEXT:    [[TMP11:%.*]] = icmp ugt i64 [[TMP0]], 255
-; CHECK-NEXT:    [[TMP12:%.*]] = icmp ne i8 [[TMP1]], 0
-; CHECK-NEXT:    [[TMP13:%.*]] = and i1 [[TMP11]], [[TMP12]]
-; CHECK-NEXT:    [[TMP15:%.*]] = or i1 [[TMP14]], [[TMP13]]
-; CHECK-NEXT:    [[TMP17:%.*]] = sext i8 [[TMP1]] to i32
-; CHECK-NEXT:    [[IDENT_CHECK:%.*]] = icmp ne i32 [[STEP]], [[TMP17]]
-; CHECK-NEXT:    [[TMP18:%.*]] = or i1 [[TMP15]], [[IDENT_CHECK]]
-; CHECK-NEXT:    br i1 [[TMP18]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT:    [[TMP6:%.*]] = sub i8 0, [[MUL_RESULT]]
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp ugt i8 [[TMP6]], 0
+; CHECK-NEXT:    [[TMP8:%.*]] = select i1 [[TMP3]], i1 [[TMP7]], i1 false
+; CHECK-NEXT:    [[TMP9:%.*]] = or i1 [[TMP8]], [[MUL_OVERFLOW]]
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp ugt i64 [[TMP0]], 255
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp ne i8 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP12:%.*]] = and i1 [[TMP10]], [[TMP11]]
+; CHECK-NEXT:    [[TMP13:%.*]] = or i1 [[TMP9]], [[TMP12]]
+; CHECK-NEXT:    [[TMP14:%.*]] = sext i8 [[TMP1]] to i32
+; CHECK-NEXT:    [[IDENT_CHECK:%.*]] = icmp ne i32 [[STEP]], [[TMP14]]
+; CHECK-NEXT:    [[TMP15:%.*]] = or i1 [[TMP13]], [[IDENT_CHECK]]
+; CHECK-NEXT:    br i1 [[TMP15]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[CAST_CRD:%.*]] = trunc i64 [[N_VEC]] to i32
-; CHECK-NEXT:    [[IND_END:%.*]] = mul i32 [[CAST_CRD]], [[STEP]]
+; CHECK-NEXT:    [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32
+; CHECK-NEXT:    [[IND_END:%.*]] = mul i32 [[DOTCAST]], [[STEP]]
 ; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[STEP]], i32 0
 ; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP19:%.*]] = mul <4 x i32> <i32 0, i32 1, i32 2, i32 3>, [[DOTSPLAT]]
-; CHECK-NEXT:    [[INDUCTION:%.*]] = add <4 x i32> zeroinitializer, [[TMP19]]
-; CHECK-NEXT:    [[TMP20:%.*]] = mul i32 [[STEP]], 4
-; CHECK-NEXT:    [[DOTSPLATINSERT2:%.*]] = insertelement <4 x i32> poison, i32 [[TMP20]], i32 0
+; CHECK-NEXT:    [[TMP16:%.*]] = mul <4 x i32> <i32 0, i32 1, i32 2, i32 3>, [[DOTSPLAT]]
+; CHECK-NEXT:    [[INDUCTION:%.*]] = add <4 x i32> zeroinitializer, [[TMP16]]
+; CHECK-NEXT:    [[TMP17:%.*]] = mul i32 [[STEP]], 4
+; CHECK-NEXT:    [[DOTSPLATINSERT2:%.*]] = insertelement <4 x i32> poison, i32 [[TMP17]], i32 0
 ; CHECK-NEXT:    [[DOTSPLAT3:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT2]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP21:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [250 x i32], [250 x i32]* @a, i64 0, i64 [[TMP21]]
-; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[TMP22]], i32 0
-; CHECK-NEXT:    [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <4 x i32>*
-; CHECK-NEXT:    store <4 x i32> [[VEC_IND]], <4 x i32>* [[TMP24]], align 4
+; CHECK-NEXT:    [[TMP18:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [250 x i32], ptr @a, i64 0, i64 [[TMP18]]
+; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i32 0
+; CHECK-NEXT:    store <4 x i32> [[VEC_IND]], ptr [[TMP20]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], [[DOTSPLAT3]]
-; CHECK-NEXT:    [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
@@ -224,8 +222,8 @@ define void @doit2(i32 %n, i32 %step) local_unnamed_addr  {
 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; CHECK-NEXT:    [[P_09:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ]
 ; CHECK-NEXT:    [[CONV:%.*]] = and i32 [[P_09]], 255
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [250 x i32], [250 x i32]* @a, i64 0, i64 [[INDVARS_IV]]
-; CHECK-NEXT:    store i32 [[CONV]], i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [250 x i32], ptr @a, i64 0, i64 [[INDVARS_IV]]
+; CHECK-NEXT:    store i32 [[CONV]], ptr [[ARRAYIDX]], align 4
 ; CHECK-NEXT:    [[ADD]] = add nsw i32 [[CONV]], [[STEP]]
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
@@ -247,8 +245,8 @@ for.body:
   %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
   %p.09 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ]
   %conv = and i32 %p.09, 255
-  %arrayidx = getelementptr inbounds [250 x i32], [250 x i32]* @a, i64 0, i64 %indvars.iv
-  store i32 %conv, i32* %arrayidx, align 4
+  %arrayidx = getelementptr inbounds [250 x i32], ptr @a, i64 0, i64 %indvars.iv
+  store i32 %conv, ptr %arrayidx, align 4
   %add = add nsw i32 %conv, %step
   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
   %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
@@ -293,8 +291,8 @@ define void @doit3(i32 %n, i32 %step) local_unnamed_addr {
 ; CHECK-NEXT:    [[STEP_ADDR_010:%.*]] = phi i32 [ [[ADD3:%.*]], [[FOR_BODY]] ], [ [[STEP:%.*]], [[FOR_BODY_PREHEADER]] ]
 ; CHECK-NEXT:    [[SEXT:%.*]] = shl i32 [[P_012]], 24
 ; CHECK-NEXT:    [[CONV:%.*]] = ashr exact i32 [[SEXT]], 24
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [250 x i32], [250 x i32]* @a, i64 0, i64 [[INDVARS_IV]]
-; CHECK-NEXT:    store i32 [[CONV]], i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [250 x i32], ptr @a, i64 0, i64 [[INDVARS_IV]]
+; CHECK-NEXT:    store i32 [[CONV]], ptr [[ARRAYIDX]], align 4
 ; CHECK-NEXT:    [[ADD]] = add nsw i32 [[CONV]], [[STEP_ADDR_010]]
 ; CHECK-NEXT:    [[ADD3]] = add nsw i32 [[STEP_ADDR_010]], 2
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
@@ -319,8 +317,8 @@ for.body:
   %step.addr.010 = phi i32 [ %add3, %for.body ], [ %step, %for.body.preheader ]
   %sext = shl i32 %p.012, 24
   %conv = ashr exact i32 %sext, 24
-  %arrayidx = getelementptr inbounds [250 x i32], [250 x i32]* @a, i64 0, i64 %indvars.iv
-  store i32 %conv, i32* %arrayidx, align 4
+  %arrayidx = getelementptr inbounds [250 x i32], ptr @a, i64 0, i64 %indvars.iv
+  store i32 %conv, ptr %arrayidx, align 4
   %add = add nsw i32 %conv, %step.addr.010
   %add3 = add nsw i32 %step.addr.010, 2
   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
@@ -374,41 +372,40 @@ define void @doit4(i32 %n, i8 signext %cstep) local_unnamed_addr {
 ; CHECK-NEXT:    [[MUL:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[TMP3]], i8 [[TMP4]])
 ; CHECK-NEXT:    [[MUL_RESULT:%.*]] = extractvalue { i8, i1 } [[MUL]], 0
 ; CHECK-NEXT:    [[MUL_OVERFLOW:%.*]] = extractvalue { i8, i1 } [[MUL]], 1
-; CHECK-NEXT:    [[TMP6:%.*]] = sub i8 0, [[MUL_RESULT]]
-; CHECK-NEXT:    [[TMP7:%.*]] = icmp slt i8 [[MUL_RESULT]], 0
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp sgt i8 [[TMP6]], 0
-; CHECK-NEXT:    [[TMP9:%.*]] = select i1 [[TMP2]], i1 [[TMP8]], i1 [[TMP7]]
-; CHECK-NEXT:    [[TMP13:%.*]] = or i1 [[TMP9]], [[MUL_OVERFLOW]]
+; CHECK-NEXT:    [[TMP5:%.*]] = sub i8 0, [[MUL_RESULT]]
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp slt i8 [[MUL_RESULT]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp sgt i8 [[TMP5]], 0
+; CHECK-NEXT:    [[TMP8:%.*]] = select i1 [[TMP2]], i1 [[TMP7]], i1 [[TMP6]]
+; CHECK-NEXT:    [[TMP9:%.*]] = or i1 [[TMP8]], [[MUL_OVERFLOW]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = icmp ugt i64 [[TMP0]], 255
 ; CHECK-NEXT:    [[TMP11:%.*]] = icmp ne i8 [[CSTEP]], 0
 ; CHECK-NEXT:    [[TMP12:%.*]] = and i1 [[TMP10]], [[TMP11]]
-; CHECK-NEXT:    [[TMP14:%.*]] = or i1 [[TMP13]], [[TMP12]]
-; CHECK-NEXT:    br i1 [[TMP14]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT:    [[TMP13:%.*]] = or i1 [[TMP9]], [[TMP12]]
+; CHECK-NEXT:    br i1 [[TMP13]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[CAST_CRD:%.*]] = trunc i64 [[N_VEC]] to i32
-; CHECK-NEXT:    [[IND_END:%.*]] = mul i32 [[CAST_CRD]], [[CONV]]
+; CHECK-NEXT:    [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32
+; CHECK-NEXT:    [[IND_END:%.*]] = mul i32 [[DOTCAST]], [[CONV]]
 ; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[CONV]], i32 0
 ; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP16:%.*]] = mul <4 x i32> <i32 0, i32 1, i32 2, i32 3>, [[DOTSPLAT]]
-; CHECK-NEXT:    [[INDUCTION:%.*]] = add <4 x i32> zeroinitializer, [[TMP16]]
-; CHECK-NEXT:    [[TMP17:%.*]] = mul i32 [[CONV]], 4
-; CHECK-NEXT:    [[DOTSPLATINSERT2:%.*]] = insertelement <4 x i32> poison, i32 [[TMP17]], i32 0
+; CHECK-NEXT:    [[TMP14:%.*]] = mul <4 x i32> <i32 0, i32 1, i32 2, i32 3>, [[DOTSPLAT]]
+; CHECK-NEXT:    [[INDUCTION:%.*]] = add <4 x i32> zeroinitializer, [[TMP14]]
+; CHECK-NEXT:    [[TMP15:%.*]] = mul i32 [[CONV]], 4
+; CHECK-NEXT:    [[DOTSPLATINSERT2:%.*]] = insertelement <4 x i32> poison, i32 [[TMP15]], i32 0
 ; CHECK-NEXT:    [[DOTSPLAT3:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT2]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP18:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [250 x i32], [250 x i32]* @a, i64 0, i64 [[TMP18]]
-; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TMP19]], i32 0
-; CHECK-NEXT:    [[TMP21:%.*]] = bitcast i32* [[TMP20]] to <4 x i32>*
-; CHECK-NEXT:    store <4 x i32> [[VEC_IND]], <4 x i32>* [[TMP21]], align 4
+; CHECK-NEXT:    [[TMP16:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [250 x i32], ptr @a, i64 0, i64 [[TMP16]]
+; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 0
+; CHECK-NEXT:    store <4 x i32> [[VEC_IND]], ptr [[TMP18]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], [[DOTSPLAT3]]
-; CHECK-NEXT:    [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT:    [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
@@ -421,8 +418,8 @@ define void @doit4(i32 %n, i8 signext %cstep) local_unnamed_addr {
 ; CHECK-NEXT:    [[P_011:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ]
 ; CHECK-NEXT:    [[SEXT:%.*]] = shl i32 [[P_011]], 24
 ; CHECK-NEXT:    [[CONV2:%.*]] = ashr exact i32 [[SEXT]], 24
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [250 x i32], [250 x i32]* @a, i64 0, i64 [[INDVARS_IV]]
-; CHECK-NEXT:    store i32 [[CONV2]], i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [250 x i32], ptr @a, i64 0, i64 [[INDVARS_IV]]
+; CHECK-NEXT:    store i32 [[CONV2]], ptr [[ARRAYIDX]], align 4
 ; CHECK-NEXT:    [[ADD]] = add nsw i32 [[CONV2]], [[CONV]]
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
@@ -446,8 +443,8 @@ for.body:
   %p.011 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ]
   %sext = shl i32 %p.011, 24
   %conv2 = ashr exact i32 %sext, 24
-  %arrayidx = getelementptr inbounds [250 x i32], [250 x i32]* @a, i64 0, i64 %indvars.iv
-  store i32 %conv2, i32* %arrayidx, align 4
+  %arrayidx = getelementptr inbounds [250 x i32], ptr @a, i64 0, i64 %indvars.iv
+  store i32 %conv2, ptr %arrayidx, align 4
   %add = add nsw i32 %conv2, %conv
   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
   %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count

diff --git a/llvm/test/Transforms/LoopVectorize/pr45259.ll b/llvm/test/Transforms/LoopVectorize/pr45259.ll
index aa7df10bc828..3a9aa6b5726c 100644
--- a/llvm/test/Transforms/LoopVectorize/pr45259.ll
+++ b/llvm/test/Transforms/LoopVectorize/pr45259.ll
@@ -3,18 +3,18 @@
 
 ; Check that we can vectorize this loop without crashing.
 
-define i8 @widget(i8* %arr, i8 %t9) {
+define i8 @widget(ptr %arr, i8 %t9) {
 ; CHECK-LABEL: @widget(
 ; CHECK-NEXT:  bb:
-; CHECK-NEXT:    [[ARR1:%.*]] = ptrtoint i8* [[ARR:%.*]] to i64
+; CHECK-NEXT:    [[ARR1:%.*]] = ptrtoint ptr [[ARR:%.*]] to i64
 ; CHECK-NEXT:    br label [[BB6:%.*]]
 ; CHECK:       bb6:
-; CHECK-NEXT:    [[T1_0:%.*]] = phi i8* [ [[ARR]], [[BB:%.*]] ], [ null, [[BB6]] ]
+; CHECK-NEXT:    [[T1_0:%.*]] = phi ptr [ [[ARR]], [[BB:%.*]] ], [ null, [[BB6]] ]
 ; CHECK-NEXT:    [[C:%.*]] = call i1 @cond()
 ; CHECK-NEXT:    br i1 [[C]], label [[FOR_PREHEADER:%.*]], label [[BB6]]
 ; CHECK:       for.preheader:
-; CHECK-NEXT:    [[T1_0_LCSSA:%.*]] = phi i8* [ [[T1_0]], [[BB6]] ]
-; CHECK-NEXT:    [[T1_0_LCSSA2:%.*]] = ptrtoint i8* [[T1_0_LCSSA]] to i64
+; CHECK-NEXT:    [[T1_0_LCSSA:%.*]] = phi ptr [ [[T1_0]], [[BB6]] ]
+; CHECK-NEXT:    [[T1_0_LCSSA2:%.*]] = ptrtoint ptr [[T1_0_LCSSA]] to i64
 ; CHECK-NEXT:    [[TMP0:%.*]] = trunc i64 [[ARR1]] to i32
 ; CHECK-NEXT:    [[TMP1:%.*]] = sub i32 0, [[TMP0]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = trunc i64 [[T1_0_LCSSA2]] to i32
@@ -42,16 +42,15 @@ define i8 @widget(i8* %arr, i8 %t9) {
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i8> [ <i8 0, i8 1, i8 2, i8 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP11:%.*]] = add <4 x i8> [[VEC_IND]], <i8 1, i8 1, i8 1, i8 1>
 ; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <4 x i8> [[TMP11]], i32 0
-; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i8, i8* [[ARR]], i8 [[TMP12]]
+; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[ARR]], i8 [[TMP12]]
 ; CHECK-NEXT:    [[TMP14:%.*]] = icmp slt <4 x i8> [[TMP11]], [[BROADCAST_SPLAT]]
 ; CHECK-NEXT:    [[TMP15:%.*]] = zext <4 x i1> [[TMP14]] to <4 x i8>
-; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i8, i8* [[TMP13]], i32 0
-; CHECK-NEXT:    [[TMP17:%.*]] = bitcast i8* [[TMP16]] to <4 x i8>*
-; CHECK-NEXT:    store <4 x i8> [[TMP15]], <4 x i8>* [[TMP17]], align 1
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[TMP13]], i32 0
+; CHECK-NEXT:    store <4 x i8> [[TMP15]], ptr [[TMP16]], align 1
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i8> [[VEC_IND]], <i8 4, i8 4, i8 4, i8 4>
-; CHECK-NEXT:    [[TMP18:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    [[TMP17:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[TMP3]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
@@ -61,11 +60,11 @@ define i8 @widget(i8* %arr, i8 %t9) {
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i8 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; CHECK-NEXT:    [[IV_NEXT]] = add i8 [[IV]], 1
-; CHECK-NEXT:    [[PTR:%.*]] = getelementptr inbounds i8, i8* [[ARR]], i8 [[IV_NEXT]]
+; CHECK-NEXT:    [[PTR:%.*]] = getelementptr inbounds i8, ptr [[ARR]], i8 [[IV_NEXT]]
 ; CHECK-NEXT:    [[T3_I:%.*]] = icmp slt i8 [[IV_NEXT]], [[T9]]
 ; CHECK-NEXT:    [[T3_I8:%.*]] = zext i1 [[T3_I]] to i8
-; CHECK-NEXT:    store i8 [[T3_I8]], i8* [[PTR]], align 1
-; CHECK-NEXT:    [[EC:%.*]] = icmp eq i8* [[T1_0_LCSSA]], [[PTR]]
+; CHECK-NEXT:    store i8 [[T3_I8]], ptr [[PTR]], align 1
+; CHECK-NEXT:    [[EC:%.*]] = icmp eq ptr [[T1_0_LCSSA]], [[PTR]]
 ; CHECK-NEXT:    br i1 [[EC]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
 ; CHECK:       for.exit:
 ; CHECK-NEXT:    [[IV_NEXT_LCSSA:%.*]] = phi i8 [ [[IV_NEXT]], [[FOR_BODY]] ], [ [[IND_END]], [[MIDDLE_BLOCK]] ]
@@ -75,7 +74,7 @@ bb:
   br label %bb6
 
 bb6:
-  %t1.0 = phi i8* [ %arr, %bb ], [ null, %bb6 ]
+  %t1.0 = phi ptr [ %arr, %bb ], [ null, %bb6 ]
   %c = call i1 @cond()
   br i1 %c, label %for.preheader, label %bb6
 
@@ -85,11 +84,11 @@ for.preheader:
 for.body:
   %iv = phi i8 [ %iv.next, %for.body ], [ 0, %for.preheader ]
   %iv.next = add i8 %iv, 1
-  %ptr = getelementptr inbounds i8, i8* %arr, i8 %iv.next
+  %ptr = getelementptr inbounds i8, ptr %arr, i8 %iv.next
   %t3.i = icmp slt i8 %iv.next, %t9
   %t3.i8 = zext i1 %t3.i to i8
-  store i8 %t3.i8, i8* %ptr
-  %ec = icmp eq i8* %t1.0, %ptr
+  store i8 %t3.i8, ptr %ptr
+  %ec = icmp eq ptr %t1.0, %ptr
   br i1 %ec, label %for.exit, label %for.body
 
 for.exit:

diff --git a/llvm/test/Transforms/LoopVectorize/pr50686.ll b/llvm/test/Transforms/LoopVectorize/pr50686.ll
index b74df3a3ee3f..f1be6867ad94 100644
--- a/llvm/test/Transforms/LoopVectorize/pr50686.ll
+++ b/llvm/test/Transforms/LoopVectorize/pr50686.ll
@@ -1,21 +1,17 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -passes=loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -S < %s | FileCheck %s
 
-define void @m(i32* nocapture %p, i32* nocapture %p2, i32 %q) {
+define void @m(ptr nocapture %p, ptr nocapture %p2, i32 %q) {
 ; CHECK-LABEL: @m(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[P1:%.*]] = bitcast i32* [[P:%.*]] to i8*
-; CHECK-NEXT:    [[P23:%.*]] = bitcast i32* [[P2:%.*]] to i8*
-; CHECK-NEXT:    [[ARRAYIDX9_1:%.*]] = getelementptr inbounds i32, i32* [[P2]], i64 1
-; CHECK-NEXT:    [[ARRAYIDX9_2:%.*]] = getelementptr inbounds i32, i32* [[P2]], i64 2
+; CHECK-NEXT:    [[ARRAYIDX9_1:%.*]] = getelementptr inbounds i32, ptr [[P2:%.*]], i64 1
+; CHECK-NEXT:    [[ARRAYIDX9_2:%.*]] = getelementptr inbounds i32, ptr [[P2]], i64 2
 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
 ; CHECK:       vector.memcheck:
-; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i32, i32* [[P]], i64 63
-; CHECK-NEXT:    [[SCEVGEP2:%.*]] = bitcast i32* [[SCEVGEP]] to i8*
-; CHECK-NEXT:    [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[P2]], i64 3
-; CHECK-NEXT:    [[SCEVGEP45:%.*]] = bitcast i32* [[SCEVGEP4]] to i8*
-; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ult i8* [[P1]], [[SCEVGEP45]]
-; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ult i8* [[P23]], [[SCEVGEP2]]
+; CHECK-NEXT:    [[UGLYGEP:%.*]] = getelementptr i8, ptr [[P:%.*]], i64 252
+; CHECK-NEXT:    [[UGLYGEP1:%.*]] = getelementptr i8, ptr [[P2]], i64 12
+; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ult ptr [[P]], [[UGLYGEP1]]
+; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ult ptr [[P2]], [[UGLYGEP]]
 ; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
 ; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
@@ -23,25 +19,24 @@ define void @m(i32* nocapture %p, i32* nocapture %p2, i32 %q) {
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[P2]], align 4, !alias.scope !0
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[P2]], align 4, !alias.scope !0
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP1]], i32 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP2:%.*]] = sub nsw <4 x i32> zeroinitializer, [[BROADCAST_SPLAT]]
-; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX9_1]], align 4, !alias.scope !0
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT6:%.*]] = insertelement <4 x i32> poison, i32 [[TMP3]], i32 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT7:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT6]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP4:%.*]] = sub nsw <4 x i32> [[TMP2]], [[BROADCAST_SPLAT7]]
-; CHECK-NEXT:    [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX9_2]], align 4, !alias.scope !0
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT8:%.*]] = insertelement <4 x i32> poison, i32 [[TMP5]], i32 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT9:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT8]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP6:%.*]] = sub nsw <4 x i32> [[TMP4]], [[BROADCAST_SPLAT9]]
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP7]], i32 0
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>*
-; CHECK-NEXT:    store <4 x i32> [[TMP6]], <4 x i32>* [[TMP9]], align 4, !alias.scope !3, !noalias !0
+; CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX9_1]], align 4, !alias.scope !0
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <4 x i32> poison, i32 [[TMP3]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT3:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT2]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = sub nsw <4 x i32> [[TMP2]], [[BROADCAST_SPLAT3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = load i32, ptr [[ARRAYIDX9_2]], align 4, !alias.scope !0
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT4:%.*]] = insertelement <4 x i32> poison, i32 [[TMP5]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT5:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT4]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = sub nsw <4 x i32> [[TMP4]], [[BROADCAST_SPLAT5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[P]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
+; CHECK-NEXT:    store <4 x i32> [[TMP6]], ptr [[TMP8]], align 4, !alias.scope !3, !noalias !0
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 60
-; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 60
+; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 63, 60
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END17:%.*]], label [[SCALAR_PH]]
@@ -50,14 +45,14 @@ define void @m(i32* nocapture %p, i32* nocapture %p2, i32 %q) {
 ; CHECK-NEXT:    br label [[FOR_COND5:%.*]]
 ; CHECK:       for.cond5:
 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_COND5]] ]
-; CHECK-NEXT:    [[I3:%.*]] = load i32, i32* [[P2]], align 4
+; CHECK-NEXT:    [[I3:%.*]] = load i32, ptr [[P2]], align 4
 ; CHECK-NEXT:    [[SUB:%.*]] = sub nsw i32 0, [[I3]]
-; CHECK-NEXT:    [[I4:%.*]] = load i32, i32* [[ARRAYIDX9_1]], align 4
+; CHECK-NEXT:    [[I4:%.*]] = load i32, ptr [[ARRAYIDX9_1]], align 4
 ; CHECK-NEXT:    [[SUB_1:%.*]] = sub nsw i32 [[SUB]], [[I4]]
-; CHECK-NEXT:    [[I5:%.*]] = load i32, i32* [[ARRAYIDX9_2]], align 4
+; CHECK-NEXT:    [[I5:%.*]] = load i32, ptr [[ARRAYIDX9_2]], align 4
 ; CHECK-NEXT:    [[SUB_2:%.*]] = sub nsw i32 [[SUB_1]], [[I5]]
-; CHECK-NEXT:    [[ARRAYIDX14:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    store i32 [[SUB_2]], i32* [[ARRAYIDX14]], align 4
+; CHECK-NEXT:    [[ARRAYIDX14:%.*]] = getelementptr inbounds i32, ptr [[P]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    store i32 [[SUB_2]], ptr [[ARRAYIDX14]], align 4
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 63
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END17]], label [[FOR_COND5]], !llvm.loop [[LOOP7:![0-9]+]]
@@ -65,20 +60,20 @@ define void @m(i32* nocapture %p, i32* nocapture %p2, i32 %q) {
 ; CHECK-NEXT:    ret void
 ;
 entry:
-  %arrayidx9.1 = getelementptr inbounds i32, i32* %p2, i64 1
-  %arrayidx9.2 = getelementptr inbounds i32, i32* %p2, i64 2
+  %arrayidx9.1 = getelementptr inbounds i32, ptr %p2, i64 1
+  %arrayidx9.2 = getelementptr inbounds i32, ptr %p2, i64 2
   br label %for.cond5
 
 for.cond5:                              ; preds = %entry, %for.cond5
   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.cond5 ]
-  %i3 = load i32, i32* %p2, align 4
+  %i3 = load i32, ptr %p2, align 4
   %sub = sub nsw i32 0, %i3
-  %i4 = load i32, i32* %arrayidx9.1, align 4
+  %i4 = load i32, ptr %arrayidx9.1, align 4
   %sub.1 = sub nsw i32 %sub, %i4
-  %i5 = load i32, i32* %arrayidx9.2, align 4
+  %i5 = load i32, ptr %arrayidx9.2, align 4
   %sub.2 = sub nsw i32 %sub.1, %i5
-  %arrayidx14 = getelementptr inbounds i32, i32* %p, i64 %indvars.iv
-  store i32 %sub.2, i32* %arrayidx14, align 4
+  %arrayidx14 = getelementptr inbounds i32, ptr %p, i64 %indvars.iv
+  store i32 %sub.2, ptr %arrayidx14, align 4
   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
   %exitcond = icmp eq i64 %indvars.iv.next, 63
   br i1 %exitcond, label %for.end17, label %for.cond5

diff --git a/llvm/test/Transforms/LoopVectorize/runtime-check-needed-but-empty.ll b/llvm/test/Transforms/LoopVectorize/runtime-check-needed-but-empty.ll
index eb7e8316f66f..7e5204a784a0 100644
--- a/llvm/test/Transforms/LoopVectorize/runtime-check-needed-but-empty.ll
+++ b/llvm/test/Transforms/LoopVectorize/runtime-check-needed-but-empty.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -passes=loop-vectorize -force-vector-width=4 -S %s | FileCheck %s
 
-define void @test(float* %A, i32 %x) {
+define void @test(ptr %A, i32 %x) {
 ; CHECK-LABEL: @test(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
@@ -11,27 +11,25 @@ define void @test(float* %A, i32 %x) {
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP1:%.*]] = trunc i64 [[INDEX]] to i32
-; CHECK-NEXT:    [[TMP2:%.*]] = add i32 [[TMP1]], 0
-; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP4:%.*]] = add nuw nsw i64 [[TMP3]], 1
-; CHECK-NEXT:    [[TMP5:%.*]] = trunc i64 [[TMP4]] to i32
-; CHECK-NEXT:    [[TMP6:%.*]] = mul i32 [[TMP5]], [[X]]
-; CHECK-NEXT:    [[TMP7:%.*]] = zext i32 [[TMP6]] to i64
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP7]]
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds float, float* [[TMP8]], i32 0
-; CHECK-NEXT:    [[TMP10:%.*]] = bitcast float* [[TMP9]] to <4 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP10]], align 4
-; CHECK-NEXT:    [[TMP11:%.*]] = mul i32 [[TMP2]], [[X]]
-; CHECK-NEXT:    [[TMP12:%.*]] = zext i32 [[TMP11]] to i64
-; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP12]]
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP13]], i32 0
-; CHECK-NEXT:    [[TMP15:%.*]] = bitcast float* [[TMP14]] to <4 x float>*
-; CHECK-NEXT:    store <4 x float> [[WIDE_LOAD]], <4 x float>* [[TMP15]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], undef
-; CHECK-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = trunc i64 [[OFFSET_IDX]] to i32
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[TMP0]], 0
+; CHECK-NEXT:    [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT:    [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1
+; CHECK-NEXT:    [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
+; CHECK-NEXT:    [[TMP5:%.*]] = mul i32 [[TMP4]], [[X]]
+; CHECK-NEXT:    [[TMP6:%.*]] = zext i32 [[TMP5]] to i64
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP8]], align 4
+; CHECK-NEXT:    [[TMP9:%.*]] = mul i32 [[TMP1]], [[X]]
+; CHECK-NEXT:    [[TMP10:%.*]] = zext i32 [[TMP9]] to i64
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds float, ptr [[TMP11]], i32 0
+; CHECK-NEXT:    store <4 x float> [[WIDE_LOAD]], ptr [[TMP12]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[OFFSET_IDX]], 4
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], undef
+; CHECK-NEXT:    br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 undef, undef
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
@@ -44,13 +42,13 @@ define void @test(float* %A, i32 %x) {
 ; CHECK-NEXT:    [[T_IV_NEXT:%.*]] = trunc i64 [[IV_NEXT]] to i32
 ; CHECK-NEXT:    [[MUL_IV_NEXT:%.*]] = mul i32 [[T_IV_NEXT]], [[X]]
 ; CHECK-NEXT:    [[IDX_1:%.*]] = zext i32 [[MUL_IV_NEXT]] to i64
-; CHECK-NEXT:    [[ARRAYIDX1215:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IDX_1]]
-; CHECK-NEXT:    [[LV:%.*]] = load float, float* [[ARRAYIDX1215]], align 4
+; CHECK-NEXT:    [[ARRAYIDX1215:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IDX_1]]
+; CHECK-NEXT:    [[LV:%.*]] = load float, ptr [[ARRAYIDX1215]], align 4
 ; CHECK-NEXT:    [[T_IV:%.*]] = trunc i64 [[IV]] to i32
 ; CHECK-NEXT:    [[MUL_IV:%.*]] = mul i32 [[T_IV]], [[X]]
 ; CHECK-NEXT:    [[IDX_2:%.*]] = zext i32 [[MUL_IV]] to i64
-; CHECK-NEXT:    [[ARRAYIDX1209:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IDX_2]]
-; CHECK-NEXT:    store float [[LV]], float* [[ARRAYIDX1209]], align 4
+; CHECK-NEXT:    [[ARRAYIDX1209:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IDX_2]]
+; CHECK-NEXT:    store float [[LV]], ptr [[ARRAYIDX1209]], align 4
 ; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], undef
 ; CHECK-NEXT:    br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP2:![0-9]+]]
 ; CHECK:       exit:
@@ -65,14 +63,14 @@ loop:                                     ; preds = %loop, %entry
   %t.iv.next = trunc i64 %iv.next to i32
   %mul.iv.next = mul i32 %t.iv.next, %x
   %idx.1 = zext i32 %mul.iv.next to i64
-  %arrayidx1215 = getelementptr inbounds float, float* %A, i64 %idx.1
-  %lv = load float, float* %arrayidx1215, align 4
+  %arrayidx1215 = getelementptr inbounds float, ptr %A, i64 %idx.1
+  %lv = load float, ptr %arrayidx1215, align 4
 
   %t.iv = trunc i64 %iv to i32
   %mul.iv = mul i32 %t.iv, %x
   %idx.2 = zext i32 %mul.iv to i64
-  %arrayidx1209 = getelementptr inbounds float, float* %A, i64 %idx.2
-  store float %lv, float* %arrayidx1209, align 4
+  %arrayidx1209 = getelementptr inbounds float, ptr %A, i64 %idx.2
+  store float %lv, ptr %arrayidx1209, align 4
   %ec = icmp eq i64 %iv.next, undef
   br i1 %ec, label %exit, label %loop
 

diff --git a/llvm/test/Transforms/LoopVectorize/runtime-check-small-clamped-bounds.ll b/llvm/test/Transforms/LoopVectorize/runtime-check-small-clamped-bounds.ll
index 621c92f29e16..cf12fe4947a5 100644
--- a/llvm/test/Transforms/LoopVectorize/runtime-check-small-clamped-bounds.ll
+++ b/llvm/test/Transforms/LoopVectorize/runtime-check-small-clamped-bounds.ll
@@ -10,17 +10,17 @@
 ;        runtime checks.
 
 ; The relevant bounds for %gep.A are [%A, %A+4).
-define void @load_clamped_index(i32* %A, i32* %B, i32 %N) {
+define void @load_clamped_index(ptr %A, ptr %B, i32 %N) {
 ; CHECK-LABEL: @load_clamped_index(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[A2:%.*]] = ptrtoint i32* [[A:%.*]] to i64
-; CHECK-NEXT:    [[B1:%.*]] = ptrtoint i32* [[B:%.*]] to i64
+; CHECK-NEXT:    [[A2:%.*]] = ptrtoint ptr [[A:%.*]] to i64
+; CHECK-NEXT:    [[B1:%.*]] = ptrtoint ptr [[B:%.*]] to i64
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N:%.*]], 2
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
 ; CHECK:       vector.scevcheck:
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[N]], -1
-; CHECK-NEXT:    [[TMP7:%.*]] = icmp ugt i32 [[TMP0]], 3
-; CHECK-NEXT:    br i1 [[TMP7]], label [[SCALAR_PH]], label [[VECTOR_MEMCHECK:%.*]]
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ugt i32 [[TMP0]], 3
+; CHECK-NEXT:    br i1 [[TMP1]], label [[SCALAR_PH]], label [[VECTOR_MEMCHECK:%.*]]
 ; CHECK:       vector.memcheck:
 ; CHECK-NEXT:    [[TMP2:%.*]] = sub i64 [[B1]], [[A2]]
 ; CHECK-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP2]], 8
@@ -31,20 +31,18 @@ define void @load_clamped_index(i32* %A, i32* %B, i32 %N) {
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP13:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP14:%.*]] = urem i32 [[TMP13]], 4
-; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[TMP14]]
-; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP15]], i32 0
-; CHECK-NEXT:    [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <2 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i32>, <2 x i32>* [[TMP17]], align 4
-; CHECK-NEXT:    [[TMP18:%.*]] = add <2 x i32> [[WIDE_LOAD]], <i32 10, i32 10>
-; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 [[TMP13]]
-; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TMP19]], i32 0
-; CHECK-NEXT:    [[TMP21:%.*]] = bitcast i32* [[TMP20]] to <2 x i32>*
-; CHECK-NEXT:    store <2 x i32> [[TMP18]], <2 x i32>* [[TMP21]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = urem i32 [[TMP3]], 4
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP6]], align 4
+; CHECK-NEXT:    [[TMP7:%.*]] = add <2 x i32> [[WIDE_LOAD]], <i32 10, i32 10>
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[TMP3]]
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0
+; CHECK-NEXT:    store <2 x i32> [[TMP7]], ptr [[TMP9]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP22:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
@@ -54,14 +52,14 @@ define void @load_clamped_index(i32* %A, i32* %B, i32 %N) {
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
 ; CHECK-NEXT:    [[CLAMPED_INDEX:%.*]] = urem i32 [[IV]], 4
-; CHECK-NEXT:    [[GEP_A:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[CLAMPED_INDEX]]
-; CHECK-NEXT:    [[LV:%.*]] = load i32, i32* [[GEP_A]], align 4
+; CHECK-NEXT:    [[GEP_A:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[CLAMPED_INDEX]]
+; CHECK-NEXT:    [[LV:%.*]] = load i32, ptr [[GEP_A]], align 4
 ; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[LV]], 10
-; CHECK-NEXT:    [[GEP_B:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 [[IV]]
-; CHECK-NEXT:    store i32 [[ADD]], i32* [[GEP_B]], align 4
+; CHECK-NEXT:    [[GEP_B:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[IV]]
+; CHECK-NEXT:    store i32 [[ADD]], ptr [[GEP_B]], align 4
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1
 ; CHECK-NEXT:    [[COND:%.*]] = icmp eq i32 [[IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP2:![0-9]+]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
 ;
@@ -71,11 +69,11 @@ entry:
 loop:
   %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
   %clamped.index = urem i32 %iv, 4
-  %gep.A = getelementptr inbounds i32, i32* %A, i32 %clamped.index
-  %lv = load i32, i32* %gep.A
+  %gep.A = getelementptr inbounds i32, ptr %A, i32 %clamped.index
+  %lv = load i32, ptr %gep.A
   %add = add i32 %lv, 10
-  %gep.B = getelementptr inbounds i32, i32* %B, i32 %iv
-  store i32 %add, i32* %gep.B
+  %gep.B = getelementptr inbounds i32, ptr %B, i32 %iv
+  store i32 %add, ptr %gep.B
   %iv.next = add nuw nsw i32 %iv, 1
   %cond = icmp eq i32 %iv.next, %N
   br i1 %cond, label %exit, label %loop
@@ -85,17 +83,17 @@ exit:
 }
 
 ; The relevant bounds for %gep.A are [%A, %A+4).
-define void @store_clamped_index(i32* %A, i32* %B, i32 %N) {
+define void @store_clamped_index(ptr %A, ptr %B, i32 %N) {
 ; CHECK-LABEL: @store_clamped_index(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[B2:%.*]] = ptrtoint i32* [[B:%.*]] to i64
-; CHECK-NEXT:    [[A1:%.*]] = ptrtoint i32* [[A:%.*]] to i64
+; CHECK-NEXT:    [[B2:%.*]] = ptrtoint ptr [[B:%.*]] to i64
+; CHECK-NEXT:    [[A1:%.*]] = ptrtoint ptr [[A:%.*]] to i64
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N:%.*]], 2
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
 ; CHECK:       vector.scevcheck:
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[N]], -1
-; CHECK-NEXT:    [[TMP7:%.*]] = icmp ugt i32 [[TMP0]], 3
-; CHECK-NEXT:    br i1 [[TMP7]], label [[SCALAR_PH]], label [[VECTOR_MEMCHECK:%.*]]
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ugt i32 [[TMP0]], 3
+; CHECK-NEXT:    br i1 [[TMP1]], label [[SCALAR_PH]], label [[VECTOR_MEMCHECK:%.*]]
 ; CHECK:       vector.memcheck:
 ; CHECK-NEXT:    [[TMP2:%.*]] = sub i64 [[A1]], [[B2]]
 ; CHECK-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP2]], 8
@@ -106,20 +104,18 @@ define void @store_clamped_index(i32* %A, i32* %B, i32 %N) {
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP13:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP14:%.*]] = urem i32 [[TMP13]], 4
-; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 [[TMP13]]
-; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP15]], i32 0
-; CHECK-NEXT:    [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <2 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i32>, <2 x i32>* [[TMP17]], align 4
-; CHECK-NEXT:    [[TMP18:%.*]] = add <2 x i32> [[WIDE_LOAD]], <i32 10, i32 10>
-; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[TMP14]]
-; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TMP19]], i32 0
-; CHECK-NEXT:    [[TMP21:%.*]] = bitcast i32* [[TMP20]] to <2 x i32>*
-; CHECK-NEXT:    store <2 x i32> [[TMP18]], <2 x i32>* [[TMP21]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = urem i32 [[TMP3]], 4
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[TMP3]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP6]], align 4
+; CHECK-NEXT:    [[TMP7:%.*]] = add <2 x i32> [[WIDE_LOAD]], <i32 10, i32 10>
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[TMP4]]
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0
+; CHECK-NEXT:    store <2 x i32> [[TMP7]], ptr [[TMP9]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP22:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
@@ -129,14 +125,14 @@ define void @store_clamped_index(i32* %A, i32* %B, i32 %N) {
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
 ; CHECK-NEXT:    [[CLAMPED_INDEX:%.*]] = urem i32 [[IV]], 4
-; CHECK-NEXT:    [[GEP_B:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 [[IV]]
-; CHECK-NEXT:    [[LV:%.*]] = load i32, i32* [[GEP_B]], align 4
+; CHECK-NEXT:    [[GEP_B:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[IV]]
+; CHECK-NEXT:    [[LV:%.*]] = load i32, ptr [[GEP_B]], align 4
 ; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[LV]], 10
-; CHECK-NEXT:    [[GEP_A:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[CLAMPED_INDEX]]
-; CHECK-NEXT:    store i32 [[ADD]], i32* [[GEP_A]], align 4
+; CHECK-NEXT:    [[GEP_A:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[CLAMPED_INDEX]]
+; CHECK-NEXT:    store i32 [[ADD]], ptr [[GEP_A]], align 4
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1
 ; CHECK-NEXT:    [[COND:%.*]] = icmp eq i32 [[IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
 ;
@@ -146,11 +142,11 @@ entry:
 loop:
   %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
   %clamped.index = urem i32 %iv, 4
-  %gep.B = getelementptr inbounds i32, i32* %B, i32 %iv
-  %lv = load i32, i32* %gep.B
+  %gep.B = getelementptr inbounds i32, ptr %B, i32 %iv
+  %lv = load i32, ptr %gep.B
   %add = add i32 %lv, 10
-  %gep.A = getelementptr inbounds i32, i32* %A, i32 %clamped.index
-  store i32 %add, i32* %gep.A
+  %gep.A = getelementptr inbounds i32, ptr %A, i32 %clamped.index
+  store i32 %add, ptr %gep.A
   %iv.next = add nuw nsw i32 %iv, 1
   %cond = icmp eq i32 %iv.next, %N
   br i1 %cond, label %exit, label %loop
@@ -159,21 +155,21 @@ exit:
   ret void
 }
 
-define void @clamped_index_dependence_non_clamped(i32* %A, i32* %B, i32 %N) {
+define void @clamped_index_dependence_non_clamped(ptr %A, ptr %B, i32 %N) {
 ; CHECK-LABEL: @clamped_index_dependence_non_clamped(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[GEP_B:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i32 [[IV]]
-; CHECK-NEXT:    [[LV:%.*]] = load i32, i32* [[GEP_B]], align 4
-; CHECK-NEXT:    [[GEP_A_1:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[IV]]
-; CHECK-NEXT:    [[LV_A:%.*]] = load i32, i32* [[GEP_A_1]], align 4
+; CHECK-NEXT:    [[GEP_B:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i32 [[IV]]
+; CHECK-NEXT:    [[LV:%.*]] = load i32, ptr [[GEP_B]], align 4
+; CHECK-NEXT:    [[GEP_A_1:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[IV]]
+; CHECK-NEXT:    [[LV_A:%.*]] = load i32, ptr [[GEP_A_1]], align 4
 ; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[LV]], [[LV_A]]
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1
 ; CHECK-NEXT:    [[CLAMPED_INDEX:%.*]] = urem i32 [[IV_NEXT]], 4
-; CHECK-NEXT:    [[GEP_A:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[CLAMPED_INDEX]]
-; CHECK-NEXT:    store i32 [[ADD]], i32* [[GEP_A]], align 4
+; CHECK-NEXT:    [[GEP_A:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[CLAMPED_INDEX]]
+; CHECK-NEXT:    store i32 [[ADD]], ptr [[GEP_A]], align 4
 ; CHECK-NEXT:    [[COND:%.*]] = icmp eq i32 [[IV_NEXT]], [[N:%.*]]
 ; CHECK-NEXT:    br i1 [[COND]], label [[EXIT:%.*]], label [[LOOP]]
 ; CHECK:       exit:
@@ -184,16 +180,16 @@ entry:
 
 loop:
   %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
-  %gep.B = getelementptr inbounds i32, i32* %B, i32 %iv
-  %lv = load i32, i32* %gep.B
-  %gep.A.1 = getelementptr inbounds i32, i32* %A, i32 %iv
-  %lv.A = load i32, i32* %gep.A.1
+  %gep.B = getelementptr inbounds i32, ptr %B, i32 %iv
+  %lv = load i32, ptr %gep.B
+  %gep.A.1 = getelementptr inbounds i32, ptr %A, i32 %iv
+  %lv.A = load i32, ptr %gep.A.1
   %add = add i32 %lv, %lv.A
 
   %iv.next = add nuw nsw i32 %iv, 1
   %clamped.index = urem i32 %iv.next, 4
-  %gep.A = getelementptr inbounds i32, i32* %A, i32 %clamped.index
-  store i32 %add, i32* %gep.A
+  %gep.A = getelementptr inbounds i32, ptr %A, i32 %clamped.index
+  store i32 %add, ptr %gep.A
   %cond = icmp eq i32 %iv.next, %N
   br i1 %cond, label %exit, label %loop
 
@@ -201,20 +197,20 @@ exit:
   ret void
 }
 
-define void @clamped_index_dependence_clamped_index(i32* %A, i32* %B, i32 %N) {
+define void @clamped_index_dependence_clamped_index(ptr %A, ptr %B, i32 %N) {
 ; CHECK-LABEL: @clamped_index_dependence_clamped_index(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
 ; CHECK-NEXT:    [[CLAMPED_INDEX_1:%.*]] = urem i32 [[IV]], 4
-; CHECK-NEXT:    [[GEP_A_1:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[CLAMPED_INDEX_1]]
-; CHECK-NEXT:    [[LV_A:%.*]] = load i32, i32* [[GEP_A_1]], align 4
+; CHECK-NEXT:    [[GEP_A_1:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[CLAMPED_INDEX_1]]
+; CHECK-NEXT:    [[LV_A:%.*]] = load i32, ptr [[GEP_A_1]], align 4
 ; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[LV_A]], 10
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1
 ; CHECK-NEXT:    [[CLAMPED_INDEX:%.*]] = urem i32 [[IV_NEXT]], 4
-; CHECK-NEXT:    [[GEP_A:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[CLAMPED_INDEX]]
-; CHECK-NEXT:    store i32 [[ADD]], i32* [[GEP_A]], align 4
+; CHECK-NEXT:    [[GEP_A:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[CLAMPED_INDEX]]
+; CHECK-NEXT:    store i32 [[ADD]], ptr [[GEP_A]], align 4
 ; CHECK-NEXT:    [[COND:%.*]] = icmp eq i32 [[IV_NEXT]], [[N:%.*]]
 ; CHECK-NEXT:    br i1 [[COND]], label [[EXIT:%.*]], label [[LOOP]]
 ; CHECK:       exit:
@@ -226,14 +222,14 @@ entry:
 loop:
   %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
   %clamped.index.1 = urem i32 %iv, 4
-  %gep.A.1 = getelementptr inbounds i32, i32* %A, i32 %clamped.index.1
-  %lv.A = load i32, i32* %gep.A.1
+  %gep.A.1 = getelementptr inbounds i32, ptr %A, i32 %clamped.index.1
+  %lv.A = load i32, ptr %gep.A.1
   %add = add i32 %lv.A, 10
 
   %iv.next = add nuw nsw i32 %iv, 1
   %clamped.index = urem i32 %iv.next, 4
-  %gep.A = getelementptr inbounds i32, i32* %A, i32 %clamped.index
-  store i32 %add, i32* %gep.A
+  %gep.A = getelementptr inbounds i32, ptr %A, i32 %clamped.index
+  store i32 %add, ptr %gep.A
   %cond = icmp eq i32 %iv.next, %N
   br i1 %cond, label %exit, label %loop
 
@@ -241,33 +237,31 @@ exit:
   ret void
 }
 
-define void @clamped_index_equal_dependence(i32* %A, i32* %B, i32 %N) {
+define void @clamped_index_equal_dependence(ptr %A, ptr %B, i32 %N) {
 ; CHECK-LABEL: @clamped_index_equal_dependence(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N:%.*]], 2
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
 ; CHECK:       vector.scevcheck:
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[N]], -1
-; CHECK-NEXT:    [[TMP7:%.*]] = icmp ugt i32 [[TMP0]], 3
-; CHECK-NEXT:    br i1 [[TMP7]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ugt i32 [[TMP0]], 3
+; CHECK-NEXT:    br i1 [[TMP1]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[N]], 2
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]]
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP10:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP11:%.*]] = urem i32 [[TMP10]], 4
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[TMP11]]
-; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[TMP12]], i32 0
-; CHECK-NEXT:    [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <2 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i32>, <2 x i32>* [[TMP14]], align 4
-; CHECK-NEXT:    [[TMP15:%.*]] = add <2 x i32> [[WIDE_LOAD]], <i32 10, i32 10>
-; CHECK-NEXT:    [[TMP16:%.*]] = bitcast i32* [[TMP13]] to <2 x i32>*
-; CHECK-NEXT:    store <2 x i32> [[TMP15]], <2 x i32>* [[TMP16]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = add i32 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP3:%.*]] = urem i32 [[TMP2]], 4
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP5]], align 4
+; CHECK-NEXT:    [[TMP6:%.*]] = add <2 x i32> [[WIDE_LOAD]], <i32 10, i32 10>
+; CHECK-NEXT:    store <2 x i32> [[TMP6]], ptr [[TMP5]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP17:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
@@ -277,13 +271,13 @@ define void @clamped_index_equal_dependence(i32* %A, i32* %B, i32 %N) {
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
 ; CHECK-NEXT:    [[CLAMPED_INDEX:%.*]] = urem i32 [[IV]], 4
-; CHECK-NEXT:    [[GEP_A:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[CLAMPED_INDEX]]
-; CHECK-NEXT:    [[LV_A:%.*]] = load i32, i32* [[GEP_A]], align 4
+; CHECK-NEXT:    [[GEP_A:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[CLAMPED_INDEX]]
+; CHECK-NEXT:    [[LV_A:%.*]] = load i32, ptr [[GEP_A]], align 4
 ; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[LV_A]], 10
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1
-; CHECK-NEXT:    store i32 [[ADD]], i32* [[GEP_A]], align 4
+; CHECK-NEXT:    store i32 [[ADD]], ptr [[GEP_A]], align 4
 ; CHECK-NEXT:    [[COND:%.*]] = icmp eq i32 [[IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP16:![0-9]+]]
+; CHECK-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
 ;
@@ -293,12 +287,12 @@ entry:
 loop:
   %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
   %clamped.index = urem i32 %iv, 4
-  %gep.A = getelementptr inbounds i32, i32* %A, i32 %clamped.index
-  %lv.A = load i32, i32* %gep.A
+  %gep.A = getelementptr inbounds i32, ptr %A, i32 %clamped.index
+  %lv.A = load i32, ptr %gep.A
   %add = add i32 %lv.A, 10
 
   %iv.next = add nuw nsw i32 %iv, 1
-  store i32 %add, i32* %gep.A
+  store i32 %add, ptr %gep.A
   %cond = icmp eq i32 %iv.next, %N
   br i1 %cond, label %exit, label %loop
 
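For reference, the conversion applied throughout these tests follows one mechanical pattern: every typed pointer type (i32*, <2 x i32>*, and so on) becomes the opaque ptr type, and bitcasts whose only purpose was to change the pointee type are dropped. A minimal before/after sketch of that pattern (illustrative, not taken verbatim from this diff):

  ; typed pointers (legacy form)
  %gep = getelementptr inbounds i32, i32* %A, i32 %i
  %cast = bitcast i32* %gep to <2 x i32>*
  %v = load <2 x i32>, <2 x i32>* %cast

  ; opaque pointers (current form)
  %gep = getelementptr inbounds i32, ptr %A, i32 %i
  %v = load <2 x i32>, ptr %gep

Because ptr carries no pointee type, the widened vector load can consume the getelementptr result directly, which is why the bitcast lines in the CHECK blocks above are deleted with no replacement and the remaining temporaries are renumbered.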