[llvm] r358552 - Revert "Temporarily Revert "Add basic loop fusion pass.""

Eric Christopher via llvm-commits llvm-commits@lists.llvm.org
Tue Apr 16 21:53:01 PDT 2019


Added: llvm/trunk/test/Transforms/LoopVectorize/X86/pr35432.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/X86/pr35432.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/X86/pr35432.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/X86/pr35432.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,213 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -loop-vectorize -mtriple=x86_64-unknown-linux-gnu -S < %s | FileCheck %s
+
+; This test checks that there is no assert caused by the issue described in PR35432.
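+; A rough C sketch of the inner loop that gets vectorized (a hypothetical
+; reconstruction from the IR below, not the original source):
+;   char c = (char)s;
+;   do {
+;     a[0][0]++;          // counter stored back to a[0][0] on exit
+;     c--;
+;   } while ((unsigned)i < (unsigned char)c);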
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@a = common local_unnamed_addr global [192 x [192 x i32]] zeroinitializer, align 16
+
+; Function Attrs: nounwind uwtable
+define i32 @main() local_unnamed_addr #0 {
+; CHECK-LABEL: @main(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[I:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[S:%.*]] = alloca i16, align 2
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[I]] to i8*
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull [[TMP0]])
+; CHECK-NEXT:    store i32 0, i32* [[I]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16* [[S]] to i8*
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 2, i8* nonnull [[TMP1]])
+; CHECK-NEXT:    [[CALL:%.*]] = call i32 (i32*, ...) bitcast (i32 (...)* @goo to i32 (i32*, ...)*)(i32* nonnull [[I]])
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[I]], align 4
+; CHECK-NEXT:    [[STOREMERGE6:%.*]] = trunc i32 [[TMP2]] to i16
+; CHECK-NEXT:    store i16 [[STOREMERGE6]], i16* [[S]], align 2
+; CHECK-NEXT:    [[CONV17:%.*]] = and i32 [[TMP2]], 65472
+; CHECK-NEXT:    [[CMP8:%.*]] = icmp eq i32 [[CONV17]], 0
+; CHECK-NEXT:    br i1 [[CMP8]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END12:%.*]]
+; CHECK:       for.body.lr.ph:
+; CHECK-NEXT:    [[TMP3:%.*]] = sub i32 -1, [[TMP2]]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[STOREMERGE_IN9:%.*]] = phi i32 [ [[TMP2]], [[FOR_BODY_LR_PH]] ], [ [[ADD:%.*]], [[FOR_INC9:%.*]] ]
+; CHECK-NEXT:    [[CONV52:%.*]] = and i32 [[STOREMERGE_IN9]], 255
+; CHECK-NEXT:    [[CMP63:%.*]] = icmp ult i32 [[TMP2]], [[CONV52]]
+; CHECK-NEXT:    br i1 [[CMP63]], label [[FOR_BODY8_LR_PH:%.*]], label [[FOR_INC9]]
+; CHECK:       for.body8.lr.ph:
+; CHECK-NEXT:    [[CONV3:%.*]] = trunc i32 [[STOREMERGE_IN9]] to i8
+; CHECK-NEXT:    [[DOTPROMOTED:%.*]] = load i32, i32* getelementptr inbounds ([192 x [192 x i32]], [192 x [192 x i32]]* @a, i64 0, i64 0, i64 0), align 16
+; CHECK-NEXT:    [[TMP4:%.*]] = add i8 [[CONV3]], -1
+; CHECK-NEXT:    [[TMP5:%.*]] = zext i8 [[TMP4]] to i32
+; CHECK-NEXT:    [[TMP6:%.*]] = sub i32 -1, [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp ugt i32 [[TMP3]], [[TMP6]]
+; CHECK-NEXT:    [[UMAX:%.*]] = select i1 [[TMP7]], i32 [[TMP3]], i32 [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = add i32 [[UMAX]], 2
+; CHECK-NEXT:    [[TMP9:%.*]] = add i32 [[TMP8]], [[TMP5]]
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP9]], 8
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
+; CHECK:       vector.scevcheck:
+; CHECK-NEXT:    [[TMP10:%.*]] = add i8 [[CONV3]], -1
+; CHECK-NEXT:    [[TMP11:%.*]] = zext i8 [[TMP10]] to i32
+; CHECK-NEXT:    [[TMP12:%.*]] = sub i32 -1, [[TMP11]]
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp ugt i32 [[TMP3]], [[TMP12]]
+; CHECK-NEXT:    [[UMAX1:%.*]] = select i1 [[TMP13]], i32 [[TMP3]], i32 [[TMP12]]
+; CHECK-NEXT:    [[TMP14:%.*]] = add i32 [[UMAX1]], 1
+; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP14]], [[TMP11]]
+; CHECK-NEXT:    [[TMP16:%.*]] = trunc i32 [[TMP15]] to i8
+; CHECK-NEXT:    [[MUL:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 1, i8 [[TMP16]])
+; CHECK-NEXT:    [[MUL_RESULT:%.*]] = extractvalue { i8, i1 } [[MUL]], 0
+; CHECK-NEXT:    [[MUL_OVERFLOW:%.*]] = extractvalue { i8, i1 } [[MUL]], 1
+; CHECK-NEXT:    [[TMP17:%.*]] = add i8 [[TMP10]], [[MUL_RESULT]]
+; CHECK-NEXT:    [[TMP18:%.*]] = sub i8 [[TMP10]], [[MUL_RESULT]]
+; CHECK-NEXT:    [[TMP19:%.*]] = icmp ugt i8 [[TMP18]], [[TMP10]]
+; CHECK-NEXT:    [[TMP20:%.*]] = icmp ult i8 [[TMP17]], [[TMP10]]
+; CHECK-NEXT:    [[TMP21:%.*]] = select i1 true, i1 [[TMP19]], i1 [[TMP20]]
+; CHECK-NEXT:    [[TMP22:%.*]] = icmp ugt i32 [[TMP15]], 255
+; CHECK-NEXT:    [[TMP23:%.*]] = or i1 [[TMP21]], [[TMP22]]
+; CHECK-NEXT:    [[TMP24:%.*]] = or i1 [[TMP23]], [[MUL_OVERFLOW]]
+; CHECK-NEXT:    [[TMP25:%.*]] = or i1 false, [[TMP24]]
+; CHECK-NEXT:    br i1 [[TMP25]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[TMP9]], 8
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i32 [[TMP9]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[CAST_CRD:%.*]] = trunc i32 [[N_VEC]] to i8
+; CHECK-NEXT:    [[IND_END:%.*]] = sub i8 [[CONV3]], [[CAST_CRD]]
+; CHECK-NEXT:    [[TMP26:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[DOTPROMOTED]], i32 0
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ [[TMP26]], [[VECTOR_PH]] ], [ [[TMP30:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP31:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP27:%.*]] = trunc i32 [[INDEX]] to i8
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = sub i8 [[CONV3]], [[TMP27]]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i8> undef, i8 [[OFFSET_IDX]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLATINSERT]], <4 x i8> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[INDUCTION:%.*]] = add <4 x i8> [[BROADCAST_SPLAT]], <i8 0, i8 -1, i8 -2, i8 -3>
+; CHECK-NEXT:    [[INDUCTION3:%.*]] = add <4 x i8> [[BROADCAST_SPLAT]], <i8 -4, i8 -5, i8 -6, i8 -7>
+; CHECK-NEXT:    [[TMP28:%.*]] = add i8 [[OFFSET_IDX]], 0
+; CHECK-NEXT:    [[TMP29:%.*]] = add i8 [[OFFSET_IDX]], -4
+; CHECK-NEXT:    [[TMP30]] = add <4 x i32> [[VEC_PHI]], <i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT:    [[TMP31]] = add <4 x i32> [[VEC_PHI2]], <i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT:    [[TMP32:%.*]] = add i8 [[TMP28]], -1
+; CHECK-NEXT:    [[TMP33:%.*]] = add i8 [[TMP29]], -1
+; CHECK-NEXT:    [[TMP34:%.*]] = zext i8 [[TMP32]] to i32
+; CHECK-NEXT:    [[TMP35:%.*]] = zext i8 [[TMP33]] to i32
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 8
+; CHECK-NEXT:    [[TMP36:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP36]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = add <4 x i32> [[TMP31]], [[TMP30]]
+; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[BIN_RDX]], <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX4:%.*]] = add <4 x i32> [[BIN_RDX]], [[RDX_SHUF]]
+; CHECK-NEXT:    [[RDX_SHUF5:%.*]] = shufflevector <4 x i32> [[BIN_RDX4]], <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX6:%.*]] = add <4 x i32> [[BIN_RDX4]], [[RDX_SHUF5]]
+; CHECK-NEXT:    [[TMP37:%.*]] = extractelement <4 x i32> [[BIN_RDX6]], i32 0
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[TMP9]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND4_FOR_INC9_CRIT_EDGE:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i8 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[CONV3]], [[FOR_BODY8_LR_PH]] ], [ [[CONV3]], [[VECTOR_SCEVCHECK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[DOTPROMOTED]], [[FOR_BODY8_LR_PH]] ], [ [[DOTPROMOTED]], [[VECTOR_SCEVCHECK]] ], [ [[TMP37]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    br label [[FOR_BODY8:%.*]]
+; CHECK:       for.body8:
+; CHECK-NEXT:    [[INC5:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY8]] ]
+; CHECK-NEXT:    [[C_04:%.*]] = phi i8 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[DEC:%.*]], [[FOR_BODY8]] ]
+; CHECK-NEXT:    [[INC]] = add i32 [[INC5]], 1
+; CHECK-NEXT:    [[DEC]] = add i8 [[C_04]], -1
+; CHECK-NEXT:    [[CONV5:%.*]] = zext i8 [[DEC]] to i32
+; CHECK-NEXT:    [[CMP6:%.*]] = icmp ult i32 [[TMP2]], [[CONV5]]
+; CHECK-NEXT:    br i1 [[CMP6]], label [[FOR_BODY8]], label [[FOR_COND4_FOR_INC9_CRIT_EDGE]], !llvm.loop !2
+; CHECK:       for.cond4.for.inc9_crit_edge:
+; CHECK-NEXT:    [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], [[FOR_BODY8]] ], [ [[TMP37]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    store i32 [[INC_LCSSA]], i32* getelementptr inbounds ([192 x [192 x i32]], [192 x [192 x i32]]* @a, i64 0, i64 0, i64 0), align 16
+; CHECK-NEXT:    br label [[FOR_INC9]]
+; CHECK:       for.inc9:
+; CHECK-NEXT:    [[CONV10:%.*]] = and i32 [[STOREMERGE_IN9]], 65535
+; CHECK-NEXT:    [[ADD]] = add nuw nsw i32 [[CONV10]], 1
+; CHECK-NEXT:    [[CONV1:%.*]] = and i32 [[ADD]], 65472
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CONV1]], 0
+; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_FOR_END12_CRIT_EDGE:%.*]]
+; CHECK:       for.cond.for.end12_crit_edge:
+; CHECK-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_INC9]] ]
+; CHECK-NEXT:    [[STOREMERGE:%.*]] = trunc i32 [[ADD_LCSSA]] to i16
+; CHECK-NEXT:    store i16 [[STOREMERGE]], i16* [[S]], align 2
+; CHECK-NEXT:    br label [[FOR_END12]]
+; CHECK:       for.end12:
+; CHECK-NEXT:    [[CALL13:%.*]] = call i32 (i16*, ...) bitcast (i32 (...)* @foo to i32 (i16*, ...)*)(i16* nonnull [[S]])
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 2, i8* nonnull [[TMP1]])
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull [[TMP0]])
+; CHECK-NEXT:    ret i32 0
+;
+entry:
+  %i = alloca i32, align 4
+  %s = alloca i16, align 2
+  %0 = bitcast i32* %i to i8*
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %0) #3
+  store i32 0, i32* %i, align 4
+  %1 = bitcast i16* %s to i8*
+  call void @llvm.lifetime.start.p0i8(i64 2, i8* nonnull %1) #3
+  %call = call i32 (i32*, ...) bitcast (i32 (...)* @goo to i32 (i32*, ...)*)(i32* nonnull %i) #3
+  %2 = load i32, i32* %i, align 4
+  %storemerge6 = trunc i32 %2 to i16
+  store i16 %storemerge6, i16* %s, align 2
+  %conv17 = and i32 %2, 65472
+  %cmp8 = icmp eq i32 %conv17, 0
+  br i1 %cmp8, label %for.body.lr.ph, label %for.end12
+
+for.body.lr.ph:                                   ; preds = %entry
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.lr.ph, %for.inc9
+  %storemerge.in9 = phi i32 [ %2, %for.body.lr.ph ], [ %add, %for.inc9 ]
+  %conv52 = and i32 %storemerge.in9, 255
+  %cmp63 = icmp ult i32 %2, %conv52
+  br i1 %cmp63, label %for.body8.lr.ph, label %for.inc9
+
+for.body8.lr.ph:                                  ; preds = %for.body
+  %conv3 = trunc i32 %storemerge.in9 to i8
+  %.promoted = load i32, i32* getelementptr inbounds ([192 x [192 x i32]], [192 x [192 x i32]]* @a, i64 0, i64 0, i64 0), align 16
+  br label %for.body8
+
+for.body8:                                        ; preds = %for.body8.lr.ph, %for.body8
+  %inc5 = phi i32 [ %.promoted, %for.body8.lr.ph ], [ %inc, %for.body8 ]
+  %c.04 = phi i8 [ %conv3, %for.body8.lr.ph ], [ %dec, %for.body8 ]
+  %inc = add i32 %inc5, 1
+  %dec = add i8 %c.04, -1
+  %conv5 = zext i8 %dec to i32
+  %cmp6 = icmp ult i32 %2, %conv5
+  br i1 %cmp6, label %for.body8, label %for.cond4.for.inc9_crit_edge
+
+for.cond4.for.inc9_crit_edge:                     ; preds = %for.body8
+  %inc.lcssa = phi i32 [ %inc, %for.body8 ]
+  store i32 %inc.lcssa, i32* getelementptr inbounds ([192 x [192 x i32]], [192 x [192 x i32]]* @a, i64 0, i64 0, i64 0), align 16
+  br label %for.inc9
+
+for.inc9:                                         ; preds = %for.cond4.for.inc9_crit_edge, %for.body
+  %conv10 = and i32 %storemerge.in9, 65535
+  %add = add nuw nsw i32 %conv10, 1
+  %conv1 = and i32 %add, 65472
+  %cmp = icmp eq i32 %conv1, 0
+  br i1 %cmp, label %for.body, label %for.cond.for.end12_crit_edge
+
+for.cond.for.end12_crit_edge:                     ; preds = %for.inc9
+  %add.lcssa = phi i32 [ %add, %for.inc9 ]
+  %storemerge = trunc i32 %add.lcssa to i16
+  store i16 %storemerge, i16* %s, align 2
+  br label %for.end12
+
+for.end12:                                        ; preds = %for.cond.for.end12_crit_edge, %entry
+  %call13 = call i32 (i16*, ...) bitcast (i32 (...)* @foo to i32 (i16*, ...)*)(i16* nonnull %s) #3
+  call void @llvm.lifetime.end.p0i8(i64 2, i8* nonnull %1) #3
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %0) #3
+  ret i32 0
+}
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #1
+
+declare i32 @goo(...) local_unnamed_addr #2
+
+declare i32 @foo(...) local_unnamed_addr #2
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #1

Added: llvm/trunk/test/Transforms/LoopVectorize/X86/pr36524.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/X86/pr36524.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/X86/pr36524.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/X86/pr36524.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,39 @@
+; RUN: opt -S -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128-ni:1"
+
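+; A C-like sketch of the loop (hypothetical): a 64-bit counter whose low 32
+; bits drive the increment and whose truncation drives the exit test:
+;   for (unsigned long long i = 2; (int)i <= 80; i = (i & 0xffffffffULL) + 1)
+;     ;
+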
+define void @foo() {
+; CHECK-LABEL: @foo(
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY:%.*]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 2, i64 3, i64 4, i64 5>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = add i64 2, [[INDEX]]
+; CHECK-NEXT:    [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT:    [[TMP8:%.*]] = add i64 [[OFFSET_IDX]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = add i64 [[OFFSET_IDX]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = add i64 [[OFFSET_IDX]], 3
+; CHECK-NEXT:    [[OFFSET_IDX1:%.*]] = add i64 2, [[INDEX]]
+; CHECK-NEXT:    [[TMP11:%.*]] = trunc i64 [[OFFSET_IDX1]] to i32
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[TMP11]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP12:%.*]] = add i32 [[TMP11]], 0
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 80
+; CHECK-NEXT:    br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
+;
+entry:
+  br label %loop
+
+loop:
+  %0 = phi i64 [ 2, %entry ], [ %3, %loop ]
+  %1 = and i64 %0, 4294967295
+  %2 = trunc i64 %0 to i32
+  %3 = add nuw nsw i64 %1, 1
+  %4 = icmp sgt i32 %2, 80
+  br i1 %4, label %exit, label %loop
+
+exit:
+  ret void
+}

Added: llvm/trunk/test/Transforms/LoopVectorize/X86/pr39160.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/X86/pr39160.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/X86/pr39160.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/X86/pr39160.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,98 @@
+; RUN: opt -loop-vectorize -S < %s 2>&1 | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128-ni:1"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Make sure that we can compile the test without crashing.
+define void @barney() {
+
+; CHECK-LABEL: @barney(
+; CHECK:       middle.block:
+
+bb:
+  br label %bb2
+
+bb2:                                              ; preds = %bb2, %bb
+  %tmp4 = icmp slt i32 undef, 0
+  br i1 %tmp4, label %bb2, label %bb5
+
+bb5:                                              ; preds = %bb2
+  br label %bb19
+
+bb18:                                             ; preds = %bb33
+  ret void
+
+bb19:                                             ; preds = %bb36, %bb5
+  %tmp21 = phi i64 [ undef, %bb36 ], [ 2, %bb5 ]
+  %tmp22 = phi i32 [ %tmp65, %bb36 ], [ undef, %bb5 ]
+  br label %bb50
+
+bb33:                                             ; preds = %bb62
+  br i1 undef, label %bb18, label %bb36
+
+bb36:                                             ; preds = %bb33
+  br label %bb19
+
+bb46:                                             ; preds = %bb50
+  br i1 undef, label %bb48, label %bb59
+
+bb48:                                             ; preds = %bb46
+  %tmp49 = add i32 %tmp52, 14
+  ret void
+
+bb50:                                             ; preds = %bb50, %bb19
+  %tmp52 = phi i32 [ %tmp55, %bb50 ], [ %tmp22, %bb19 ]
+  %tmp53 = phi i64 [ %tmp56, %bb50 ], [ 1, %bb19 ]
+  %tmp54 = add i32 %tmp52, 12
+  %tmp55 = add i32 %tmp52, 13
+  %tmp56 = add nuw nsw i64 %tmp53, 1
+  %tmp58 = icmp ult i64 %tmp53, undef
+  br i1 %tmp58, label %bb50, label %bb46
+
+bb59:                                             ; preds = %bb46
+  br label %bb62
+
+bb62:                                             ; preds = %bb68, %bb59
+  %tmp63 = phi i32 [ %tmp65, %bb68 ], [ %tmp55, %bb59 ]
+  %tmp64 = phi i64 [ %tmp66, %bb68 ], [ %tmp56, %bb59 ]
+  %tmp65 = add i32 %tmp63, 13
+  %tmp66 = add nuw nsw i64 %tmp64, 1
+  %tmp67 = icmp ult i64 %tmp66, %tmp21
+  br i1 %tmp67, label %bb68, label %bb33
+
+bb68:                                             ; preds = %bb62
+  br label %bb62
+}
+
+define i32 @foo(i32 addrspace(1)* %p) {
+
+; CHECK-LABEL: foo
+; CHECK:       middle.block:
+
+entry:
+  br label %outer
+
+outer:                                            ; preds = %outer_latch, %entry
+  %iv = phi i64 [ 2, %entry ], [ %iv.next, %outer_latch ]
+  br label %inner
+
+inner:                                            ; preds = %inner, %outer
+  %0 = phi i32 [ %2, %inner ], [ 0, %outer ]
+  %a = phi i32 [ %3, %inner ], [ 1, %outer ]
+  %b = phi i32 [ %1, %inner ], [ 6, %outer ]
+  %1 = add i32 %b, 2
+  %2 = or i32 %0, %b
+  %3 = add nuw nsw i32 %a, 1
+  %4 = zext i32 %3 to i64
+  %5 = icmp ugt i64 %iv, %4
+  br i1 %5, label %inner, label %outer_latch
+
+outer_latch:                                      ; preds = %inner
+  store atomic i32 %2, i32 addrspace(1)* %p unordered, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %6 = icmp ugt i64 %iv, 63
+  br i1 %6, label %exit, label %outer
+
+exit:                                             ; preds = %outer_latch
+  ret i32 0
+}

Added: llvm/trunk/test/Transforms/LoopVectorize/X86/propagate-metadata.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/X86/propagate-metadata.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/X86/propagate-metadata.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/X86/propagate-metadata.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,25 @@
+; RUN: opt -S -mtriple="x86_64-unknown-linux-gnu" -loop-vectorize < %s | FileCheck %s
+
+; Don't crash on unknown metadata
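+; A C++-style sketch (hypothetical; the identifiers in the IR below suggest a
+; std::copy-like loop whose scalar loads carry !range metadata):
+;   while (first != last)
+;     *result++ = *first++;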
+; CHECK-LABEL: @no_propagate_range_metadata(
+; CHECK: load <16 x i8>
+; CHECK: store <16 x i8>
+define void @no_propagate_range_metadata(i8* readonly %first.coerce, i8* readnone %last.coerce, i8* nocapture %result) {
+for.body.preheader:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %for.body.preheader
+  %result.addr.05 = phi i8* [ %incdec.ptr, %for.body ], [ %result, %for.body.preheader ]
+  %first.sroa.0.04 = phi i8* [ %incdec.ptr.i.i.i, %for.body ], [ %first.coerce, %for.body.preheader ]
+  %0 = load i8, i8* %first.sroa.0.04, align 1, !range !0
+  store i8 %0, i8* %result.addr.05, align 1
+  %incdec.ptr.i.i.i = getelementptr inbounds i8, i8* %first.sroa.0.04, i64 1
+  %incdec.ptr = getelementptr inbounds i8, i8* %result.addr.05, i64 1
+  %lnot.i = icmp eq i8* %incdec.ptr.i.i.i, %last.coerce
+  br i1 %lnot.i, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:                                 ; preds = %for.body
+  ret void
+}
+
+!0 = !{i8 0, i8 2}

Added: llvm/trunk/test/Transforms/LoopVectorize/X86/ptr-indvar-crash.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/X86/ptr-indvar-crash.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/X86/ptr-indvar-crash.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/X86/ptr-indvar-crash.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,20 @@
+; RUN: opt -loop-vectorize -S %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
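+; A C-like sketch of the loop (hypothetical): a pointer IV advancing while a
+; 128-bit counter is decremented to zero:
+;   while ((p1 -= 2) != 0)
+;     p += 2;
+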
+define void @f(i128 %p1) {
+entry:
+  br label %while.body
+
+while.body:
+  %p.05 = phi i8* [ %add.ptr, %while.body ], [ null, %entry ]
+  %p1.addr.04 = phi i128 [ %sub, %while.body ], [ %p1, %entry ]
+  %add.ptr = getelementptr inbounds i8, i8* %p.05, i32 2
+  %sub = add nsw i128 %p1.addr.04, -2
+  %tobool = icmp eq i128 %sub, 0
+  br i1 %tobool, label %while.end, label %while.body
+
+while.end:
+  ret void
+}

Added: llvm/trunk/test/Transforms/LoopVectorize/X86/rauw-bug.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/X86/rauw-bug.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/X86/rauw-bug.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/X86/rauw-bug.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,33 @@
+; RUN: opt -slp-vectorizer -S %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32:64-S128"
+target triple = "x86_64-apple-macosx"
+
+; This test used to fail under libgmalloc because we would try to access a
+; pointer that was already deleted.
+;
+; llvm-lit -v --param use_gmalloc=1 --param
+;   gmalloc_path=/usr/lib/libgmalloc.dylib
+;   test/Transforms/LoopVectorize/X86/rauw-bug.ll
+;
+; radar://15498655
+
+; CHECK: reduced
+define void @reduced()  {
+entry:
+  br i1 undef, label %while.body, label %while.cond63.preheader.while.end76_crit_edge
+
+while.cond63.preheader.while.end76_crit_edge:
+  ret void
+
+while.body:
+  %d2_fx.015 = phi double [ %sub52, %while.body ], [ undef, %entry ]
+  %d2_fy.014 = phi double [ %sub58, %while.body ], [ undef, %entry ]
+  %d3_fy.013 = phi double [ %div56, %while.body ], [ undef, %entry ]
+  %d3_fx.012 = phi double [ %div50, %while.body ], [ undef, %entry ]
+  %div50 = fmul double %d3_fx.012, 1.250000e-01
+  %sub52 = fsub double 0.000000e+00, %div50
+  %div56 = fmul double %d3_fy.013, 1.250000e-01
+  %sub58 = fsub double 0.000000e+00, %div56
+  br label %while.body
+}

Added: llvm/trunk/test/Transforms/LoopVectorize/X86/reduction-crash.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/X86/reduction-crash.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/X86/reduction-crash.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/X86/reduction-crash.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,35 @@
+; RUN: opt -S -loop-vectorize -mcpu=prescott -disable-basicaa < %s | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32-n8:16:32-S128"
+target triple = "i386-apple-darwin"
+
+; PR15344
+define void @test1(float* nocapture %arg, i32 %arg1) nounwind {
+; CHECK-LABEL: @test1(
+; CHECK: preheader
+; CHECK: insertelement <2 x double> zeroinitializer, double %tmp, i32 0
+; CHECK: vector.memcheck
+
+bb:
+  br label %bb2
+
+bb2:                                              ; preds = %bb
+  %tmp = load double, double* null, align 8
+  br i1 undef, label %bb3, label %bb12
+
+bb3:                                              ; preds = %bb3, %bb2
+  %tmp4 = phi double [ %tmp9, %bb3 ], [ %tmp, %bb2 ]
+  %tmp5 = phi i32 [ %tmp8, %bb3 ], [ 0, %bb2 ]
+  %tmp6 = getelementptr inbounds [16 x double], [16 x double]* undef, i32 0, i32 %tmp5
+  %tmp7 = load double, double* %tmp6, align 4
+  %tmp8 = add nsw i32 %tmp5, 1
+  %tmp9 = fadd fast double %tmp4, undef
+  %tmp10 = getelementptr inbounds float, float* %arg, i32 %tmp5
+  store float undef, float* %tmp10, align 4
+  %tmp11 = icmp eq i32 %tmp8, %arg1
+  br i1 %tmp11, label %bb12, label %bb3
+
+bb12:                                             ; preds = %bb3, %bb2
+  %tmp13 = phi double [ %tmp, %bb2 ], [ %tmp9, %bb3 ]
+  ret void
+}

Added: llvm/trunk/test/Transforms/LoopVectorize/X86/reduction-fastmath.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/X86/reduction-fastmath.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/X86/reduction-fastmath.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/X86/reduction-fastmath.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,112 @@
+; RUN: opt -S -loop-vectorize < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
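+; All four functions below run the same float sum reduction; only the
+; fast-math flags on the fadd differ. A C-like sketch (hypothetical):
+;   float sum = 0.0f;
+;   for (int i = 0; i != 4096; ++i)
+;     sum += array[i];
+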
+define float @reduction_sum_float_ieee(i32 %n, float* %array) {
+; CHECK-LABEL: define float @reduction_sum_float_ieee(
+entry:
+  %entry.cond = icmp ne i32 0, 4096
+  br i1 %entry.cond, label %loop, label %loop.exit
+
+loop:
+  %idx = phi i32 [ 0, %entry ], [ %idx.inc, %loop ]
+  %sum = phi float [ 0.000000e+00, %entry ], [ %sum.inc, %loop ]
+  %address = getelementptr float, float* %array, i32 %idx
+  %value = load float, float* %address
+  %sum.inc = fadd float %sum, %value
+  %idx.inc = add i32 %idx, 1
+  %be.cond = icmp ne i32 %idx.inc, 4096
+  br i1 %be.cond, label %loop, label %loop.exit
+
+loop.exit:
+  %sum.lcssa = phi float [ %sum.inc, %loop ], [ 0.000000e+00, %entry ]
+; CHECK-NOT: %wide.load = load <4 x float>, <4 x float>*
+; CHECK: ret float %sum.lcssa
+  ret float %sum.lcssa
+}
+
+define float @reduction_sum_float_fastmath(i32 %n, float* %array) {
+; CHECK-LABEL: define float @reduction_sum_float_fastmath(
+; CHECK: fadd fast <4 x float>
+; CHECK: fadd fast <4 x float>
+; CHECK: fadd fast <4 x float>
+; CHECK: fadd fast <4 x float>
+; CHECK: fadd fast <4 x float>
+entry:
+  %entry.cond = icmp ne i32 0, 4096
+  br i1 %entry.cond, label %loop, label %loop.exit
+
+loop:
+  %idx = phi i32 [ 0, %entry ], [ %idx.inc, %loop ]
+  %sum = phi float [ 0.000000e+00, %entry ], [ %sum.inc, %loop ]
+  %address = getelementptr float, float* %array, i32 %idx
+  %value = load float, float* %address
+  %sum.inc = fadd fast float %sum, %value
+  %idx.inc = add i32 %idx, 1
+  %be.cond = icmp ne i32 %idx.inc, 4096
+  br i1 %be.cond, label %loop, label %loop.exit
+
+loop.exit:
+  %sum.lcssa = phi float [ %sum.inc, %loop ], [ 0.000000e+00, %entry ]
+; CHECK: ret float %sum.lcssa
+  ret float %sum.lcssa
+}
+
+define float @reduction_sum_float_only_reassoc(i32 %n, float* %array) {
+; CHECK-LABEL: define float @reduction_sum_float_only_reassoc(
+; CHECK-NOT: fadd fast
+; CHECK: fadd reassoc <4 x float>
+; CHECK: fadd reassoc <4 x float>
+; CHECK: fadd reassoc <4 x float>
+; CHECK: fadd reassoc <4 x float>
+; CHECK: fadd reassoc <4 x float>
+
+entry:
+  %entry.cond = icmp ne i32 0, 4096
+  br i1 %entry.cond, label %loop, label %loop.exit
+
+loop:
+  %idx = phi i32 [ 0, %entry ], [ %idx.inc, %loop ]
+  %sum = phi float [ 0.000000e+00, %entry ], [ %sum.inc, %loop ]
+  %address = getelementptr float, float* %array, i32 %idx
+  %value = load float, float* %address
+  %sum.inc = fadd reassoc float %sum, %value
+  %idx.inc = add i32 %idx, 1
+  %be.cond = icmp ne i32 %idx.inc, 4096
+  br i1 %be.cond, label %loop, label %loop.exit
+
+loop.exit:
+  %sum.lcssa = phi float [ %sum.inc, %loop ], [ 0.000000e+00, %entry ]
+; CHECK: ret float %sum.lcssa
+  ret float %sum.lcssa
+}
+
+define float @reduction_sum_float_only_reassoc_and_contract(i32 %n, float* %array) {
+; CHECK-LABEL: define float @reduction_sum_float_only_reassoc_and_contract(
+; CHECK-NOT: fadd fast
+; CHECK: fadd reassoc contract <4 x float>
+; CHECK: fadd reassoc contract <4 x float>
+; CHECK: fadd reassoc contract <4 x float>
+; CHECK: fadd reassoc contract <4 x float>
+; CHECK: fadd reassoc contract <4 x float>
+
+entry:
+  %entry.cond = icmp ne i32 0, 4096
+  br i1 %entry.cond, label %loop, label %loop.exit
+
+loop:
+  %idx = phi i32 [ 0, %entry ], [ %idx.inc, %loop ]
+  %sum = phi float [ 0.000000e+00, %entry ], [ %sum.inc, %loop ]
+  %address = getelementptr float, float* %array, i32 %idx
+  %value = load float, float* %address
+  %sum.inc = fadd reassoc contract float %sum, %value
+  %idx.inc = add i32 %idx, 1
+  %be.cond = icmp ne i32 %idx.inc, 4096
+  br i1 %be.cond, label %loop, label %loop.exit
+
+loop.exit:
+  %sum.lcssa = phi float [ %sum.inc, %loop ], [ 0.000000e+00, %entry ]
+; CHECK: ret float %sum.lcssa
+  ret float %sum.lcssa
+}

Added: llvm/trunk/test/Transforms/LoopVectorize/X86/reduction-small-size.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/X86/reduction-small-size.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/X86/reduction-small-size.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/X86/reduction-small-size.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,80 @@
+; REQUIRES: asserts
+; RUN: opt -loop-vectorize -mcpu=core-avx2 -force-vector-interleave=1 -dce -instcombine -debug-only=loop-vectorize -S < %s 2>&1 | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+; Make sure we ignore the costs of the redundant reduction casts
+; char reduction_i8(char *a, char *b, int n) {
+;   char sum = 0;
+;   for (int i = 0; i < n; ++i)
+;     sum += (a[i] + b[i]);
+;   return sum;
+; }
+;
+
+; CHECK-LABEL: reduction_i8
+; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For instruction:   %{{.*}} = phi
+; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For instruction:   %{{.*}} = phi
+; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For instruction:   %{{.*}} = getelementptr
+; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For instruction:   %{{.*}} = load
+; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For instruction:   %{{.*}} = zext i8 %{{.*}} to i32
+; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For instruction:   %{{.*}} = getelementptr
+; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For instruction:   %{{.*}} = load
+; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For instruction:   %{{.*}} = zext i8 %{{.*}} to i32
+; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For instruction:   %{{.*}} = and i32 %{{.*}}, 255
+; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For instruction:   %{{.*}} = add
+; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For instruction:   %{{.*}} = add
+; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For instruction:   %{{.*}} = add
+; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For instruction:   %{{.*}} = trunc
+; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For instruction:   %{{.*}} = icmp
+; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For instruction:   br
+; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction:   %{{.*}} = phi
+; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction:   %{{.*}} = phi
+; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction:   %{{.*}} = getelementptr
+; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction:   %{{.*}} = load
+; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction:   %{{.*}} = zext i8 %{{.*}} to i32
+; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction:   %{{.*}} = getelementptr
+; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction:   %{{.*}} = load
+; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction:   %{{.*}} = zext i8 %{{.*}} to i32
+; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction:   %{{.*}} = and i32 %{{.*}}, 255
+; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction:   %{{.*}} = add
+; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction:   %{{.*}} = add
+; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction:   %{{.*}} = add
+; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction:   %{{.*}} = trunc
+; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction:   %{{.*}} = icmp
+; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction:   br
+;
+define i8 @reduction_i8(i8* nocapture readonly %a, i8* nocapture readonly %b, i32 %n) {
+entry:
+  %cmp.12 = icmp sgt i32 %n, 0
+  br i1 %cmp.12, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+  br label %for.body
+
+for.cond.for.cond.cleanup_crit_edge:
+  %add5.lcssa = phi i32 [ %add5, %for.body ]
+  %conv6 = trunc i32 %add5.lcssa to i8
+  br label %for.cond.cleanup
+
+for.cond.cleanup:
+  %sum.0.lcssa = phi i8 [ %conv6, %for.cond.for.cond.cleanup_crit_edge ], [ 0, %entry ]
+  ret i8 %sum.0.lcssa
+
+for.body:
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
+  %sum.013 = phi i32 [ %add5, %for.body ], [ 0, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i8, i8* %a, i64 %indvars.iv
+  %0 = load i8, i8* %arrayidx, align 1
+  %conv = zext i8 %0 to i32
+  %arrayidx2 = getelementptr inbounds i8, i8* %b, i64 %indvars.iv
+  %1 = load i8, i8* %arrayidx2, align 1
+  %conv3 = zext i8 %1 to i32
+  %conv4 = and i32 %sum.013, 255
+  %add = add nuw nsw i32 %conv, %conv4
+  %add5 = add nuw nsw i32 %add, %conv3
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.cond.for.cond.cleanup_crit_edge, label %for.body
+}

Added: llvm/trunk/test/Transforms/LoopVectorize/X86/redundant-vf2-cost.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/X86/redundant-vf2-cost.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/X86/redundant-vf2-cost.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/X86/redundant-vf2-cost.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,34 @@
+; RUN: opt < %s -loop-vectorize -mtriple x86_64 -debug -disable-output 2>&1 | FileCheck %s
+; REQUIRES: asserts
+
+; Check that the cost model is not executed twice for VF=2 when vectorization
+; is forced for a particular loop.
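+; A C sketch of the loop (hypothetical), with vectorization forced via
+; llvm.loop.vectorize.enable metadata:
+;   for (unsigned i = 0; i != n; ++i)
+;     A[i] += 1;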
+
+; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction:   %{{[0-9]+}} = load i32
+; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction:   store i32
+; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction:   %{{[0-9]+}} = load i32
+; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction:   store i32
+; CHECK: LV: Vector loop of width 2 costs: {{[0-9]+}}.
+
+define i32 @foo(i32* %A, i32 %n) {
+entry:
+  %cmp3.i = icmp eq i32 %n, 0
+  br i1 %cmp3.i, label %exit, label %for.body.i
+
+for.body.i:
+  %iv = phi i32 [ %add.i, %for.body.i ], [ 0, %entry ]
+  %ld_addr = getelementptr inbounds i32, i32* %A, i32 %iv
+  %0 = load i32, i32* %ld_addr, align 4
+  %val = add i32 %0, 1
+  store i32 %val, i32* %ld_addr, align 4
+  %add.i = add nsw i32 %iv, 1
+  %cmp.i = icmp eq i32 %add.i, %n
+  br i1 %cmp.i, label %exit, label %for.body.i, !llvm.loop !0
+
+exit:
+  %__init.addr.0.lcssa.i = phi i32 [ 0, %entry ], [ %add.i, %for.body.i ]
+  ret i32 %__init.addr.0.lcssa.i
+}
+
+!0 = !{!0, !1}
+!1 = !{!"llvm.loop.vectorize.enable", i1 true}

Added: llvm/trunk/test/Transforms/LoopVectorize/X86/reg-usage-debug.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/X86/reg-usage-debug.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/X86/reg-usage-debug.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/X86/reg-usage-debug.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,134 @@
+; RUN: opt < %s -debug-only=loop-vectorize -loop-vectorize -mtriple=x86_64-unknown-linux -S 2>&1 | FileCheck %s
+; REQUIRES: asserts
+
+; Test that the register usage estimation is not affected by the presence of
+; debug intrinsics.
+;
+; In the test below, the live ranges of the values %0 and %r.08 end at the add
+; instruction preceding the call to the intrinsic, and they will be recorded
+; against the index of the call instruction. This means the debug intrinsic
+; must be considered when erasing instructions from the list of open intervals.
+;
+; Tests generated from the following source (with and without -g):
+
+; unsigned test(unsigned *a, unsigned n) {
+;   unsigned i, r = 0;
+;   for(i = 0; i < n; i++)
+;     r += a[i];
+;   return r;
+; }
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; CHECK: LV: Checking a loop in "test_g"
+; CHECK: LV(REG): Found max usage: 2
+
+define i32 @test_g(i32* nocapture readonly %a, i32 %n) local_unnamed_addr !dbg !6 {
+entry:
+  tail call void @llvm.dbg.value(metadata i32* %a, i64 0, metadata !12, metadata !16), !dbg !17
+  tail call void @llvm.dbg.value(metadata i32 %n, i64 0, metadata !13, metadata !16), !dbg !18
+  tail call void @llvm.dbg.value(metadata i32 0, i64 0, metadata !15, metadata !16), !dbg !19
+  tail call void @llvm.dbg.value(metadata i32 0, i64 0, metadata !14, metadata !16), !dbg !20
+  tail call void @llvm.dbg.value(metadata i32 0, i64 0, metadata !15, metadata !16), !dbg !19
+  tail call void @llvm.dbg.value(metadata i32 0, i64 0, metadata !14, metadata !16), !dbg !20
+  %cmp6 = icmp eq i32 %n, 0, !dbg !21
+  br i1 %cmp6, label %for.end, label %for.body.preheader, !dbg !25
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %n to i64, !dbg !21
+  br label %for.body, !dbg !27
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
+  %r.08 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv, !dbg !27
+  %0 = load i32, i32* %arrayidx, align 4, !dbg !27, !tbaa !28
+  %add = add i32 %0, %r.08, !dbg !32
+  tail call void @llvm.dbg.value(metadata i32 %add, i64 0, metadata !15, metadata !16), !dbg !19
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1, !dbg !33
+  tail call void @llvm.dbg.value(metadata i32 %add, i64 0, metadata !15, metadata !16), !dbg !19
+  %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count, !dbg !21
+  br i1 %exitcond, label %for.end.loopexit, label %for.body, !dbg !25, !llvm.loop !35
+
+for.end.loopexit:                                 ; preds = %for.body
+  br label %for.end, !dbg !38
+
+for.end:                                          ; preds = %for.end.loopexit, %entry
+  %r.0.lcssa = phi i32 [ 0, %entry ], [ %add, %for.end.loopexit ]
+  ret i32 %r.0.lcssa, !dbg !38
+}
+
+; CHECK: LV: Checking a loop in "test"
+; CHECK: LV(REG): Found max usage: 2
+
+define i32 @test(i32* nocapture readonly %a, i32 %n) local_unnamed_addr {
+entry:
+  %cmp6 = icmp eq i32 %n, 0
+  br i1 %cmp6, label %for.end, label %for.body.preheader
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %n to i64
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
+  %r.08 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4, !tbaa !28
+  %add = add i32 %0, %r.08
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:                                 ; preds = %for.body
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %entry
+  %r.0.lcssa = phi i32 [ 0, %entry ], [ %add, %for.end.loopexit ]
+  ret i32 %r.0.lcssa
+}
+
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata)
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
+!1 = !DIFile(filename: "test.c", directory: "")
+!2 = !{}
+!3 = !{i32 2, !"Dwarf Version", i32 4}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!6 = distinct !DISubprogram(name: "test_g", scope: !1, file: !1, line: 1, type: !7, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !11)
+!7 = !DISubroutineType(types: !8)
+!8 = !{!9, !10, !9}
+!9 = !DIBasicType(name: "unsigned int", size: 32, encoding: DW_ATE_unsigned)
+!10 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !9, size: 64)
+!11 = !{!12, !13, !14, !15}
+!12 = !DILocalVariable(name: "a", arg: 1, scope: !6, file: !1, line: 1, type: !10)
+!13 = !DILocalVariable(name: "n", arg: 2, scope: !6, file: !1, line: 1, type: !9)
+!14 = !DILocalVariable(name: "i", scope: !6, file: !1, line: 2, type: !9)
+!15 = !DILocalVariable(name: "r", scope: !6, file: !1, line: 2, type: !9)
+!16 = !DIExpression()
+!17 = !DILocation(line: 1, column: 27, scope: !6)
+!18 = !DILocation(line: 1, column: 39, scope: !6)
+!19 = !DILocation(line: 2, column: 15, scope: !6)
+!20 = !DILocation(line: 2, column: 12, scope: !6)
+!21 = !DILocation(line: 3, column: 16, scope: !22)
+!22 = !DILexicalBlockFile(scope: !23, file: !1, discriminator: 1)
+!23 = distinct !DILexicalBlock(scope: !24, file: !1, line: 3, column: 3)
+!24 = distinct !DILexicalBlock(scope: !6, file: !1, line: 3, column: 3)
+!25 = !DILocation(line: 3, column: 3, scope: !26)
+!26 = !DILexicalBlockFile(scope: !24, file: !1, discriminator: 1)
+!27 = !DILocation(line: 4, column: 10, scope: !23)
+!28 = !{!29, !29, i64 0}
+!29 = !{!"int", !30, i64 0}
+!30 = !{!"omnipotent char", !31, i64 0}
+!31 = !{!"Simple C/C++ TBAA"}
+!32 = !DILocation(line: 4, column: 7, scope: !23)
+!33 = !DILocation(line: 3, column: 22, scope: !34)
+!34 = !DILexicalBlockFile(scope: !23, file: !1, discriminator: 2)
+!35 = distinct !{!35, !36, !37}
+!36 = !DILocation(line: 3, column: 3, scope: !24)
+!37 = !DILocation(line: 4, column: 13, scope: !24)
+!38 = !DILocation(line: 5, column: 3, scope: !6)

Added: llvm/trunk/test/Transforms/LoopVectorize/X86/reg-usage.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/X86/reg-usage.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/X86/reg-usage.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/X86/reg-usage.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,135 @@
+; RUN: opt < %s -debug-only=loop-vectorize -loop-vectorize -vectorizer-maximize-bandwidth -O2 -mtriple=x86_64-unknown-linux -S 2>&1 | FileCheck %s
+; RUN: opt < %s -debug-only=loop-vectorize -loop-vectorize -vectorizer-maximize-bandwidth -O2 -mtriple=x86_64-unknown-linux -mattr=+avx512f -S 2>&1 | FileCheck %s --check-prefix=AVX512F
+; REQUIRES: asserts
+
+@a = global [1024 x i8] zeroinitializer, align 16
+@b = global [1024 x i8] zeroinitializer, align 16
+
+define i32 @foo() {
+; This function has a loop with a SAD (sum of absolute differences) pattern.
+; Here we check that when VF = 16 the register usage doesn't exceed 16.
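+; A C sketch of the SAD loop (hypothetical reconstruction from the IR below):
+;   int s = 0;
+;   for (int i = 0; i < 1024; ++i)
+;     s += abs(a[i] - b[i]);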
+;
+; CHECK-LABEL: foo
+; CHECK:      LV(REG): VF = 8
+; CHECK-NEXT: LV(REG): Found max usage: 7
+; CHECK:      LV(REG): VF = 16
+; CHECK-NEXT: LV(REG): Found max usage: 13
+
+entry:
+  br label %for.body
+
+for.cond.cleanup:
+  %add.lcssa = phi i32 [ %add, %for.body ]
+  ret i32 %add.lcssa
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %s.015 = phi i32 [ 0, %entry ], [ %add, %for.body ]
+  %arrayidx = getelementptr inbounds [1024 x i8], [1024 x i8]* @a, i64 0, i64 %indvars.iv
+  %0 = load i8, i8* %arrayidx, align 1
+  %conv = zext i8 %0 to i32
+  %arrayidx2 = getelementptr inbounds [1024 x i8], [1024 x i8]* @b, i64 0, i64 %indvars.iv
+  %1 = load i8, i8* %arrayidx2, align 1
+  %conv3 = zext i8 %1 to i32
+  %sub = sub nsw i32 %conv, %conv3
+  %ispos = icmp sgt i32 %sub, -1
+  %neg = sub nsw i32 0, %sub
+  %2 = select i1 %ispos, i32 %sub, i32 %neg
+  %add = add nsw i32 %2, %s.015
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1024
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+define i32 @goo() {
+; Since %indvars.iv is used in a computation chain that only feeds getelementptr
+; or cmp instructions, it will not have a vector version, so the vector register
+; usage will not exceed the number of available vector registers.
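+; A C sketch (hypothetical reconstruction; the IV only feeds the address
+; computations):
+;   int s = 0;
+;   for (long i = 0; i < 1024; ++i)
+;     s += abs(a[i + 3] - b[i + 2]);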
+; CHECK-LABEL: goo
+; CHECK:      LV(REG): VF = 8
+; CHECK-NEXT: LV(REG): Found max usage: 7
+; CHECK:      LV(REG): VF = 16
+; CHECK-NEXT: LV(REG): Found max usage: 13
+entry:
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body
+  %add.lcssa = phi i32 [ %add, %for.body ]
+  ret i32 %add.lcssa
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %s.015 = phi i32 [ 0, %entry ], [ %add, %for.body ]
+  %tmp1 = add nsw i64 %indvars.iv, 3
+  %arrayidx = getelementptr inbounds [1024 x i8], [1024 x i8]* @a, i64 0, i64 %tmp1
+  %tmp = load i8, i8* %arrayidx, align 1
+  %conv = zext i8 %tmp to i32
+  %tmp2 = add nsw i64 %indvars.iv, 2
+  %arrayidx2 = getelementptr inbounds [1024 x i8], [1024 x i8]* @b, i64 0, i64 %tmp2
+  %tmp3 = load i8, i8* %arrayidx2, align 1
+  %conv3 = zext i8 %tmp3 to i32
+  %sub = sub nsw i32 %conv, %conv3
+  %ispos = icmp sgt i32 %sub, -1
+  %neg = sub nsw i32 0, %sub
+  %tmp4 = select i1 %ispos, i32 %sub, i32 %neg
+  %add = add nsw i32 %tmp4, %s.015
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1024
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+define i64 @bar(i64* nocapture %a) {
+; CHECK-LABEL: bar
+; CHECK:       LV(REG): VF = 2
+; CHECK:       LV(REG): Found max usage: 3
+;
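+; A C sketch of the loop (hypothetical):
+;   long s = 0;
+;   for (long i = 0; i != 1024; ++i) {
+;     a[i] += i;
+;     s += a[i];
+;   }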
+entry:
+  br label %for.body
+
+for.cond.cleanup:
+  %add2.lcssa = phi i64 [ %add2, %for.body ]
+  ret i64 %add2.lcssa
+
+for.body:
+  %i.012 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+  %s.011 = phi i64 [ 0, %entry ], [ %add2, %for.body ]
+  %arrayidx = getelementptr inbounds i64, i64* %a, i64 %i.012
+  %0 = load i64, i64* %arrayidx, align 8
+  %add = add nsw i64 %0, %i.012
+  store i64 %add, i64* %arrayidx, align 8
+  %add2 = add nsw i64 %add, %s.011
+  %inc = add nuw nsw i64 %i.012, 1
+  %exitcond = icmp eq i64 %inc, 1024
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+@d = external global [0 x i64], align 8
+@e = external global [0 x i32], align 4
+@c = external global [0 x i32], align 4
+
+define void @hoo(i32 %n) {
+; For c[i] = e[d[i]] in the loop, the accesses to e[d[i]] are not consecutive,
+; but their indices %tmp can be gathered into a vector. For VF == 16, the vector
+; version of %tmp will be <16 x i64>, so the max usage of AVX512 vector
+; registers will be 2.
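+; A C sketch (hypothetical): the d[i] load makes the e[] access a gather:
+;   for (long i = 0; i != 10000; ++i)
+;     c[i] = e[d[i]];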
+; AVX512F-LABEL: hoo
+; AVX512F:       LV(REG): VF = 16
+; AVX512F:       LV(REG): Found max usage: 2
+;
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds [0 x i64], [0 x i64]* @d, i64 0, i64 %indvars.iv
+  %tmp = load i64, i64* %arrayidx, align 8
+  %arrayidx1 = getelementptr inbounds [0 x i32], [0 x i32]* @e, i64 0, i64 %tmp
+  %tmp1 = load i32, i32* %arrayidx1, align 4
+  %arrayidx3 = getelementptr inbounds [0 x i32], [0 x i32]* @c, i64 0, i64 %indvars.iv
+  store i32 %tmp1, i32* %arrayidx3, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 10000
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}

Added: llvm/trunk/test/Transforms/LoopVectorize/X86/register-assumption.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/X86/register-assumption.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/X86/register-assumption.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/X86/register-assumption.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,32 @@
+; RUN: opt < %s -loop-vectorize -instcombine -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 -S | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define void @test1() {
+entry:
+  %alloca = alloca float, align 4
+  br label %loop_exit.dim.11.critedge
+
+loop_exit.dim.11.critedge:                        ; preds = %loop_body.dim.0
+  %ptrint = ptrtoint float* %alloca to i64
+  %maskedptr = and i64 %ptrint, 4
+  %maskcond = icmp eq i64 %maskedptr, 0
+  br label %loop_header.dim.017.preheader
+
+loop_header.dim.017.preheader:                    ; preds = %loop_exit.dim.016, %loop_exit.dim.11.critedge
+  br label %loop_body.dim.018
+
+loop_body.dim.018:                                ; preds = %loop_body.dim.018, %loop_header.dim.017.preheader
+  %invar_address.dim.019.0135 = phi i64 [ 0, %loop_header.dim.017.preheader ], [ %0, %loop_body.dim.018 ]
+  call void @llvm.assume(i1 %maskcond)
+; CHECK:     call void @llvm.assume(
+; CHECK-NOT: call void @llvm.assume(
+  %0 = add nuw nsw i64 %invar_address.dim.019.0135, 1
+  %1 = icmp eq i64 %0, 256
+  br i1 %1, label %loop_header.dim.017.preheader, label %loop_body.dim.018
+}
+
+; Function Attrs: nounwind
+declare void @llvm.assume(i1) #0
+
+attributes #0 = { nounwind }

Added: llvm/trunk/test/Transforms/LoopVectorize/X86/scatter_crash.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/X86/scatter_crash.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/X86/scatter_crash.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/X86/scatter_crash.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,114 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -loop-vectorize -S | FileCheck %s
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.11.0"
+
+; This test checks a vector GEP being created before a scatter.
+; The code below used to crash due to SSA being destroyed during incorrect
+; vectorization of the GEP.
+
+@d = global [10 x [10 x i32]] zeroinitializer, align 16
+@c = external global i32, align 4
+@a = external global i32, align 4
+@b = external global i64, align 8
+
+; Function Attrs: norecurse nounwind ssp uwtable
+define void @_Z3fn1v() #0 {
+; CHECK-LABEL: @_Z3fn1v(
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <16 x i64> [ <i64 8, i64 10, i64 12, i64 14, i64 16, i64 18, i64 20, i64 22, i64 24, i64 26, i64 28, i64 30, i64 32, i64 34, i64 36, i64 38>, %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %vector.body ]
+; CHECK-NEXT:    [[VEC_IND3:%.*]] = phi <16 x i64> [ <i64 0, i64 2, i64 4, i64 6, i64 8, i64 10, i64 12, i64 14, i64 16, i64 18, i64 20, i64 22, i64 24, i64 26, i64 28, i64 30>, %vector.ph ], [ [[VEC_IND_NEXT4:%.*]], %vector.body ]
+; CHECK-NEXT:    [[TMP10:%.*]] = sub nsw <16 x i64> <i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8>, [[VEC_IND]]
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, <16 x i64> [[VEC_IND]]
+; CHECK-NEXT:    [[TMP12:%.*]] = add nsw <16 x i64> [[TMP10]], [[VEC_IND3]]
+; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [10 x i32], <16 x [10 x i32]*> [[TMP11]], <16 x i64> [[TMP12]], i64 0
+; CHECK-NEXT:    call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>, <16 x i32*> [[TMP13]], i32 16, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+; CHECK-NEXT:    [[TMP14:%.*]] = or <16 x i64> [[VEC_IND3]], <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
+; CHECK-NEXT:    [[TMP15:%.*]] = add nsw <16 x i64> [[TMP10]], [[TMP14]]
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [10 x i32], <16 x [10 x i32]*> [[TMP11]], <16 x i64> [[TMP15]], i64 0
+; CHECK-NEXT:    call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>, <16 x i32*> [[TMP16]], i32 8, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 16
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <16 x i64> [[VEC_IND]], <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
+; CHECK-NEXT:    [[VEC_IND_NEXT4]] = add <16 x i64> [[VEC_IND3]], <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
+; CHECK:         br i1 {{.*}}, label %middle.block, label %vector.body
+;
+entry:
+  %0 = load i32, i32* @c, align 4
+  %cmp34 = icmp sgt i32 %0, 8
+  br i1 %cmp34, label %for.body.lr.ph, label %for.cond.cleanup
+
+for.body.lr.ph:                                   ; preds = %entry
+  %1 = load i32, i32* @a, align 4
+  %tobool = icmp eq i32 %1, 0
+  %2 = load i64, i64* @b, align 8
+  %mul = mul i64 %2, 4063299859190
+  %tobool6 = icmp eq i64 %mul, 0
+  %3 = sext i32 %0 to i64
+  br i1 %tobool, label %for.body.us.preheader, label %for.body.preheader
+
+for.body.preheader:                               ; preds = %for.body.lr.ph
+  br label %for.body
+
+for.body.us.preheader:                            ; preds = %for.body.lr.ph
+  br label %for.body.us
+
+for.body.us:                                      ; preds = %for.body.us.preheader, %for.cond.cleanup4.us-lcssa.us.us
+  %indvars.iv78 = phi i64 [ %indvars.iv.next79, %for.cond.cleanup4.us-lcssa.us.us ], [ 8, %for.body.us.preheader ]
+  %indvars.iv70 = phi i64 [ %indvars.iv.next71, %for.cond.cleanup4.us-lcssa.us.us ], [ 0, %for.body.us.preheader ]
+  %4 = sub nsw i64 8, %indvars.iv78
+  %add.ptr.us = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 %indvars.iv78
+  %5 = add nsw i64 %4, %indvars.iv70
+  %arraydecay.us.us.us = getelementptr inbounds [10 x i32], [10 x i32]* %add.ptr.us, i64 %5, i64 0
+  br i1 %tobool6, label %for.body5.us.us.us.preheader, label %for.body5.us.us48.preheader
+
+for.body5.us.us48.preheader:                      ; preds = %for.body.us
+  store i32 8, i32* %arraydecay.us.us.us, align 16
+  %indvars.iv.next66 = or i64 %indvars.iv70, 1
+  %6 = add nsw i64 %4, %indvars.iv.next66
+  %arraydecay.us.us55.1 = getelementptr inbounds [10 x i32], [10 x i32]* %add.ptr.us, i64 %6, i64 0
+  store i32 8, i32* %arraydecay.us.us55.1, align 8
+  br label %for.cond.cleanup4.us-lcssa.us.us
+
+for.body5.us.us.us.preheader:                     ; preds = %for.body.us
+  store i32 7, i32* %arraydecay.us.us.us, align 16
+  %indvars.iv.next73 = or i64 %indvars.iv70, 1
+  %7 = add nsw i64 %4, %indvars.iv.next73
+  %arraydecay.us.us.us.1 = getelementptr inbounds [10 x i32], [10 x i32]* %add.ptr.us, i64 %7, i64 0
+  store i32 7, i32* %arraydecay.us.us.us.1, align 8
+  br label %for.cond.cleanup4.us-lcssa.us.us
+
+for.cond.cleanup4.us-lcssa.us.us:                 ; preds = %for.body5.us.us48.preheader, %for.body5.us.us.us.preheader
+  %indvars.iv.next79 = add nuw nsw i64 %indvars.iv78, 2
+  %cmp.us = icmp slt i64 %indvars.iv.next79, %3
+  %indvars.iv.next71 = add nuw nsw i64 %indvars.iv70, 2
+  br i1 %cmp.us, label %for.body.us, label %for.cond.cleanup.loopexit
+
+for.cond.cleanup.loopexit:                        ; preds = %for.cond.cleanup4.us-lcssa.us.us
+  br label %for.cond.cleanup
+
+for.cond.cleanup.loopexit99:                      ; preds = %for.body
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit99, %for.cond.cleanup.loopexit, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv95 = phi i64 [ %indvars.iv.next96, %for.body ], [ 8, %for.body.preheader ]
+  %indvars.iv87 = phi i64 [ %indvars.iv.next88, %for.body ], [ 0, %for.body.preheader ]
+  %8 = sub nsw i64 8, %indvars.iv95
+  %add.ptr = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 %indvars.iv95
+  %9 = add nsw i64 %8, %indvars.iv87
+  %arraydecay.us31 = getelementptr inbounds [10 x i32], [10 x i32]* %add.ptr, i64 %9, i64 0
+  store i32 8, i32* %arraydecay.us31, align 16
+  %indvars.iv.next90 = or i64 %indvars.iv87, 1
+  %10 = add nsw i64 %8, %indvars.iv.next90
+  %arraydecay.us31.1 = getelementptr inbounds [10 x i32], [10 x i32]* %add.ptr, i64 %10, i64 0
+  store i32 8, i32* %arraydecay.us31.1, align 8
+  %indvars.iv.next96 = add nuw nsw i64 %indvars.iv95, 2
+  %cmp = icmp slt i64 %indvars.iv.next96, %3
+  %indvars.iv.next88 = add nuw nsw i64 %indvars.iv87, 2
+  br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit99
+}
+
+attributes #0 = { norecurse nounwind ssp uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="knl" "target-features"="+adx,+aes,+avx,+avx2,+avx512cd,+avx512er,+avx512f,+avx512pf,+bmi,+bmi2,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+prefetchwt1,+rdrnd,+rdseed,+rtm,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt" "unsafe-fp-math"="false" "use-soft-float"="false" }
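+
+; A simplified C analogue of the loop above (an assumption reconstructed
+; from the IR, not necessarily the test's original source): strided stores
+; through a loop-varying row pointer into a 2-D array, which the vectorizer
+; widens into the llvm.masked.scatter calls checked for above.
+;
+;   int d[10][10];
+;   void f(int c) {
+;     for (long i = 8, j = 0; i < c; i += 2, j += 2) {
+;       int (*row)[10] = &d[i];       /* loop-varying base pointer */
+;       row[(8 - i) + j][0] = 8;      /* first scattered store */
+;       row[(8 - i) + j + 1][0] = 8;  /* second scattered store */
+;     }
+;   }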

Propchange: llvm/trunk/test/Transforms/LoopVectorize/X86/scatter_crash.ll
------------------------------------------------------------------------------
    svn:executable = *

Added: llvm/trunk/test/Transforms/LoopVectorize/X86/slm-no-vectorize.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/X86/slm-no-vectorize.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/X86/slm-no-vectorize.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/X86/slm-no-vectorize.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,49 @@
+; RUN: opt < %s -loop-vectorize -mtriple=x86_64-unknown-linux -S -mcpu=slm -debug 2>&1 | FileCheck -check-prefix=MSG %s
+; REQUIRES: asserts
+; This test should not be vectorized for the X86/SLM (Silvermont) arch.
+; Vectorizing the 64-bit multiply here is wrong, since it can be done in a
+; narrower bit width (note that the source operands are 16-bit).
+; In addition, addq/subq (quad-word add/sub) have a high cost on SLM.
+; This test shows a large performance regression (about -70%) if it is
+; vectorized for SLM.
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define i32 @no_vec(i32 %LastIndex, i16* nocapture readonly %InputData, i16 signext %lag, i16 signext %Scale) {
+entry:
+; MSG: LV: Selecting VF: 1. 
+  %cmp17 = icmp sgt i32 %LastIndex, 0
+  br i1 %cmp17, label %for.body.lr.ph, label %for.cond.cleanup
+
+for.body.lr.ph:                                   ; preds = %entry
+  %conv5 = sext i16 %Scale to i64
+  %sh_prom = and i64 %conv5, 4294967295
+  %0 = sext i16 %lag to i64
+  %wide.trip.count = zext i32 %LastIndex to i64
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %for.body
+  %conv8 = trunc i64 %add7 to i32
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  %Accumulator.0.lcssa = phi i32 [ 0, %entry ], [ %conv8, %for.cond.cleanup.loopexit ]
+  ret i32 %Accumulator.0.lcssa
+
+for.body:                                         ; preds = %for.body, %for.body.lr.ph
+  %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ]
+  %Accumulator.018 = phi i64 [ 0, %for.body.lr.ph ], [ %add7, %for.body ]
+  %arrayidx = getelementptr inbounds i16, i16* %InputData, i64 %indvars.iv
+  %1 = load i16, i16* %arrayidx, align 2
+  %conv = sext i16 %1 to i64
+  %2 = add nsw i64 %indvars.iv, %0
+  %arrayidx3 = getelementptr inbounds i16, i16* %InputData, i64 %2
+  %3 = load i16, i16* %arrayidx3, align 2 
+  %conv4 = sext i16 %3 to i64
+  %mul = mul nsw i64 %conv4, %conv
+  %shr = ashr i64 %mul, %sh_prom
+  %add7 = add i64 %shr, %Accumulator.018 
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
+}
+
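+; A plausible C source for the loop above (reconstructed from the IR; an
+; assumption): a 16-bit by 16-bit multiply widened to 64 bits and then
+; accumulated. On SLM the 64-bit vector multiply and 64-bit add are
+; expensive, so VF 1 is the profitable choice.
+;
+;   int no_vec(int LastIndex, const short *InputData,
+;              short lag, short Scale) {
+;     long long Accumulator = 0;
+;     for (int i = 0; i < LastIndex; i++)  /* 16x16->64 multiply-accumulate */
+;       Accumulator += ((long long)InputData[i + lag] * InputData[i]) >> Scale;
+;     return (int)Accumulator;             /* truncated to 32 bits, as in the IR */
+;   }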

Added: llvm/trunk/test/Transforms/LoopVectorize/X86/small-size.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/X86/small-size.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/X86/small-size.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/X86/small-size.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,408 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -basicaa -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -loop-vectorize-with-block-frequency -dce -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+ at b = common global [2048 x i32] zeroinitializer, align 16
+ at c = common global [2048 x i32] zeroinitializer, align 16
+ at a = common global [2048 x i32] zeroinitializer, align 16
+ at G = common global [32 x [1024 x i32]] zeroinitializer, align 16
+ at ub = common global [1024 x i32] zeroinitializer, align 16
+ at uc = common global [1024 x i32] zeroinitializer, align 16
+ at d = common global [2048 x i32] zeroinitializer, align 16
+ at fa = common global [1024 x float] zeroinitializer, align 16
+ at fb = common global [1024 x float] zeroinitializer, align 16
+ at ic = common global [1024 x i32] zeroinitializer, align 16
+ at da = common global [1024 x float] zeroinitializer, align 16
+ at db = common global [1024 x float] zeroinitializer, align 16
+ at dc = common global [1024 x float] zeroinitializer, align 16
+ at dd = common global [1024 x float] zeroinitializer, align 16
+ at dj = common global [1024 x i32] zeroinitializer, align 16
+
+; We can vectorize this test without a tail: the trip count of 256 is a
+; multiple of the vector width.
+define void @example1() optsize {
+; CHECK-LABEL: @example1(
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i32* [[TMP1]] to <4 x i32>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 16
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @c, i64 0, i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>*
+; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 16
+; CHECK-NEXT:    [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @a, i64 0, i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[TMP5]], <4 x i32>* [[TMP7]], align 16
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
+; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
+; CHECK:       middle.block:
+; CHECK-NEXT:    br i1 true, label [[TMP10:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    br label [[TMP9:%.*]]
+; CHECK:         br i1 undef, label [[TMP10]], label [[TMP9]], !llvm.loop !2
+; CHECK:         ret void
+;
+  br label %1
+
+; <label>:1                                       ; preds = %1, %0
+  %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ]
+  %2 = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 %indvars.iv
+  %3 = load i32, i32* %2, align 4
+  %4 = getelementptr inbounds [2048 x i32], [2048 x i32]* @c, i64 0, i64 %indvars.iv
+  %5 = load i32, i32* %4, align 4
+  %6 = add nsw i32 %5, %3
+  %7 = getelementptr inbounds [2048 x i32], [2048 x i32]* @a, i64 0, i64 %indvars.iv
+  store i32 %6, i32* %7, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 256
+  br i1 %exitcond, label %8, label %1
+
+; <label>:8                                       ; preds = %1
+  ret void
+}
+
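+; example1 in C (a hedged sketch; the globals are declared above):
+;
+;   void example1(void) {
+;     for (int i = 0; i < 256; i++)
+;       a[i] = b[i] + c[i];
+;   }
+;
+; The trip count of 256 is a multiple of the forced VF of 4, so no scalar
+; tail is required even under optsize.
+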
+; Can vectorize in 'optsize' mode by masking the needed tail.
+define void @example2(i32 %n, i32 %x) optsize {
+; CHECK-LABEL: @example2(
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; CHECK-NEXT:    br i1 [[TMP1]], label [[DOTLR_PH5_PREHEADER:%.*]], label [[DOTPREHEADER:%.*]]
+; CHECK:       .lr.ph5.preheader:
+; CHECK-NEXT:    [[TMP2:%.*]] = add i32 [[N]], -1
+; CHECK-NEXT:    [[TMP3:%.*]] = zext i32 [[TMP2]] to i64
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_RND_UP:%.*]] = add nuw nsw i64 [[TMP3]], 4
+; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[N_RND_UP]], 8589934588
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> undef, i64 [[TMP3]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE8:%.*]] ]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> undef, i64 [[INDEX]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3>
+; CHECK-NEXT:    [[TMP5:%.*]] = or i64 [[INDEX]], 1
+; CHECK-NEXT:    [[TMP6:%.*]] = or i64 [[INDEX]], 2
+; CHECK-NEXT:    [[TMP7:%.*]] = or i64 [[INDEX]], 3
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp ule <4 x i64> [[INDUCTION]], [[BROADCAST_SPLAT2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <4 x i1> [[TMP8]], i32 0
+; CHECK-NEXT:    br i1 [[TMP9]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; CHECK:       pred.store.if:
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 [[INDEX]]
+; CHECK-NEXT:    store i32 [[X:%.*]], i32* [[TMP10]], align 16
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE]]
+; CHECK:       pred.store.continue:
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <4 x i1> [[TMP8]], i32 1
+; CHECK-NEXT:    br i1 [[TMP11]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4:%.*]]
+; CHECK:       pred.store.if3:
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 [[TMP5]]
+; CHECK-NEXT:    store i32 [[X]], i32* [[TMP12]], align 4
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE4]]
+; CHECK:       pred.store.continue4:
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <4 x i1> [[TMP8]], i32 2
+; CHECK-NEXT:    br i1 [[TMP13]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6:%.*]]
+; CHECK:       pred.store.if5:
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 [[TMP6]]
+; CHECK-NEXT:    store i32 [[X]], i32* [[TMP14]], align 8
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE6]]
+; CHECK:       pred.store.continue6:
+; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <4 x i1> [[TMP8]], i32 3
+; CHECK-NEXT:    br i1 [[TMP15]], label [[PRED_STORE_IF7:%.*]], label [[PRED_STORE_CONTINUE8]]
+; CHECK:       pred.store.if7:
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 [[TMP7]]
+; CHECK-NEXT:    store i32 [[X]], i32* [[TMP16]], align 4
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE8]]
+; CHECK:       pred.store.continue8:
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !4
+; CHECK:       middle.block:
+; CHECK-NEXT:    br i1 true, label [[DOT_PREHEADER_CRIT_EDGE:%.*]], label [[SCALAR_PH]]
+; CHECK:       ._crit_edge:
+; CHECK-NEXT:    ret void
+;
+  %1 = icmp sgt i32 %n, 0
+  br i1 %1, label %.lr.ph5, label %.preheader
+
+..preheader_crit_edge:                            ; preds = %.lr.ph5
+  %phitmp = sext i32 %n to i64
+  br label %.preheader
+
+.preheader:                                       ; preds = %..preheader_crit_edge, %0
+  %i.0.lcssa = phi i64 [ %phitmp, %..preheader_crit_edge ], [ 0, %0 ]
+  %2 = icmp eq i32 %n, 0
+  br i1 %2, label %._crit_edge, label %.lr.ph
+
+.lr.ph5:                                          ; preds = %0, %.lr.ph5
+  %indvars.iv6 = phi i64 [ %indvars.iv.next7, %.lr.ph5 ], [ 0, %0 ]
+  %3 = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 %indvars.iv6
+  store i32 %x, i32* %3, align 4
+  %indvars.iv.next7 = add i64 %indvars.iv6, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next7 to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %..preheader_crit_edge, label %.lr.ph5
+
+.lr.ph:                                           ; preds = %.preheader, %.lr.ph
+  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ %i.0.lcssa, %.preheader ]
+  %.02 = phi i32 [ %4, %.lr.ph ], [ %n, %.preheader ]
+  %4 = add nsw i32 %.02, -1
+  %5 = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 %indvars.iv
+  %6 = load i32, i32* %5, align 4
+  %7 = getelementptr inbounds [2048 x i32], [2048 x i32]* @c, i64 0, i64 %indvars.iv
+  %8 = load i32, i32* %7, align 4
+  %9 = and i32 %8, %6
+  %10 = getelementptr inbounds [2048 x i32], [2048 x i32]* @a, i64 0, i64 %indvars.iv
+  store i32 %9, i32* %10, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %11 = icmp eq i32 %4, 0
+  br i1 %11, label %._crit_edge, label %.lr.ph
+
+._crit_edge:                                      ; preds = %.lr.ph, %.preheader
+  ret void
+}
+
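+; example2 in C (a hedged reconstruction from the IR):
+;
+;   void example2(int n, int x) {
+;     int i;
+;     for (i = 0; i < n; i++)   /* unknown trip count: tail is folded */
+;       b[i] = x;
+;     while (n--) {             /* second loop: not vectorized here */
+;       a[i] = b[i] & c[i];
+;       i++;
+;     }
+;   }
+;
+; Each vector store in the first loop is predicated on its lane's index
+; being at most n-1, which is exactly what the pred.store.if blocks check.
+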
+; N is unknown, so we would need a tail. We can't vectorize because the loop
+; has no primary induction variable.
+;CHECK-LABEL: @example3(
+;CHECK-NOT: <4 x i32>
+;CHECK: ret void
+define void @example3(i32 %n, i32* noalias nocapture %p, i32* noalias nocapture %q) optsize {
+  %1 = icmp eq i32 %n, 0
+  br i1 %1, label %._crit_edge, label %.lr.ph
+
+.lr.ph:                                           ; preds = %0, %.lr.ph
+  %.05 = phi i32 [ %2, %.lr.ph ], [ %n, %0 ]
+  %.014 = phi i32* [ %5, %.lr.ph ], [ %p, %0 ]
+  %.023 = phi i32* [ %3, %.lr.ph ], [ %q, %0 ]
+  %2 = add nsw i32 %.05, -1
+  %3 = getelementptr inbounds i32, i32* %.023, i64 1
+  %4 = load i32, i32* %.023, align 16
+  %5 = getelementptr inbounds i32, i32* %.014, i64 1
+  store i32 %4, i32* %.014, align 16
+  %6 = icmp eq i32 %2, 0
+  br i1 %6, label %._crit_edge, label %.lr.ph
+
+._crit_edge:                                      ; preds = %.lr.ph, %0
+  ret void
+}
+
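+; example3 in C (a hedged sketch): the loop is controlled by a decrementing
+; count and advances raw pointers, so there is no primary induction
+; variable for the vectorizer to key on.
+;
+;   void example3(int n, int *restrict p, int *restrict q) {
+;     while (n--)
+;       *p++ = *q++;
+;   }
+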
+; We can't vectorize this one because we need a runtime ptr check.
+;CHECK-LABEL: @example23(
+;CHECK-NOT: <4 x i32>
+;CHECK: ret void
+define void @example23(i16* nocapture %src, i32* nocapture %dst) optsize {
+  br label %1
+
+; <label>:1                                       ; preds = %1, %0
+  %.04 = phi i16* [ %src, %0 ], [ %2, %1 ]
+  %.013 = phi i32* [ %dst, %0 ], [ %6, %1 ]
+  %i.02 = phi i32 [ 0, %0 ], [ %7, %1 ]
+  %2 = getelementptr inbounds i16, i16* %.04, i64 1
+  %3 = load i16, i16* %.04, align 2
+  %4 = zext i16 %3 to i32
+  %5 = shl nuw nsw i32 %4, 7
+  %6 = getelementptr inbounds i32, i32* %.013, i64 1
+  store i32 %5, i32* %.013, align 4
+  %7 = add nsw i32 %i.02, 1
+  %exitcond = icmp eq i32 %7, 256
+  br i1 %exitcond, label %8, label %1
+
+; <label>:8                                       ; preds = %1
+  ret void
+}
+
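+; example23 in C (a hedged sketch): src and dst may alias, and under
+; optsize the vectorizer refuses to emit the runtime pointer check that
+; would be needed to prove otherwise.
+;
+;   void example23(unsigned short *src, unsigned int *dst) {
+;     for (int i = 0; i < 256; i++)
+;       *dst++ = *src++ << 7;
+;   }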
+
+; We CAN vectorize this example because the pointers are marked as noalias.
+define void @example23b(i16* noalias nocapture %src, i32* noalias nocapture %dst) optsize {
+; CHECK-LABEL: @example23b(
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i16, i16* [[SRC:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[NEXT_GEP4:%.*]] = getelementptr i32, i32* [[DST:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16* [[NEXT_GEP]] to <4 x i16>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 2
+; CHECK-NEXT:    [[TMP2:%.*]] = zext <4 x i16> [[WIDE_LOAD]] to <4 x i32>
+; CHECK-NEXT:    [[TMP3:%.*]] = shl nuw nsw <4 x i32> [[TMP2]], <i32 7, i32 7, i32 7, i32 7>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32* [[NEXT_GEP4]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[TMP3]], <4 x i32>* [[TMP4]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
+; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !6
+; CHECK:       middle.block:
+; CHECK-NEXT:    br i1 true, label [[TMP7:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    br label [[TMP6:%.*]]
+; CHECK:         br i1 undef, label [[TMP7]], label [[TMP6]], !llvm.loop !7
+; CHECK:         ret void
+;
+  br label %1
+
+; <label>:1                                       ; preds = %1, %0
+  %.04 = phi i16* [ %src, %0 ], [ %2, %1 ]
+  %.013 = phi i32* [ %dst, %0 ], [ %6, %1 ]
+  %i.02 = phi i32 [ 0, %0 ], [ %7, %1 ]
+  %2 = getelementptr inbounds i16, i16* %.04, i64 1
+  %3 = load i16, i16* %.04, align 2
+  %4 = zext i16 %3 to i32
+  %5 = shl nuw nsw i32 %4, 7
+  %6 = getelementptr inbounds i32, i32* %.013, i64 1
+  store i32 %5, i32* %.013, align 4
+  %7 = add nsw i32 %i.02, 1
+  %exitcond = icmp eq i32 %7, 256
+  br i1 %exitcond, label %8, label %1
+
+; <label>:8                                       ; preds = %1
+  ret void
+}
+
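+; example23b in C (a hedged sketch): the same loop as example23, but with
+; restrict-qualified pointers, which become the noalias attributes that
+; let the loop vectorize with no runtime check.
+;
+;   void example23b(unsigned short *restrict src,
+;                   unsigned int *restrict dst) {
+;     for (int i = 0; i < 256; i++)
+;       *dst++ = *src++ << 7;
+;   }
+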
+; We CAN vectorize this example by folding the tail its trip count of 257
+; entails.
+define void @example23c(i16* noalias nocapture %src, i32* noalias nocapture %dst) optsize {
+; CHECK-LABEL: @example23c(
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE22:%.*]] ]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> undef, i64 [[INDEX]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3>
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult <4 x i64> [[INDUCTION]], <i64 257, i64 257, i64 257, i64 257>
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i1> [[TMP1]], i32 0
+; CHECK-NEXT:    br i1 [[TMP2]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
+; CHECK:       pred.load.if:
+; CHECK-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i16, i16* [[SRC:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP3:%.*]] = load i16, i16* [[NEXT_GEP]], align 2
+; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE]]
+; CHECK:       pred.load.continue:
+; CHECK-NEXT:    [[TMP4:%.*]] = phi i16 [ undef, [[VECTOR_BODY]] ], [ [[TMP3]], [[PRED_LOAD_IF]] ]
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i1> [[TMP1]], i32 1
+; CHECK-NEXT:    br i1 [[TMP5]], label [[PRED_LOAD_IF11:%.*]], label [[PRED_LOAD_CONTINUE12:%.*]]
+; CHECK:       pred.load.if11:
+; CHECK-NEXT:    [[TMP6:%.*]] = or i64 [[INDEX]], 1
+; CHECK-NEXT:    [[NEXT_GEP4:%.*]] = getelementptr i16, i16* [[SRC]], i64 [[TMP6]]
+; CHECK-NEXT:    [[TMP7:%.*]] = load i16, i16* [[NEXT_GEP4]], align 2
+; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE12]]
+; CHECK:       pred.load.continue12:
+; CHECK-NEXT:    [[TMP8:%.*]] = phi i16 [ undef, [[PRED_LOAD_CONTINUE]] ], [ [[TMP7]], [[PRED_LOAD_IF11]] ]
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <4 x i1> [[TMP1]], i32 2
+; CHECK-NEXT:    br i1 [[TMP9]], label [[PRED_LOAD_IF13:%.*]], label [[PRED_LOAD_CONTINUE14:%.*]]
+; CHECK:       pred.load.if13:
+; CHECK-NEXT:    [[TMP10:%.*]] = or i64 [[INDEX]], 2
+; CHECK-NEXT:    [[NEXT_GEP5:%.*]] = getelementptr i16, i16* [[SRC]], i64 [[TMP10]]
+; CHECK-NEXT:    [[TMP11:%.*]] = load i16, i16* [[NEXT_GEP5]], align 2
+; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE14]]
+; CHECK:       pred.load.continue14:
+; CHECK-NEXT:    [[TMP12:%.*]] = phi i16 [ undef, [[PRED_LOAD_CONTINUE12]] ], [ [[TMP11]], [[PRED_LOAD_IF13]] ]
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <4 x i1> [[TMP1]], i32 3
+; CHECK-NEXT:    br i1 [[TMP13]], label [[PRED_LOAD_IF15:%.*]], label [[PRED_LOAD_CONTINUE16:%.*]]
+; CHECK:       pred.load.if15:
+; CHECK-NEXT:    [[TMP14:%.*]] = or i64 [[INDEX]], 3
+; CHECK-NEXT:    [[NEXT_GEP6:%.*]] = getelementptr i16, i16* [[SRC]], i64 [[TMP14]]
+; CHECK-NEXT:    [[TMP15:%.*]] = load i16, i16* [[NEXT_GEP6]], align 2
+; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE16]]
+; CHECK:       pred.load.continue16:
+; CHECK-NEXT:    [[TMP16:%.*]] = phi i16 [ undef, [[PRED_LOAD_CONTINUE14]] ], [ [[TMP15]], [[PRED_LOAD_IF15]] ]
+; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <4 x i1> [[TMP1]], i32 0
+; CHECK-NEXT:    br i1 [[TMP17]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; CHECK:       pred.store.if:
+; CHECK-NEXT:    [[TMP18:%.*]] = zext i16 [[TMP4]] to i32
+; CHECK-NEXT:    [[TMP19:%.*]] = shl nuw nsw i32 [[TMP18]], 7
+; CHECK-NEXT:    [[NEXT_GEP7:%.*]] = getelementptr i32, i32* [[DST:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    store i32 [[TMP19]], i32* [[NEXT_GEP7]], align 4
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE]]
+; CHECK:       pred.store.continue:
+; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <4 x i1> [[TMP1]], i32 1
+; CHECK-NEXT:    br i1 [[TMP20]], label [[PRED_STORE_IF17:%.*]], label [[PRED_STORE_CONTINUE18:%.*]]
+; CHECK:       pred.store.if17:
+; CHECK-NEXT:    [[TMP21:%.*]] = zext i16 [[TMP8]] to i32
+; CHECK-NEXT:    [[TMP22:%.*]] = shl nuw nsw i32 [[TMP21]], 7
+; CHECK-NEXT:    [[TMP23:%.*]] = or i64 [[INDEX]], 1
+; CHECK-NEXT:    [[NEXT_GEP8:%.*]] = getelementptr i32, i32* [[DST]], i64 [[TMP23]]
+; CHECK-NEXT:    store i32 [[TMP22]], i32* [[NEXT_GEP8]], align 4
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE18]]
+; CHECK:       pred.store.continue18:
+; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <4 x i1> [[TMP1]], i32 2
+; CHECK-NEXT:    br i1 [[TMP24]], label [[PRED_STORE_IF19:%.*]], label [[PRED_STORE_CONTINUE20:%.*]]
+; CHECK:       pred.store.if19:
+; CHECK-NEXT:    [[TMP25:%.*]] = zext i16 [[TMP12]] to i32
+; CHECK-NEXT:    [[TMP26:%.*]] = shl nuw nsw i32 [[TMP25]], 7
+; CHECK-NEXT:    [[TMP27:%.*]] = or i64 [[INDEX]], 2
+; CHECK-NEXT:    [[NEXT_GEP9:%.*]] = getelementptr i32, i32* [[DST]], i64 [[TMP27]]
+; CHECK-NEXT:    store i32 [[TMP26]], i32* [[NEXT_GEP9]], align 4
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE20]]
+; CHECK:       pred.store.continue20:
+; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <4 x i1> [[TMP1]], i32 3
+; CHECK-NEXT:    br i1 [[TMP28]], label [[PRED_STORE_IF21:%.*]], label [[PRED_STORE_CONTINUE22]]
+; CHECK:       pred.store.if21:
+; CHECK-NEXT:    [[TMP29:%.*]] = zext i16 [[TMP16]] to i32
+; CHECK-NEXT:    [[TMP30:%.*]] = shl nuw nsw i32 [[TMP29]], 7
+; CHECK-NEXT:    [[TMP31:%.*]] = or i64 [[INDEX]], 3
+; CHECK-NEXT:    [[NEXT_GEP10:%.*]] = getelementptr i32, i32* [[DST]], i64 [[TMP31]]
+; CHECK-NEXT:    store i32 [[TMP30]], i32* [[NEXT_GEP10]], align 4
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE22]]
+; CHECK:       pred.store.continue22:
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], 260
+; CHECK-NEXT:    br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !8
+; CHECK:       middle.block:
+; CHECK-NEXT:    br i1 true, label [[TMP34:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    br label [[TMP33:%.*]]
+; CHECK:         br i1 undef, label [[TMP34]], label [[TMP33]], !llvm.loop !9
+; CHECK:         ret void
+;
+  br label %1
+
+; <label>:1                                       ; preds = %1, %0
+  %.04 = phi i16* [ %src, %0 ], [ %2, %1 ]
+  %.013 = phi i32* [ %dst, %0 ], [ %6, %1 ]
+  %i.02 = phi i64 [ 0, %0 ], [ %7, %1 ]
+  %2 = getelementptr inbounds i16, i16* %.04, i64 1
+  %3 = load i16, i16* %.04, align 2
+  %4 = zext i16 %3 to i32
+  %5 = shl nuw nsw i32 %4, 7
+  %6 = getelementptr inbounds i32, i32* %.013, i64 1
+  store i32 %5, i32* %.013, align 4
+  %7 = add nsw i64 %i.02, 1
+  %exitcond = icmp eq i64 %7, 257
+  br i1 %exitcond, label %8, label %1
+
+; <label>:8                                       ; preds = %1
+  ret void
+}
+
+; We CAN'T vectorize this example because it would entail a tail, and an
+; induction variable is used outside the loop.
+define i64 @example23d(i16* noalias nocapture %src, i32* noalias nocapture %dst) optsize {
+;CHECK-LABEL: @example23d(
+; CHECK-NOT: <4 x
+; CHECK: ret i64
+  br label %1
+
+; <label>:1                                       ; preds = %1, %0
+  %.04 = phi i16* [ %src, %0 ], [ %2, %1 ]
+  %.013 = phi i32* [ %dst, %0 ], [ %6, %1 ]
+  %i.02 = phi i64 [ 0, %0 ], [ %7, %1 ]
+  %2 = getelementptr inbounds i16, i16* %.04, i64 1
+  %3 = load i16, i16* %.04, align 2
+  %4 = zext i16 %3 to i32
+  %5 = shl nuw nsw i32 %4, 7
+  %6 = getelementptr inbounds i32, i32* %.013, i64 1
+  store i32 %5, i32* %.013, align 4
+  %7 = add nsw i64 %i.02, 1
+  %exitcond = icmp eq i64 %7, 257
+  br i1 %exitcond, label %8, label %1
+
+; <label>:8                                       ; preds = %1
+  ret i64 %7
+}
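+
+; example23d in C (a hedged sketch): the trip count of 257 entails a tail,
+; and the induction value escapes through the return, which blocks tail
+; folding.
+;
+;   long example23d(unsigned short *restrict src,
+;                   unsigned int *restrict dst) {
+;     long i = 0;
+;     do {
+;       *dst++ = *src++ << 7;
+;     } while (++i != 257);
+;     return i;  /* induction used outside the loop */
+;   }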

Added: llvm/trunk/test/Transforms/LoopVectorize/X86/strided_load_cost.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/X86/strided_load_cost.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/X86/strided_load_cost.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/X86/strided_load_cost.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,54 @@
+; This test checks that the given loop is still beneficial to vectorize
+; even though it contains a scalarized load (a gather on AVX2).
+; RUN: opt < %s -loop-vectorize -S -o - | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Function Attrs: norecurse nounwind readonly uwtable
+define i32 @matrix_row_col([100 x i32]* nocapture readonly %data, i32 %i, i32 %j) local_unnamed_addr #0 {
+entry:
+  %idxprom = sext i32 %i to i64
+  %idxprom5 = sext i32 %j to i64
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body
+  ret i32 %add7
+
+for.body:                                         ; preds = %for.body, %entry
+  ; The loop gets vectorized.
+  ; The first, consecutive load becomes a single wide vector load:
+  ; CHECK: %wide.load = load <8 x i32>
+  ; The second, strided load is scalarized into eight scalar loads:
+  ; CHECK: load i32
+  ; CHECK: load i32
+  ; CHECK: load i32
+  ; CHECK: load i32
+  ; CHECK: load i32
+  ; CHECK: load i32
+  ; CHECK: load i32
+  ; CHECK: load i32
+
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %sum.015 = phi i32 [ 0, %entry ], [ %add7, %for.body ]
+  %arrayidx2 = getelementptr inbounds [100 x i32], [100 x i32]* %data, i64 %idxprom, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx2, align 4, !tbaa !1
+  %arrayidx6 = getelementptr inbounds [100 x i32], [100 x i32]* %data, i64 %indvars.iv, i64 %idxprom5
+  %1 = load i32, i32* %arrayidx6, align 4, !tbaa !1
+  %mul = mul nsw i32 %1, %0
+  %add = add i32 %sum.015, 4
+  %add7 = add i32 %add, %mul
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 100
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+attributes #0 = { "target-cpu"="core-avx2" "target-features"="+avx,+avx2,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3" }
+
+!llvm.ident = !{!0}
+
+!0 = !{!"clang version 4.0.0 (cfe/trunk 284570)"}
+!1 = !{!2, !2, i64 0}
+!2 = !{!"int", !3, i64 0}
+!3 = !{!"omnipotent char", !4, i64 0}
+!4 = !{!"Simple C/C++ TBAA"}
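+
+; A plausible C source for the loop above (reconstructed from the IR; an
+; assumption): one operand streams along a row (consecutive, the wide
+; load) while the other walks down a column (stride 100, scalarized into
+; eight scalar loads on AVX2).
+;
+;   int matrix_row_col(const int data[100][100], int i, int j) {
+;     int sum = 0;
+;     for (int k = 0; k < 100; k++)
+;       sum += 4 + data[i][k] * data[k][j];
+;     return sum;
+;   }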

Added: llvm/trunk/test/Transforms/LoopVectorize/X86/struct-store.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/X86/struct-store.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/X86/struct-store.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/X86/struct-store.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,27 @@
+; RUN: opt < %s -loop-vectorize -mtriple=x86_64-unknown-linux-gnu -S
+
+; Make sure we are not crashing on this one.
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+ at glbl = external global [16 x { i64, i64 }], align 16
+
+declare void @fn()
+
+define void @test() {
+entry:
+  br label %loop
+
+loop:
+  %indvars.iv = phi i64 [ %indvars.iv.next, %loop ], [ 0, %entry ]
+  %tmp = getelementptr inbounds [16 x { i64, i64 }], [16 x { i64, i64 }]* @glbl, i64 0, i64 %indvars.iv
+  store { i64, i64 } { i64 ptrtoint (void ()* @fn to i64), i64 0 }, { i64, i64 }* %tmp, align 16
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp ne i32 %lftr.wideiv, 16
+  br i1 %exitcond, label %loop, label %exit
+
+exit:
+  ret void
+}
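+
+; A rough C analogue (an assumption reconstructed from the IR): each
+; iteration stores a two-field aggregate whose first field is a function
+; pointer, which the vectorizer must bail on gracefully rather than crash.
+;
+;   #include <stdint.h>
+;   struct pair { uint64_t fn; uint64_t val; };
+;   extern struct pair glbl[16];
+;   void fn(void);
+;   void test(void) {
+;     for (int i = 0; i < 16; i++)
+;       glbl[i] = (struct pair){ (uint64_t)(uintptr_t)&fn, 0 };
+;   }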

Added: llvm/trunk/test/Transforms/LoopVectorize/X86/svml-calls-finite.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/X86/svml-calls-finite.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/X86/svml-calls-finite.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/X86/svml-calls-finite.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,187 @@
+; RUN: opt -vector-library=SVML -loop-vectorize -S < %s | FileCheck %s
+
+; Test to verify that when math headers are built with
+; __FINITE_MATH_ONLY__ enabled, causing the __<func>_finite function
+; versions to be used, vectorization can still map these calls to the
+; SVML vector versions.
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+declare float @__expf_finite(float) #0
+
+; CHECK-LABEL: @exp_f32
+; CHECK: <4 x float> @__svml_expf4
+; CHECK: ret
+define void @exp_f32(float* nocapture %varray) {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %tmp = trunc i64 %indvars.iv to i32
+  %conv = sitofp i32 %tmp to float
+  %call = tail call fast float @__expf_finite(float %conv)
+  %arrayidx = getelementptr inbounds float, float* %varray, i64 %indvars.iv
+  store float %call, float* %arrayidx, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !1
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+!1 = distinct !{!1, !2, !3}
+!2 = !{!"llvm.loop.vectorize.width", i32 4}
+!3 = !{!"llvm.loop.vectorize.enable", i1 true}
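+
+; A hedged C sketch of how IR like @exp_f32 arises: compiling with glibc
+; math headers and -ffinite-math-only (or -ffast-math) makes a plain
+; expf() call compile to @__expf_finite, which -vector-library=SVML can
+; then vectorize to @__svml_expf4.
+;
+;   #include <math.h>
+;   void exp_f32(float *varray) {
+;     for (int i = 0; i < 1000; i++)
+;       varray[i] = expf((float)i);   /* emitted as @__expf_finite */
+;   }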
+
+
+declare double @__exp_finite(double) #0
+
+; CHECK-LABEL: @exp_f64
+; CHECK: <4 x double> @__svml_exp4
+; CHECK: ret
+define void @exp_f64(double* nocapture %varray) {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %tmp = trunc i64 %indvars.iv to i32
+  %conv = sitofp i32 %tmp to double
+  %call = tail call fast double @__exp_finite(double %conv)
+  %arrayidx = getelementptr inbounds double, double* %varray, i64 %indvars.iv
+  store double %call, double* %arrayidx, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !11
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+!11 = distinct !{!11, !12, !13}
+!12 = !{!"llvm.loop.vectorize.width", i32 4}
+!13 = !{!"llvm.loop.vectorize.enable", i1 true}
+
+
+
+
+declare float @__logf_finite(float) #0
+
+; CHECK-LABEL: @log_f32
+; CHECK: <4 x float> @__svml_logf4
+; CHECK: ret
+define void @log_f32(float* nocapture %varray) {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %tmp = trunc i64 %indvars.iv to i32
+  %conv = sitofp i32 %tmp to float
+  %call = tail call fast float @__logf_finite(float %conv)
+  %arrayidx = getelementptr inbounds float, float* %varray, i64 %indvars.iv
+  store float %call, float* %arrayidx, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !21
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+!21 = distinct !{!21, !22, !23}
+!22 = !{!"llvm.loop.vectorize.width", i32 4}
+!23 = !{!"llvm.loop.vectorize.enable", i1 true}
+
+
+declare double @__log_finite(double) #0
+
+; CHECK-LABEL: @log_f64
+; CHECK: <4 x double> @__svml_log4
+; CHECK: ret
+define void @log_f64(double* nocapture %varray) {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %tmp = trunc i64 %indvars.iv to i32
+  %conv = sitofp i32 %tmp to double
+  %call = tail call fast double @__log_finite(double %conv)
+  %arrayidx = getelementptr inbounds double, double* %varray, i64 %indvars.iv
+  store double %call, double* %arrayidx, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !31
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+!31 = distinct !{!31, !32, !33}
+!32 = !{!"llvm.loop.vectorize.width", i32 4}
+!33 = !{!"llvm.loop.vectorize.enable", i1 true}
+
+
+declare float @__powf_finite(float, float) #0
+
+; CHECK-LABEL: @pow_f32
+; CHECK: <4 x float> @__svml_powf4
+; CHECK: ret
+define void @pow_f32(float* nocapture %varray, float* nocapture readonly %exp) {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %tmp = trunc i64 %indvars.iv to i32
+  %conv = sitofp i32 %tmp to float
+  %arrayidx = getelementptr inbounds float, float* %exp, i64 %indvars.iv
+  %tmp1 = load float, float* %arrayidx, align 4
+  %tmp2 = tail call fast float @__powf_finite(float %conv, float %tmp1)
+  %arrayidx2 = getelementptr inbounds float, float* %varray, i64 %indvars.iv
+  store float %tmp2, float* %arrayidx2, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !41
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+!41 = distinct !{!41, !42, !43}
+!42 = !{!"llvm.loop.vectorize.width", i32 4}
+!43 = !{!"llvm.loop.vectorize.enable", i1 true}
+
+
+declare double @__pow_finite(double, double) #0
+
+; CHECK-LABEL: @pow_f64
+; CHECK: <4 x double> @__svml_pow4
+; CHECK: ret
+define void @pow_f64(double* nocapture %varray, double* nocapture readonly %exp) {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %tmp = trunc i64 %indvars.iv to i32
+  %conv = sitofp i32 %tmp to double
+  %arrayidx = getelementptr inbounds double, double* %exp, i64 %indvars.iv
+  %tmp1 = load double, double* %arrayidx, align 4
+  %tmp2 = tail call fast double @__pow_finite(double %conv, double %tmp1)
+  %arrayidx2 = getelementptr inbounds double, double* %varray, i64 %indvars.iv
+  store double %tmp2, double* %arrayidx2, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !51
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+!51 = distinct !{!51, !52, !53}
+!52 = !{!"llvm.loop.vectorize.width", i32 4}
+!53 = !{!"llvm.loop.vectorize.enable", i1 true}

Added: llvm/trunk/test/Transforms/LoopVectorize/X86/svml-calls.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/X86/svml-calls.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/X86/svml-calls.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/X86/svml-calls.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,501 @@
+; RUN: opt -vector-library=SVML -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -mattr=avx -S < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+declare double @sin(double) #0
+declare float @sinf(float) #0
+declare double @llvm.sin.f64(double) #0
+declare float @llvm.sin.f32(float) #0
+
+declare double @cos(double) #0
+declare float @cosf(float) #0
+declare double @llvm.cos.f64(double) #0
+declare float @llvm.cos.f32(float) #0
+
+declare double @pow(double, double) #0
+declare float @powf(float, float) #0
+declare double @llvm.pow.f64(double, double) #0
+declare float @llvm.pow.f32(float, float) #0
+
+declare double @exp(double) #0
+declare float @expf(float) #0
+declare double @llvm.exp.f64(double) #0
+declare float @llvm.exp.f32(float) #0
+
+declare double @log(double) #0
+declare float @logf(float) #0
+declare double @llvm.log.f64(double) #0
+declare float @llvm.log.f32(float) #0
+
+
+define void @sin_f64(double* nocapture %varray) {
+; CHECK-LABEL: @sin_f64(
+; CHECK:    [[TMP5:%.*]] = call <4 x double> @__svml_sin4(<4 x double> [[TMP4:%.*]])
+; CHECK:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to double
+  %call = tail call double @sin(double %conv)
+  %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
+  store double %call, double* %arrayidx, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+define void @sin_f32(float* nocapture %varray) {
+; CHECK-LABEL: @sin_f32(
+; CHECK:    [[TMP5:%.*]] = call <4 x float> @__svml_sinf4(<4 x float> [[TMP4:%.*]])
+; CHECK:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to float
+  %call = tail call float @sinf(float %conv)
+  %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
+  store float %call, float* %arrayidx, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+define void @sin_f64_intrinsic(double* nocapture %varray) {
+; CHECK-LABEL: @sin_f64_intrinsic(
+; CHECK:    [[TMP5:%.*]] = call <4 x double> @__svml_sin4(<4 x double> [[TMP4:%.*]])
+; CHECK:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to double
+  %call = tail call double @llvm.sin.f64(double %conv)
+  %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
+  store double %call, double* %arrayidx, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+define void @sin_f32_intrinsic(float* nocapture %varray) {
+; CHECK-LABEL: @sin_f32_intrinsic(
+; CHECK:    [[TMP5:%.*]] = call <4 x float> @__svml_sinf4(<4 x float> [[TMP4:%.*]])
+; CHECK:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to float
+  %call = tail call float @llvm.sin.f32(float %conv)
+  %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
+  store float %call, float* %arrayidx, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+define void @cos_f64(double* nocapture %varray) {
+; CHECK-LABEL: @cos_f64(
+; CHECK:    [[TMP5:%.*]] = call <4 x double> @__svml_cos4(<4 x double> [[TMP4:%.*]])
+; CHECK:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to double
+  %call = tail call double @cos(double %conv)
+  %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
+  store double %call, double* %arrayidx, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+define void @cos_f32(float* nocapture %varray) {
+; CHECK-LABEL: @cos_f32(
+; CHECK:    [[TMP5:%.*]] = call <4 x float> @__svml_cosf4(<4 x float> [[TMP4:%.*]])
+; CHECK:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to float
+  %call = tail call float @cosf(float %conv)
+  %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
+  store float %call, float* %arrayidx, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+define void @cos_f64_intrinsic(double* nocapture %varray) {
+; CHECK-LABEL: @cos_f64_intrinsic(
+; CHECK:    [[TMP5:%.*]] = call <4 x double> @__svml_cos4(<4 x double> [[TMP4:%.*]])
+; CHECK:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to double
+  %call = tail call double @llvm.cos.f64(double %conv)
+  %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
+  store double %call, double* %arrayidx, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+define void @cos_f32_intrinsic(float* nocapture %varray) {
+; CHECK-LABEL: @cos_f32_intrinsic(
+; CHECK:    [[TMP5:%.*]] = call <4 x float> @__svml_cosf4(<4 x float> [[TMP4:%.*]])
+; CHECK:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to float
+  %call = tail call float @llvm.cos.f32(float %conv)
+  %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
+  store float %call, float* %arrayidx, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+define void @pow_f64(double* nocapture %varray, double* nocapture readonly %exp) {
+; CHECK-LABEL: @pow_f64(
+; CHECK:    [[TMP8:%.*]] = call <4 x double> @__svml_pow4(<4 x double> [[TMP4:%.*]], <4 x double> [[WIDE_LOAD:%.*]])
+; CHECK:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to double
+  %arrayidx = getelementptr inbounds double, double* %exp, i64 %iv
+  %tmp1 = load double, double* %arrayidx, align 4
+  %tmp2 = tail call double @pow(double %conv, double %tmp1)
+  %arrayidx2 = getelementptr inbounds double, double* %varray, i64 %iv
+  store double %tmp2, double* %arrayidx2, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+define void @pow_f64_intrinsic(double* nocapture %varray, double* nocapture readonly %exp) {
+; CHECK-LABEL: @pow_f64_intrinsic(
+; CHECK:    [[TMP8:%.*]] = call <4 x double> @__svml_pow4(<4 x double> [[TMP4:%.*]], <4 x double> [[WIDE_LOAD:%.*]])
+; CHECK:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to double
+  %arrayidx = getelementptr inbounds double, double* %exp, i64 %iv
+  %tmp1 = load double, double* %arrayidx, align 4
+  %tmp2 = tail call double @llvm.pow.f64(double %conv, double %tmp1)
+  %arrayidx2 = getelementptr inbounds double, double* %varray, i64 %iv
+  store double %tmp2, double* %arrayidx2, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+define void @pow_f32(float* nocapture %varray, float* nocapture readonly %exp) {
+; CHECK-LABEL: @pow_f32(
+; CHECK:    [[TMP8:%.*]] = call <4 x float> @__svml_powf4(<4 x float> [[TMP4:%.*]], <4 x float> [[WIDE_LOAD:%.*]])
+; CHECK:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to float
+  %arrayidx = getelementptr inbounds float, float* %exp, i64 %iv
+  %tmp1 = load float, float* %arrayidx, align 4
+  %tmp2 = tail call float @powf(float %conv, float %tmp1)
+  %arrayidx2 = getelementptr inbounds float, float* %varray, i64 %iv
+  store float %tmp2, float* %arrayidx2, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+define void @pow_f32_intrinsic(float* nocapture %varray, float* nocapture readonly %exp) {
+; CHECK-LABEL: @pow_f32_intrinsic(
+; CHECK:    [[TMP8:%.*]] = call <4 x float> @__svml_powf4(<4 x float> [[TMP4:%.*]], <4 x float> [[WIDE_LOAD:%.*]])
+; CHECK:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to float
+  %arrayidx = getelementptr inbounds float, float* %exp, i64 %iv
+  %tmp1 = load float, float* %arrayidx, align 4
+  %tmp2 = tail call float @llvm.pow.f32(float %conv, float %tmp1)
+  %arrayidx2 = getelementptr inbounds float, float* %varray, i64 %iv
+  store float %tmp2, float* %arrayidx2, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+define void @exp_f64(double* nocapture %varray) {
+; CHECK-LABEL: @exp_f64(
+; CHECK:    [[TMP5:%.*]] = call <4 x double> @__svml_exp4(<4 x double> [[TMP4:%.*]])
+; CHECK:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to double
+  %call = tail call double @exp(double %conv)
+  %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
+  store double %call, double* %arrayidx, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+define void @exp_f32(float* nocapture %varray) {
+; CHECK-LABEL: @exp_f32(
+; CHECK:    [[TMP5:%.*]] = call <4 x float> @__svml_expf4(<4 x float> [[TMP4:%.*]])
+; CHECK:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to float
+  %call = tail call float @expf(float %conv)
+  %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
+  store float %call, float* %arrayidx, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+define void @exp_f64_intrinsic(double* nocapture %varray) {
+; CHECK-LABEL: @exp_f64_intrinsic(
+; CHECK:    [[TMP5:%.*]] = call <4 x double> @__svml_exp4(<4 x double> [[TMP4:%.*]])
+; CHECK:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to double
+  %call = tail call double @llvm.exp.f64(double %conv)
+  %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
+  store double %call, double* %arrayidx, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+define void @exp_f32_intrinsic(float* nocapture %varray) {
+; CHECK-LABEL: @exp_f32_intrinsic(
+; CHECK:    [[TMP5:%.*]] = call <4 x float> @__svml_expf4(<4 x float> [[TMP4:%.*]])
+; CHECK:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to float
+  %call = tail call float @llvm.exp.f32(float %conv)
+  %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
+  store float %call, float* %arrayidx, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+define void @log_f64(double* nocapture %varray) {
+; CHECK-LABEL: @log_f64(
+; CHECK:    [[TMP5:%.*]] = call <4 x double> @__svml_log4(<4 x double> [[TMP4:%.*]])
+; CHECK:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to double
+  %call = tail call double @log(double %conv)
+  %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
+  store double %call, double* %arrayidx, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+define void @log_f32(float* nocapture %varray) {
+; CHECK-LABEL: @log_f32(
+; CHECK:    [[TMP5:%.*]] = call <4 x float> @__svml_logf4(<4 x float> [[TMP4:%.*]])
+; CHECK:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to float
+  %call = tail call float @logf(float %conv)
+  %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
+  store float %call, float* %arrayidx, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+define void @log_f64_intrinsic(double* nocapture %varray) {
+; CHECK-LABEL: @log_f64_intrinsic(
+; CHECK:    [[TMP5:%.*]] = call <4 x double> @__svml_log4(<4 x double> [[TMP4:%.*]])
+; CHECK:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to double
+  %call = tail call double @llvm.log.f64(double %conv)
+  %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
+  store double %call, double* %arrayidx, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+define void @log_f32_intrinsic(float* nocapture %varray) {
+; CHECK-LABEL: @log_f32_intrinsic(
+; CHECK:    [[TMP5:%.*]] = call <4 x float> @__svml_logf4(<4 x float> [[TMP4:%.*]])
+; CHECK:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to float
+  %call = tail call float @llvm.log.f32(float %conv)
+  %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
+  store float %call, float* %arrayidx, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+attributes #0 = { nounwind readnone }
+

Added: llvm/trunk/test/Transforms/LoopVectorize/X86/tripcount.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/X86/tripcount.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/X86/tripcount.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/X86/tripcount.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,39 @@
+; RUN: opt -S -loop-vectorize -force-vector-width=2 -force-vector-interleave=1 -mcpu=prescott < %s | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32-n8:16:32-S128"
+target triple = "i386-unknown-freebsd11.0"
+
+ at big = external global [0 x i32]
+
+; PR18049
+; We need to truncate the exit count to i32. This is legal because the
+; arithmetic is signed (%inc is nsw).
+
+; CHECK-LABEL: tripcount
+; CHECK: trunc i64 %count to i32
+
+define void @tripcount(i64 %count) {
+entry:
+  %cmp6 = icmp sgt i64 %count, 0
+  br i1 %cmp6, label %for.body.preheader, label %for.end
+
+for.body.preheader:
+  br label %for.body
+
+for.body:
+  %i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds [0 x i32], [0 x i32]* @big, i32 0, i32 %i.07
+  %0 = load i32, i32* %arrayidx, align 4
+  %neg = xor i32 %0, -1
+  store i32 %neg, i32* %arrayidx, align 4
+  %inc = add nsw i32 %i.07, 1
+  %conv = sext i32 %inc to i64
+  %cmp = icmp slt i64 %conv, %count
+  br i1 %cmp, label %for.body, label %for.end.loopexit
+
+for.end.loopexit:
+  br label %for.end
+
+for.end:
+  ret void
+}
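+
+; A hedged C analogue: the counter is a signed 32-bit int compared against
+; a 64-bit count, so %inc carries nsw and the vectorizer may legally
+; truncate the 64-bit exit count to i32.
+;
+;   extern int big[];
+;   void tripcount(long long count) {
+;     for (int i = 0; (long long)i < count; i++)
+;       big[i] = ~big[i];
+;   }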

Added: llvm/trunk/test/Transforms/LoopVectorize/X86/uint64_to_fp64-cost-model.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/X86/uint64_to_fp64-cost-model.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/X86/uint64_to_fp64-cost-model.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/X86/uint64_to_fp64-cost-model.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,27 @@
+; RUN: opt < %s  -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx -S -debug-only=loop-vectorize 2>&1 | FileCheck %s
+; REQUIRES: asserts
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+
+; CHECK: cost of 4 for VF 1 For instruction:   %conv = uitofp i64 %tmp to double
+; CHECK: cost of 5 for VF 2 For instruction:   %conv = uitofp i64 %tmp to double
+; CHECK: cost of 6 for VF 4 For instruction:   %conv = uitofp i64 %tmp to double
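+;
+; A rough C equivalent of the kernel being costed (a sketch inferred from the
+; IR):
+;
+;   void uint64_to_double_cost(unsigned long long *a, double *b) {
+;     for (int i = 0; i < 256; ++i)
+;       b[i] = (double)a[i];  /* u64 -> f64 has no single AVX instruction */
+;   }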
+define void @uint64_to_double_cost(i64* noalias nocapture %a, double* noalias nocapture readonly %b) nounwind {
+entry:
+  br label %for.body
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i64, i64* %a, i64 %indvars.iv
+  %tmp = load i64, i64* %arrayidx, align 4
+  %conv = uitofp i64 %tmp to double
+  %arrayidx2 = getelementptr inbounds double, double* %b, i64 %indvars.iv
+  store double %conv, double* %arrayidx2, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 256
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}

Added: llvm/trunk/test/Transforms/LoopVectorize/X86/uniform-phi.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/X86/uniform-phi.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/X86/uniform-phi.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/X86/uniform-phi.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,99 @@
+; RUN: opt < %s  -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 -debug-only=loop-vectorize -S 2>&1 | FileCheck %s
+; REQUIRES: asserts
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; CHECK-LABEL: test
+; CHECK-DAG: LV: Found uniform instruction:   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+; CHECK-DAG: LV: Found uniform instruction:   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+; CHECK-DAG: LV: Found uniform instruction:   %exitcond = icmp eq i64 %indvars.iv, 1599
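+;
+; The induction variable, its increment, and the exit compare are used only
+; for address computation and control flow, which are identical across vector
+; lanes, so the vectorizer can keep them as scalar (uniform) instructions.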
+
+define void @test(float* noalias nocapture %a, float* noalias nocapture readonly %b) #0 {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %b, i64 %indvars.iv
+  %tmp0 = load float, float* %arrayidx, align 4
+  %add = fadd float %tmp0, 1.000000e+00
+  %arrayidx5 = getelementptr inbounds float, float* %a, i64 %indvars.iv
+  store float %add, float* %arrayidx5, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv, 1599
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+; CHECK-LABEL: foo
+; CHECK-DAG: LV: Found uniform instruction:   %cond = icmp eq i64 %i.next, %n
+; CHECK-DAG: LV: Found uniform instruction:   %tmp1 = getelementptr inbounds i32, i32* %a, i32 %tmp0
+; CHECK-NOT: LV: Found uniform instruction:   %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
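+;
+; %i must not be uniform here: its truncated value %tmp0 is stored to memory
+; as data, so each vector lane needs its own value of the induction variable.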
+
+define void @foo(i32* %a, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+  %tmp0 = trunc i64 %i to i32
+  %tmp1 = getelementptr inbounds i32, i32* %a, i32 %tmp0
+  store i32 %tmp0, i32* %tmp1, align 4
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp eq i64 %i.next, %n
+  br i1 %cond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+; CHECK-LABEL: goo
+; Check that %indvars.iv and %indvars.iv.next are uniform instructions even if they are used outside of the loop.
+; CHECK-DAG: LV: Found uniform instruction:   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+; CHECK-DAG: LV: Found uniform instruction:   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+; CHECK-DAG: LV: Found uniform instruction:   %exitcond = icmp eq i64 %indvars.iv, 1599
+
+define i64 @goo(float* noalias nocapture %a, float* noalias nocapture readonly %b) #0 {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %b, i64 %indvars.iv
+  %tmp0 = load float, float* %arrayidx, align 4
+  %add = fadd float %tmp0, 1.000000e+00
+  %arrayidx5 = getelementptr inbounds float, float* %a, i64 %indvars.iv
+  store float %add, float* %arrayidx5, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv, 1599
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  %retval = add i64 %indvars.iv, %indvars.iv.next
+  ret i64 %retval
+}
+
+; CHECK-LABEL: PR38786
+; Check that the first-order recurrence phis (%phi32 and %phi64) are not uniform.
+; CHECK-NOT: LV: Found uniform instruction:   %phi
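+; (%phi64 is used directly as a store index and both phis advance through
+; %i32next, so each vector lane needs its own phi value.)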
+define void @PR38786(double* %y, double* %x, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %phi32 = phi i32 [ 0, %entry ], [ %i32next, %for.body ]
+  %phi64 = phi i64 [ 0, %entry ], [ %i64next, %for.body ]
+  %i32next = add i32 %phi32, 1
+  %i64next = zext i32 %i32next to i64
+  %xip = getelementptr inbounds double, double* %x, i64 %i64next
+  %yip = getelementptr inbounds double, double* %y, i64 %phi64
+  %xi = load double, double* %xip, align 8
+  store double %xi, double* %yip, align 8
+  %cmp = icmp slt i64 %i64next, %n
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:
+  ret void
+}

Added: llvm/trunk/test/Transforms/LoopVectorize/X86/uniform_load.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/X86/uniform_load.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/X86/uniform_load.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/X86/uniform_load.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,47 @@
+; RUN: opt -basicaa -loop-vectorize -S -mcpu=core-avx2 < %s | FileCheck %s
+
+;float inc = 0.5;
+;void foo(float *A, unsigned N) {
+;
+;  for (unsigned i=0; i<N; i++){
+;    A[i] += inc;
+;  }
+;}
+
+; CHECK-LABEL: foo
+; CHECK: vector.body
+; CHECK: load <8 x float>
+; CHECK: fadd <8 x float>
+; CHECK: store <8 x float>
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+ at inc = global float 5.000000e-01, align 4
+
+define void @foo(float* nocapture %A, i32 %N) #0 {
+entry:
+  %cmp3 = icmp eq i32 %N, 0
+  br i1 %cmp3, label %for.end, label %for.body.preheader
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
+  %0 = load float, float* @inc, align 4
+  %arrayidx = getelementptr inbounds float, float* %A, i64 %indvars.iv
+  %1 = load float, float* %arrayidx, align 4
+  %add = fadd float %0, %1
+  store float %add, float* %arrayidx, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %N
+  br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:                                 ; preds = %for.body
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %entry
+  ret void
+}

Added: llvm/trunk/test/Transforms/LoopVectorize/X86/uniformshift.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/X86/uniformshift.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/X86/uniformshift.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/X86/uniformshift.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,23 @@
+; RUN: opt -mtriple=x86_64-apple-darwin -mattr=+sse2 -loop-vectorize -debug-only=loop-vectorize -S < %s 2>&1 | FileCheck %s
+; REQUIRES: asserts
+
+; CHECK: "foo"
+; CHECK: LV: Found an estimated cost of 1 for VF 4 For instruction:   %shift = ashr i32 %val, %k
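+;
+; A rough C equivalent (a sketch inferred from the IR):
+;
+;   void foo(int *p, int k) {
+;     for (long long i = 0; i < 16; ++i)
+;       p[i] >>= k;  /* shift amount is loop-invariant */
+;   }
+;
+; Because %k is the same for every lane, the <4 x i32> arithmetic shift can be
+; priced as a single cheap vector shift on SSE2.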
+define void @foo(i32* nocapture %p, i32 %k) local_unnamed_addr #0 {
+entry:
+  br label %body
+
+body:
+  %i = phi i64 [ 0, %entry ], [ %next, %body ]
+  %ptr = getelementptr inbounds i32, i32* %p, i64 %i
+  %val = load i32, i32* %ptr, align 4
+  %shift = ashr i32 %val, %k
+  store i32 %shift, i32* %ptr, align 4
+  %next = add nuw nsw i64 %i, 1
+  %cmp = icmp eq i64 %next, 16
+  br i1 %cmp, label %exit, label %body
+
+exit:
+  ret void
+}

Added: llvm/trunk/test/Transforms/LoopVectorize/X86/unroll-pm.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/X86/unroll-pm.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/X86/unroll-pm.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/X86/unroll-pm.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,31 @@
+; RUN: opt < %s -O2 -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx -force-vector-width=4 -S | FileCheck %s
+; RUN: opt < %s -O2 -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx -force-vector-width=4 -disable-loop-unrolling -S | FileCheck %s -check-prefix=CHECK-NOUNRL
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+;CHECK-LABEL: @bar(
+;CHECK: store <4 x i32>
+;CHECK: store <4 x i32>
+;CHECK: ret
+;CHECK-NOUNRL-LABEL: @bar(
+;CHECK-NOUNRL: store <4 x i32>
+;CHECK-NOUNRL-NOT: store <4 x i32>
+;CHECK-NOUNRL: ret
+define i32 @bar(i32* nocapture %A, i32 %n) nounwind uwtable ssp {
+  %1 = icmp sgt i32 %n, 0
+  br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph:                                           ; preds = %0, %.lr.ph
+  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
+  %2 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+  %3 = load i32, i32* %2, align 4
+  %4 = add nsw i32 %3, 6
+  store i32 %4, i32* %2, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge:                                      ; preds = %.lr.ph, %0
+  ret i32 undef
+}

Added: llvm/trunk/test/Transforms/LoopVectorize/X86/unroll-small-loops.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/X86/unroll-small-loops.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/X86/unroll-small-loops.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/X86/unroll-small-loops.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,102 @@
+; RUN: opt < %s  -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx -force-vector-width=4 -force-vector-interleave=0 -dce -S \
+; RUN:   | FileCheck %s --check-prefix=CHECK-VECTOR
+; RUN: opt < %s  -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx -force-vector-width=1 -force-vector-interleave=0 -dce -S \
+; RUN:   | FileCheck %s --check-prefix=CHECK-SCALAR
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+; We don't unroll this loop because it has a small constant trip count.
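+; (Roughly: for (int i = 0; i < 100; ++i) A[i] += 6; a sketch of the IR below.)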
+;
+; CHECK-VECTOR-LABEL: @foo(
+; CHECK-VECTOR: load <4 x i32>
+; CHECK-VECTOR-NOT: load <4 x i32>
+; CHECK-VECTOR: store <4 x i32>
+; CHECK-VECTOR-NOT: store <4 x i32>
+; CHECK-VECTOR: ret
+;
+; CHECK-SCALAR-LABEL: @foo(
+; CHECK-SCALAR: load i32, i32*
+; CHECK-SCALAR-NOT: load i32, i32*
+; CHECK-SCALAR: store i32
+; CHECK-SCALAR-NOT: store i32
+; CHECK-SCALAR: ret
+define i32 @foo(i32* nocapture %A) nounwind uwtable ssp {
+  br label %1
+
+; <label>:1                                       ; preds = %1, %0
+  %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ]
+  %2 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+  %3 = load i32, i32* %2, align 4
+  %4 = add nsw i32 %3, 6
+  store i32 %4, i32* %2, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 100
+  br i1 %exitcond, label %5, label %1
+
+; <label>:5                                       ; preds = %1
+  ret i32 undef
+}
+
+; But this is a good small loop to unroll as we don't know of a bound on its
+; trip count.
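+; (Roughly: for (int i = 0; i < n; ++i) A[i] += 6; with n unknown at compile
+; time.)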
+;
+; CHECK-VECTOR-LABEL: @bar(
+; CHECK-VECTOR: store <4 x i32>
+; CHECK-VECTOR: store <4 x i32>
+; CHECK-VECTOR: ret
+;
+; For x86, loop unrolling in the loop vectorizer is disabled when VF==1.
+;
+; CHECK-SCALAR-LABEL: @bar(
+; CHECK-SCALAR: store i32
+; CHECK-SCALAR-NOT: store i32
+; CHECK-SCALAR: ret
+define i32 @bar(i32* nocapture %A, i32 %n) nounwind uwtable ssp {
+  %1 = icmp sgt i32 %n, 0
+  br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph:                                           ; preds = %0, %.lr.ph
+  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
+  %2 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+  %3 = load i32, i32* %2, align 4
+  %4 = add nsw i32 %3, 6
+  store i32 %4, i32* %2, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge:                                      ; preds = %.lr.ph, %0
+  ret i32 undef
+}
+
+; Also unroll if a runtime check is needed, provided the check was going to be
+; added for vectorization anyway.
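+; (Roughly: for (int i = 0; i < 256; ++i) A[i] = B[i] * N; the A/B overlap
+; check is emitted for vectorization whether or not we interleave.)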
+; CHECK-VECTOR-LABEL: @runtime_chk(
+; CHECK-VECTOR: store <4 x float>
+; CHECK-VECTOR: store <4 x float>
+;
+; But not if the unrolling would introduce the runtime check.
+; CHECK-SCALAR-LABEL: @runtime_chk(
+; CHECK-SCALAR: store float
+; CHECK-SCALAR-NOT: store float
+define void @runtime_chk(float* %A, float* %B, float %N) {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %B, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %mul = fmul float %0, %N
+  %arrayidx2 = getelementptr inbounds float, float* %A, i64 %indvars.iv
+  store float %mul, float* %arrayidx2, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 256
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}

Added: llvm/trunk/test/Transforms/LoopVectorize/X86/unroll_selection.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/X86/unroll_selection.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/X86/unroll_selection.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/X86/unroll_selection.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,71 @@
+; RUN: opt < %s  -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx -force-vector-width=4 -force-vector-interleave=0 -dce -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+; Don't unroll when we have register pressure.
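+; The body below is one long chain of dependent floating-point operations, so
+; interleaving it would roughly double the number of simultaneously live
+; vector values.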
+;CHECK: reg_pressure
+;CHECK: load <4 x double>
+;CHECK-NOT: load  <4 x double>
+;CHECK: store <4 x double>
+;CHECK-NOT: store <4 x double>
+;CHECK: ret
+define void @reg_pressure(double* nocapture %A, i32 %n) nounwind uwtable ssp {
+  %1 = sext i32 %n to i64
+  br label %2
+
+; <label>:2                                       ; preds = %2, %0
+  %indvars.iv = phi i64 [ %indvars.iv.next, %2 ], [ %1, %0 ]
+  %3 = getelementptr inbounds double, double* %A, i64 %indvars.iv
+  %4 = load double, double* %3, align 8
+  %5 = fadd double %4, 3.000000e+00
+  %6 = fmul double %4, 2.000000e+00
+  %7 = fadd double %5, %6
+  %8 = fadd double %7, 2.000000e+00
+  %9 = fmul double %8, 5.000000e-01
+  %10 = fadd double %6, %9
+  %11 = fsub double %10, %5
+  %12 = fadd double %4, %11
+  %13 = fdiv double %8, %12
+  %14 = fmul double %13, %8
+  %15 = fmul double %6, %14
+  %16 = fmul double %5, %15
+  %17 = fadd double %16, -3.000000e+00
+  %18 = fsub double %4, %5
+  %19 = fadd double %6, %18
+  %20 = fadd double %13, %19
+  %21 = fadd double %20, %17
+  %22 = fadd double %21, 3.000000e+00
+  %23 = fmul double %4, %22
+  store double %23, double* %3, align 8
+  %indvars.iv.next = add i64 %indvars.iv, -1
+  %24 = trunc i64 %indvars.iv to i32
+  %25 = icmp eq i32 %24, 0
+  br i1 %25, label %26, label %2
+
+; <label>:26                                      ; preds = %2
+  ret void
+}
+
+; This is a small loop. Unroll it twice. 
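+; (Roughly: for (unsigned long long i = 0; i < n; ++i) A[i] ^= 3; on 16-bit
+; elements.)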
+;CHECK: small_loop
+;CHECK: xor
+;CHECK: xor
+;CHECK: ret
+define void @small_loop(i16* nocapture %A, i64 %n) nounwind uwtable ssp {
+  %1 = icmp eq i64 %n, 0
+  br i1 %1, label %._crit_edge, label %.lr.ph
+
+.lr.ph:                                           ; preds = %0, %.lr.ph
+  %i.01 = phi i64 [ %5, %.lr.ph ], [ 0, %0 ]
+  %2 = getelementptr inbounds i16, i16* %A, i64 %i.01
+  %3 = load i16, i16* %2, align 2
+  %4 = xor i16 %3, 3
+  store i16 %4, i16* %2, align 2
+  %5 = add i64 %i.01, 1
+  %exitcond = icmp eq i64 %5, %n
+  br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge:                                      ; preds = %.lr.ph, %0
+  ret void
+}

Added: llvm/trunk/test/Transforms/LoopVectorize/X86/veclib-calls.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/X86/veclib-calls.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/X86/veclib-calls.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/X86/veclib-calls.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,632 @@
+; RUN: opt < %s -vector-library=Accelerate -loop-vectorize -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
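+; Every function in this file exercises the same one-line kernel; roughly, as
+; a sketch:
+;
+;   for (int i = 0; i < n; ++i) x[i] = sqrtf(y[i]);
+;
+; with sqrtf replaced by the libm routine under test, checking whether
+; -vector-library=Accelerate maps the call to a <4 x float> vector routine.
+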
+;CHECK-LABEL: @sqrt_f32(
+;CHECK: vsqrtf{{.*}}<4 x float>
+;CHECK: ret void
+declare float @sqrtf(float) nounwind readnone
+define void @sqrt_f32(i32 %n, float* noalias %y, float* noalias %x) nounwind uwtable {
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds float, float* %y, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %call = tail call float @sqrtf(float %0) nounwind readnone
+  %arrayidx2 = getelementptr inbounds float, float* %x, i64 %indvars.iv
+  store float %call, float* %arrayidx2, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+;CHECK-LABEL: @exp_f32(
+;CHECK: vexpf{{.*}}<4 x float>
+;CHECK: ret void
+declare float @expf(float) nounwind readnone
+define void @exp_f32(i32 %n, float* noalias %y, float* noalias %x) nounwind uwtable {
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds float, float* %y, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %call = tail call float @expf(float %0) nounwind readnone
+  %arrayidx2 = getelementptr inbounds float, float* %x, i64 %indvars.iv
+  store float %call, float* %arrayidx2, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+;CHECK-LABEL: @log_f32(
+;CHECK: vlogf{{.*}}<4 x float>
+;CHECK: ret void
+declare float @logf(float) nounwind readnone
+define void @log_f32(i32 %n, float* noalias %y, float* noalias %x) nounwind uwtable {
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds float, float* %y, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %call = tail call float @logf(float %0) nounwind readnone
+  %arrayidx2 = getelementptr inbounds float, float* %x, i64 %indvars.iv
+  store float %call, float* %arrayidx2, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+; For the fabs call we generate the @llvm.fabs vector intrinsic, as it's cheaper than a library call.
+;CHECK-LABEL: @fabs_f32(
+;CHECK: fabs{{.*}}<4 x float>
+;CHECK: ret void
+declare float @fabsf(float) nounwind readnone
+define void @fabs_f32(i32 %n, float* noalias %y, float* noalias %x) nounwind uwtable {
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds float, float* %y, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %call = tail call float @fabsf(float %0) nounwind readnone
+  %arrayidx2 = getelementptr inbounds float, float* %x, i64 %indvars.iv
+  store float %call, float* %arrayidx2, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+; Test that we can vectorize an intrinsic call into a vector library call.
+;CHECK-LABEL: @exp_f32_intrin(
+;CHECK: vexpf{{.*}}<4 x float>
+;CHECK: ret void
+declare float @llvm.exp.f32(float) nounwind readnone
+define void @exp_f32_intrin(i32 %n, float* noalias %y, float* noalias %x) nounwind uwtable {
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds float, float* %y, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %call = tail call float @llvm.exp.f32(float %0) nounwind readnone
+  %arrayidx2 = getelementptr inbounds float, float* %x, i64 %indvars.iv
+  store float %call, float* %arrayidx2, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+; Test that we don't vectorize arbitrary functions.
+;CHECK-LABEL: @foo_f32(
+;CHECK-NOT: foo{{.*}}<4 x float>
+;CHECK: ret void
+declare float @foo(float) nounwind readnone
+define void @foo_f32(i32 %n, float* noalias %y, float* noalias %x) nounwind uwtable {
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds float, float* %y, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %call = tail call float @foo(float %0) nounwind readnone
+  %arrayidx2 = getelementptr inbounds float, float* %x, i64 %indvars.iv
+  store float %call, float* %arrayidx2, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+; Test that we don't vectorize calls with nobuiltin attribute.
+;CHECK-LABEL: @sqrt_f32_nobuiltin(
+;CHECK-NOT: vsqrtf{{.*}}<4 x float>
+;CHECK: ret void
+define void @sqrt_f32_nobuiltin(i32 %n, float* noalias %y, float* noalias %x) nounwind uwtable {
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds float, float* %y, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %call = tail call float @sqrtf(float %0) nounwind readnone nobuiltin
+  %arrayidx2 = getelementptr inbounds float, float* %x, i64 %indvars.iv
+  store float %call, float* %arrayidx2, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+;CHECK-LABEL: @ceil_f32(
+;CHECK: vceilf{{.*}}<4 x float>
+;CHECK: ret void
+declare float @ceilf(float) nounwind readnone
+define void @ceil_f32(i32 %n, float* noalias %y, float* noalias %x) nounwind uwtable {
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds float, float* %y, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %call = tail call float @ceilf(float %0) nounwind readnone
+  %arrayidx2 = getelementptr inbounds float, float* %x, i64 %indvars.iv
+  store float %call, float* %arrayidx2, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+;CHECK-LABEL: @floor_f32(
+;CHECK: vfloorf{{.*}}<4 x float>
+;CHECK: ret void
+declare float @floorf(float) nounwind readnone
+define void @floor_f32(i32 %n, float* noalias %y, float* noalias %x) nounwind uwtable {
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds float, float* %y, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %call = tail call float @floorf(float %0) nounwind readnone
+  %arrayidx2 = getelementptr inbounds float, float* %x, i64 %indvars.iv
+  store float %call, float* %arrayidx2, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+;CHECK-LABEL: @expm1_f32(
+;CHECK: vexpm1f{{.*}}<4 x float>
+;CHECK: ret void
+declare float @expm1f(float) nounwind readnone
+define void @expm1_f32(i32 %n, float* noalias %y, float* noalias %x) nounwind uwtable {
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds float, float* %y, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %call = tail call float @expm1f(float %0) nounwind readnone
+  %arrayidx2 = getelementptr inbounds float, float* %x, i64 %indvars.iv
+  store float %call, float* %arrayidx2, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+;CHECK-LABEL: @log1p_f32(
+;CHECK: vlog1pf{{.*}}<4 x float>
+;CHECK: ret void
+declare float @log1pf(float) nounwind readnone
+define void @log1p_f32(i32 %n, float* noalias %y, float* noalias %x) nounwind uwtable {
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds float, float* %y, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %call = tail call float @log1pf(float %0) nounwind readnone
+  %arrayidx2 = getelementptr inbounds float, float* %x, i64 %indvars.iv
+  store float %call, float* %arrayidx2, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+;CHECK-LABEL: @log10_f32(
+;CHECK: vlog10f{{.*}}<4 x float>
+;CHECK: ret void
+declare float @log10f(float) nounwind readnone
+define void @log10_f32(i32 %n, float* noalias %y, float* noalias %x) nounwind uwtable {
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds float, float* %y, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %call = tail call float @log10f(float %0) nounwind readnone
+  %arrayidx2 = getelementptr inbounds float, float* %x, i64 %indvars.iv
+  store float %call, float* %arrayidx2, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+;CHECK-LABEL: @logb_f32(
+;CHECK: vlogbf{{.*}}<4 x float>
+;CHECK: ret void
+declare float @logbf(float) nounwind readnone
+define void @logb_f32(i32 %n, float* noalias %y, float* noalias %x) nounwind uwtable {
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds float, float* %y, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %call = tail call float @logbf(float %0) nounwind readnone
+  %arrayidx2 = getelementptr inbounds float, float* %x, i64 %indvars.iv
+  store float %call, float* %arrayidx2, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+;CHECK-LABEL: @sin_f32(
+;CHECK: vsinf{{.*}}<4 x float>
+;CHECK: ret void
+declare float @sinf(float) nounwind readnone
+define void @sin_f32(i32 %n, float* noalias %y, float* noalias %x) nounwind uwtable {
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds float, float* %y, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %call = tail call float @sinf(float %0) nounwind readnone
+  %arrayidx2 = getelementptr inbounds float, float* %x, i64 %indvars.iv
+  store float %call, float* %arrayidx2, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+;CHECK-LABEL: @cos_f32(
+;CHECK: vcosf{{.*}}<4 x float>
+;CHECK: ret void
+declare float @cosf(float) nounwind readnone
+define void @cos_f32(i32 %n, float* noalias %y, float* noalias %x) nounwind uwtable {
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds float, float* %y, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %call = tail call float @cosf(float %0) nounwind readnone
+  %arrayidx2 = getelementptr inbounds float, float* %x, i64 %indvars.iv
+  store float %call, float* %arrayidx2, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+;CHECK-LABEL: @tan_f32(
+;CHECK: vtanf{{.*}}<4 x float>
+;CHECK: ret void
+declare float @tanf(float) nounwind readnone
+define void @tan_f32(i32 %n, float* noalias %y, float* noalias %x) nounwind uwtable {
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds float, float* %y, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %call = tail call float @tanf(float %0) nounwind readnone
+  %arrayidx2 = getelementptr inbounds float, float* %x, i64 %indvars.iv
+  store float %call, float* %arrayidx2, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+;CHECK-LABEL: @asin_f32(
+;CHECK: vasinf{{.*}}<4 x float>
+;CHECK: ret void
+declare float @asinf(float) nounwind readnone
+define void @asin_f32(i32 %n, float* noalias %y, float* noalias %x) nounwind uwtable {
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds float, float* %y, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %call = tail call float @asinf(float %0) nounwind readnone
+  %arrayidx2 = getelementptr inbounds float, float* %x, i64 %indvars.iv
+  store float %call, float* %arrayidx2, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+;CHECK-LABEL: @acos_f32(
+;CHECK: vacosf{{.*}}<4 x float>
+;CHECK: ret void
+declare float @acosf(float) nounwind readnone
+define void @acos_f32(i32 %n, float* noalias %y, float* noalias %x) nounwind uwtable {
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds float, float* %y, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %call = tail call float @acosf(float %0) nounwind readnone
+  %arrayidx2 = getelementptr inbounds float, float* %x, i64 %indvars.iv
+  store float %call, float* %arrayidx2, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+;CHECK-LABEL: @atan_f32(
+;CHECK: vatanf{{.*}}<4 x float>
+;CHECK: ret void
+declare float @atanf(float) nounwind readnone
+define void @atan_f32(i32 %n, float* noalias %y, float* noalias %x) nounwind uwtable {
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds float, float* %y, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %call = tail call float @atanf(float %0) nounwind readnone
+  %arrayidx2 = getelementptr inbounds float, float* %x, i64 %indvars.iv
+  store float %call, float* %arrayidx2, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+;CHECK-LABEL: @sinh_f32(
+;CHECK: vsinhf{{.*}}<4 x float>
+;CHECK: ret void
+declare float @sinhf(float) nounwind readnone
+define void @sinh_f32(i32 %n, float* noalias %y, float* noalias %x) nounwind uwtable {
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds float, float* %y, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %call = tail call float @sinhf(float %0) nounwind readnone
+  %arrayidx2 = getelementptr inbounds float, float* %x, i64 %indvars.iv
+  store float %call, float* %arrayidx2, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+;CHECK-LABEL: @cosh_f32(
+;CHECK: vcoshf{{.*}}<4 x float>
+;CHECK: ret void
+declare float @coshf(float) nounwind readnone
+define void @cosh_f32(i32 %n, float* noalias %y, float* noalias %x) nounwind uwtable {
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds float, float* %y, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %call = tail call float @coshf(float %0) nounwind readnone
+  %arrayidx2 = getelementptr inbounds float, float* %x, i64 %indvars.iv
+  store float %call, float* %arrayidx2, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+;CHECK-LABEL: @tanh_f32(
+;CHECK: vtanhf{{.*}}<4 x float>
+;CHECK: ret void
+declare float @tanhf(float) nounwind readnone
+define void @tanh_f32(i32 %n, float* noalias %y, float* noalias %x) nounwind uwtable {
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds float, float* %y, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %call = tail call float @tanhf(float %0) nounwind readnone
+  %arrayidx2 = getelementptr inbounds float, float* %x, i64 %indvars.iv
+  store float %call, float* %arrayidx2, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+;CHECK-LABEL: @asinh_f32(
+;CHECK: vasinhf{{.*}}<4 x float>
+;CHECK: ret void
+declare float @asinhf(float) nounwind readnone
+define void @asinh_f32(i32 %n, float* noalias %y, float* noalias %x) nounwind uwtable {
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds float, float* %y, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %call = tail call float @asinhf(float %0) nounwind readnone
+  %arrayidx2 = getelementptr inbounds float, float* %x, i64 %indvars.iv
+  store float %call, float* %arrayidx2, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+;CHECK-LABEL: @acosh_f32(
+;CHECK: vacoshf{{.*}}<4 x float>
+;CHECK: ret void
+declare float @acoshf(float) nounwind readnone
+define void @acosh_f32(i32 %n, float* noalias %y, float* noalias %x) nounwind uwtable {
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds float, float* %y, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %call = tail call float @acoshf(float %0) nounwind readnone
+  %arrayidx2 = getelementptr inbounds float, float* %x, i64 %indvars.iv
+  store float %call, float* %arrayidx2, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+;CHECK-LABEL: @atanh_f32(
+;CHECK: vatanhf{{.*}}<4 x float>
+;CHECK: ret void
+declare float @atanhf(float) nounwind readnone
+define void @atanh_f32(i32 %n, float* noalias %y, float* noalias %x) nounwind uwtable {
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds float, float* %y, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %call = tail call float @atanhf(float %0) nounwind readnone
+  %arrayidx2 = getelementptr inbounds float, float* %x, i64 %indvars.iv
+  store float %call, float* %arrayidx2, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}

Added: llvm/trunk/test/Transforms/LoopVectorize/X86/vect.omp.force.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/X86/vect.omp.force.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/X86/vect.omp.force.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/X86/vect.omp.force.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,87 @@
+; RUN: opt < %s  -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx -debug-only=loop-vectorize -stats -S 2>&1 | FileCheck %s
+; REQUIRES: asserts
+
+; CHECK: LV: Loop hints: force=enabled
+; CHECK: LV: Loop hints: force=?
+; No more loops in the module
+; CHECK-NOT: LV: Loop hints: force=
+; CHECK: 2 loop-vectorize               - Number of loops analyzed for vectorization
+; CHECK: 1 loop-vectorize               - Number of loops vectorized
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+;
+; The source code for the test:
+;
+; #include <math.h>
+; void foo(float* restrict A, float * restrict B)
+; {
+;   for (int i = 0; i < 1000; i+=2) A[i] = sinf(B[i]);
+; }
+;
+
+;
+; This loop will be vectorized even though the scalar cost is lower than any of the vector costs, because vectorization is explicitly forced via metadata.
+;
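+; (The force comes from !2 = !{!"llvm.loop.vectorize.enable", i1 true}, which
+; is attached to the loop backedge via !llvm.loop !1 below.)
+;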
+
+define void @vectorized(float* noalias nocapture %A, float* noalias nocapture %B) {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds float, float* %B, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4, !llvm.access.group !11
+  %call = tail call float @llvm.sin.f32(float %0)
+  %arrayidx2 = getelementptr inbounds float, float* %A, i64 %indvars.iv
+  store float %call, float* %arrayidx2, align 4, !llvm.access.group !11
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 1000
+  br i1 %exitcond, label %for.end.loopexit, label %for.body, !llvm.loop !1
+
+for.end.loopexit:
+  br label %for.end
+
+for.end:
+  ret void
+}
+
+!1 = !{!1, !2, !{!"llvm.loop.parallel_accesses", !11}}
+!2 = !{!"llvm.loop.vectorize.enable", i1 true}
+!11 = distinct !{}
+
+;
+; This function will not be vectorized, as the scalar cost is lower than any of the vector costs.
+;
+
+define void @not_vectorized(float* noalias nocapture %A, float* noalias nocapture %B) {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds float, float* %B, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4, !llvm.access.group !13
+  %call = tail call float @llvm.sin.f32(float %0)
+  %arrayidx2 = getelementptr inbounds float, float* %A, i64 %indvars.iv
+  store float %call, float* %arrayidx2, align 4, !llvm.access.group !13
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 1000
+  br i1 %exitcond, label %for.end.loopexit, label %for.body, !llvm.loop !3
+
+for.end.loopexit:
+  br label %for.end
+
+for.end:
+  ret void
+}
+
+declare float @llvm.sin.f32(float) nounwind readnone
+
+; Dummy metadata
+!3 = !{!3, !{!"llvm.loop.parallel_accesses", !13}}
+!13 = distinct !{}
+

Added: llvm/trunk/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,217 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -loop-vectorize -mcpu=corei7-avx -S -vectorizer-min-trip-count=21 | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux"
+
+;
+; The source code for the test:
+;
+; void foo(float* restrict A, float* restrict B)
+; {
+;     for (int i = 0; i < 20; ++i) A[i] += B[i];
+; }
+;
+
+;
+; This loop will be vectorized even though the trip count is below the threshold, because vectorization is explicitly forced via metadata.
+;
+define void @vectorized(float* noalias nocapture %A, float* noalias nocapture readonly %B) {
+; CHECK-LABEL: @vectorized(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> undef, i64 [[INDEX]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[INDUCTION:%.*]] = add <8 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
+; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, float* [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float* [[TMP2]] to <8 x float>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x float>, <8 x float>* [[TMP3]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast float* [[TMP5]] to <8 x float>*
+; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <8 x float>, <8 x float>* [[TMP6]], align 4
+; CHECK-NEXT:    [[TMP7:%.*]] = fadd fast <8 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]]
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast float* [[TMP5]] to <8 x float>*
+; CHECK-NEXT:    store <8 x float> [[TMP7]], <8 x float>* [[TMP8]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 8
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
+; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !1
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 20, 16
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP10:%.*]] = load float, float* [[ARRAYIDX]], align 4, !llvm.access.group !0
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP11:%.*]] = load float, float* [[ARRAYIDX2]], align 4, !llvm.access.group !0
+; CHECK-NEXT:    [[ADD:%.*]] = fadd fast float [[TMP10]], [[TMP11]]
+; CHECK-NEXT:    store float [[ADD]], float* [[ARRAYIDX2]], align 4, !llvm.access.group !0
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 20
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !4
+; CHECK:       for.end:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %B, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4, !llvm.access.group !11
+  %arrayidx2 = getelementptr inbounds float, float* %A, i64 %indvars.iv
+  %1 = load float, float* %arrayidx2, align 4, !llvm.access.group !11
+  %add = fadd fast float %0, %1
+  store float %add, float* %arrayidx2, align 4, !llvm.access.group !11
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 20
+  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !1
+
+for.end:
+  ret void
+}
+
+!1 = !{!1, !2, !{!"llvm.loop.parallel_accesses", !11}}
+!2 = !{!"llvm.loop.vectorize.enable", i1 true}
+!11 = distinct !{}
+
+;
+; This loop will be vectorized even though the trip count is below the
+; threshold, because folding its tail means no scalar iterations are needed.
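+; The masked store in the CHECK lines below is the folded tail: the <8 x i1>
+; compare against <i64 19, ...> disables the lanes past the last iteration
+; instead of running a scalar remainder loop.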
+;
+define void @vectorized1(float* noalias nocapture %A, float* noalias nocapture readonly %B) {
+; CHECK-LABEL: @vectorized1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> undef, i64 [[INDEX]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[INDUCTION:%.*]] = add <8 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
+; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, float* [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float* [[TMP2]] to <8 x float>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x float>, <8 x float>* [[TMP3]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast float* [[TMP5]] to <8 x float>*
+; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <8 x float>, <8 x float>* [[TMP6]], align 4
+; CHECK-NEXT:    [[TMP7:%.*]] = fadd fast <8 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]]
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp ule <8 x i64> [[INDUCTION]], <i64 19, i64 19, i64 19, i64 19, i64 19, i64 19, i64 19, i64 19>
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast float* [[TMP5]] to <8 x float>*
+; CHECK-NEXT:    call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP7]], <8 x float>* [[TMP9]], i32 4, <8 x i1> [[TMP8]])
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 8
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 24
+; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !7
+; CHECK:       middle.block:
+; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK:       for.end:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %B, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4, !llvm.access.group !13
+  %arrayidx2 = getelementptr inbounds float, float* %A, i64 %indvars.iv
+  %1 = load float, float* %arrayidx2, align 4, !llvm.access.group !13
+  %add = fadd fast float %0, %1
+  store float %add, float* %arrayidx2, align 4, !llvm.access.group !13
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 20
+  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !3
+
+for.end:
+  ret void
+}
+
+!3 = !{!3, !{!"llvm.loop.parallel_accesses", !13}}
+!13 = distinct !{}
+
+;
+; This loop will be vectorized, although the trip count is below the
+; threshold, and no scalar iterations are needed since the trip count is an
+; exact multiple of the vectorization factor.
+;
+define void @vectorized2(float* noalias nocapture %A, float* noalias nocapture readonly %B) {
+; CHECK-LABEL: @vectorized2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> undef, i64 [[INDEX]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[INDUCTION:%.*]] = add <8 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
+; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, float* [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float* [[TMP2]] to <8 x float>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x float>, <8 x float>* [[TMP3]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast float* [[TMP5]] to <8 x float>*
+; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <8 x float>, <8 x float>* [[TMP6]], align 4
+; CHECK-NEXT:    [[TMP7:%.*]] = fadd fast <8 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]]
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast float* [[TMP5]] to <8 x float>*
+; CHECK-NEXT:    store <8 x float> [[TMP7]], <8 x float>* [[TMP8]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 8
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
+; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !10
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 16, 16
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP10:%.*]] = load float, float* [[ARRAYIDX]], align 4, !llvm.access.group !6
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP11:%.*]] = load float, float* [[ARRAYIDX2]], align 4, !llvm.access.group !6
+; CHECK-NEXT:    [[ADD:%.*]] = fadd fast float [[TMP10]], [[TMP11]]
+; CHECK-NEXT:    store float [[ADD]], float* [[ARRAYIDX2]], align 4, !llvm.access.group !6
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 16
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !11
+; CHECK:       for.end:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %B, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4, !llvm.access.group !13
+  %arrayidx2 = getelementptr inbounds float, float* %A, i64 %indvars.iv
+  %1 = load float, float* %arrayidx2, align 4, !llvm.access.group !13
+  %add = fadd fast float %0, %1
+  store float %add, float* %arrayidx2, align 4, !llvm.access.group !13
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 16
+  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !4
+
+for.end:
+  ret void
+}
+
+!4 = !{!4}
+

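A note on the hunk above: the tests pair a trip count of 20 (tail-folded) with a trip count of 16 (an exact multiple of the VF). A minimal C sketch of the source loop, with assumed function and parameter names (the original C source is not part of the test):

    /* With VF = 8, 20 iterations leave a 4-iteration remainder that is
       either run as a scalar epilogue or folded into the vector loop via a
       masked store; 16 iterations divide evenly and need neither. */
    void add_in_place(float *restrict A, const float *restrict B) {
      for (int i = 0; i < 20; ++i)
        A[i] += B[i];
    }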
Added: llvm/trunk/test/Transforms/LoopVectorize/X86/vector-scalar-select-cost.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/X86/vector-scalar-select-cost.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/X86/vector-scalar-select-cost.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/X86/vector-scalar-select-cost.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,66 @@
+; RUN: opt < %s  -loop-vectorize -mattr=+sse4.2 -debug-only=loop-vectorize 2>&1 -S | FileCheck %s
+; REQUIRES: asserts
+; Make sure we use the right select kind when querying select costs.
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+ at a = common global [2048 x i32] zeroinitializer, align 16
+ at b = common global [2048 x i32] zeroinitializer, align 16
+ at c = common global [2048 x i32] zeroinitializer, align 16
+
+; CHECK: Checking a loop in "scalarselect"
+define void @scalarselect(i1 %cond) {
+  br label %1
+
+; <label>:1
+  %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ]
+  %2 = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 %indvars.iv
+  %3 = load i32, i32* %2, align 4
+  %4 = getelementptr inbounds [2048 x i32], [2048 x i32]* @c, i64 0, i64 %indvars.iv
+  %5 = load i32, i32* %4, align 4
+  %6 = add nsw i32 %5, %3
+  %7 = getelementptr inbounds [2048 x i32], [2048 x i32]* @a, i64 0, i64 %indvars.iv
+
+; A scalar select has a cost of 1 on core2
+; CHECK: cost of 1 for VF 2 {{.*}}  select i1 %cond, i32 %6, i32 0
+
+  %sel = select i1 %cond, i32 %6, i32 zeroinitializer
+  store i32 %sel, i32* %7, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 256
+  br i1 %exitcond, label %8, label %1
+
+; <label>:8
+  ret void
+}
+
+; CHECK: Checking a loop in "vectorselect"
+define void @vectorselect(i1 %cond) {
+  br label %1
+
+; <label>:1
+  %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ]
+  %2 = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 %indvars.iv
+  %3 = load i32, i32* %2, align 4
+  %4 = getelementptr inbounds [2048 x i32], [2048 x i32]* @c, i64 0, i64 %indvars.iv
+  %5 = load i32, i32* %4, align 4
+  %6 = add nsw i32 %5, %3
+  %7 = getelementptr inbounds [2048 x i32], [2048 x i32]* @a, i64 0, i64 %indvars.iv
+  %8 = icmp ult i64 %indvars.iv, 8
+
+; A vector select has a cost of 1 on core2
+; CHECK: cost of 1 for VF 2 {{.*}}  select i1 %8, i32 %6, i32 0
+
+  %sel = select i1 %8, i32 %6, i32 zeroinitializer
+  store i32 %sel, i32* %7, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 256
+  br i1 %exitcond, label %9, label %1
+
+; <label>:9
+  ret void
+}
+

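A rough C analogue of the two kernels above (loop bodies reconstructed from the IR, so treat it as a sketch). In scalarselect the condition is loop-invariant, so one scalar i1 feeds the select in every lane; in vectorselect the condition depends on the induction variable and becomes a per-lane vector compare:

    int a[2048], b[2048], c[2048];

    void scalarselect(int cond) {      /* one condition shared by all lanes */
      for (int i = 0; i < 256; ++i)
        a[i] = cond ? b[i] + c[i] : 0;
    }

    void vectorselect(void) {          /* condition varies per lane with i */
      for (int i = 0; i < 256; ++i)
        a[i] = (i < 8) ? b[i] + c[i] : 0;
    }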
Added: llvm/trunk/test/Transforms/LoopVectorize/X86/vector_max_bandwidth.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/X86/vector_max_bandwidth.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/X86/vector_max_bandwidth.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/X86/vector_max_bandwidth.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,75 @@
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -mcpu=corei7-avx -debug-only=loop-vectorize -S < %s 2>&1 | FileCheck %s --check-prefix=CHECK-AVX1
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -mcpu=core-avx2 -debug-only=loop-vectorize -S < %s 2>&1 | FileCheck %s --check-prefix=CHECK-AVX2
+; REQUIRES: asserts
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+ at a = global [1000 x i8] zeroinitializer, align 16
+ at b = global [1000 x i8] zeroinitializer, align 16
+ at c = global [1000 x i8] zeroinitializer, align 16
+ at u = global [1000 x i32] zeroinitializer, align 16
+ at v = global [1000 x i32] zeroinitializer, align 16
+ at w = global [1000 x i32] zeroinitializer, align 16
+
+; Tests that the vectorization factor is determined by the smallest type in
+; the loop, rather than the widest, in order to maximize bandwidth when
+; -vectorizer-maximize-bandwidth is specified.
+;
+; CHECK-LABEL: foo
+; CHECK-AVX1: LV: Selecting VF: 16.
+; CHECK-AVX2: LV: Selecting VF: 32.
+define void @foo() {
+entry:
+  br label %for.body
+
+for.cond.cleanup:
+  ret void
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds [1000 x i8], [1000 x i8]* @b, i64 0, i64 %indvars.iv
+  %0 = load i8, i8* %arrayidx, align 1
+  %arrayidx2 = getelementptr inbounds [1000 x i8], [1000 x i8]* @c, i64 0, i64 %indvars.iv
+  %1 = load i8, i8* %arrayidx2, align 1
+  %add = add i8 %1, %0
+  %arrayidx6 = getelementptr inbounds [1000 x i8], [1000 x i8]* @a, i64 0, i64 %indvars.iv
+  store i8 %add, i8* %arrayidx6, align 1
+  %arrayidx8 = getelementptr inbounds [1000 x i32], [1000 x i32]* @v, i64 0, i64 %indvars.iv
+  %2 = load i32, i32* %arrayidx8, align 4
+  %arrayidx10 = getelementptr inbounds [1000 x i32], [1000 x i32]* @w, i64 0, i64 %indvars.iv
+  %3 = load i32, i32* %arrayidx10, align 4
+  %add11 = add nsw i32 %3, %2
+  %arrayidx13 = getelementptr inbounds [1000 x i32], [1000 x i32]* @u, i64 0, i64 %indvars.iv
+  store i32 %add11, i32* %arrayidx13, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1000
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+; We should not choose a VF larger than the constant trip count.
+; The chosen VF should be at most 16 (not the maximum possible vector
+; width, 32, for AVX2).
+define void @not_too_small_tc(i8* noalias nocapture %A, i8* noalias nocapture readonly %B) {
+; CHECK-LABEL: not_too_small_tc
+; CHECK-AVX2: LV: Selecting VF: 16.
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i8, i8* %B, i64 %indvars.iv
+  %l1 = load i8, i8* %arrayidx, align 4, !llvm.access.group !13
+  %arrayidx2 = getelementptr inbounds i8, i8* %A, i64 %indvars.iv
+  %l2 = load i8, i8* %arrayidx2, align 4, !llvm.access.group !13
+  %add = add i8 %l1, %l2
+  store i8 %add, i8* %arrayidx2, align 4, !llvm.access.group !13
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 16
+  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !4
+
+for.end:
+  ret void
+}
+!3 = !{!3, !{!"llvm.loop.parallel_accesses", !13}}
+!4 = !{!4}
+!13 = distinct !{}

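A C sketch of @foo above, reconstructed from the IR (not the original source). Because -vectorizer-maximize-bandwidth sizes the VF by the narrowest element type (i8 here), AVX1 selects VF=16 and AVX2 selects VF=32, and each i32 operation is simply split across several wide registers:

    unsigned char a[1000], b[1000], c[1000];
    int u[1000], v[1000], w[1000];

    void foo(void) {
      for (int i = 0; i < 1000; ++i) {
        a[i] = b[i] + c[i];   /* i8 elements set the vectorization factor */
        u[i] = v[i] + w[i];   /* i32 part executes as multiple vector ops */
      }
    }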
Added: llvm/trunk/test/Transforms/LoopVectorize/X86/vector_ptr_load_store.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/X86/vector_ptr_load_store.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/X86/vector_ptr_load_store.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/X86/vector_ptr_load_store.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,150 @@
+; RUN: opt -basicaa -loop-vectorize -mcpu=corei7-avx -debug -S < %s 2>&1 | FileCheck %s
+; REQUIRES: asserts
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+%0 = type { %0*, %1 }
+%1 = type { i8*, i32 }
+
+ at p = global [2048 x [8 x i32*]] zeroinitializer, align 16
+ at q = global [2048 x i16] zeroinitializer, align 16
+ at r = global [2048 x i16] zeroinitializer, align 16
+
+; Tests for the widest-type computation.
+; Ensure that we count the pointer store in the first test case: it is a
+; consecutive store of pointers, so it should count towards the widest
+; vector type.
+;
+; CHECK: test_consecutive_store
+; CHECK: The Smallest and Widest types: 64 / 64 bits.
+define void @test_consecutive_store(%0**, %0**, %0** nocapture) nounwind ssp uwtable align 2 {
+  %4 = load %0*, %0** %2, align 8
+  %5 = icmp eq %0** %0, %1
+  br i1 %5, label %12, label %6
+
+; <label>:6                                       ; preds = %3
+  br label %7
+
+; <label>:7                                       ; preds = %7, %6
+  %8 = phi %0** [ %0, %6 ], [ %9, %7 ]
+  store %0* %4, %0** %8, align 8
+  %9 = getelementptr inbounds %0*, %0** %8, i64 1
+  %10 = icmp eq %0** %9, %1
+  br i1 %10, label %11, label %7
+
+; <label>:11                                      ; preds = %7
+  br label %12
+
+; <label>:12                                      ; preds = %11, %3
+  ret void
+}
+
+; However, if a set of pointers is not stored to consecutive memory, we do
+; NOT count the store towards the widest vector type.
+; In the test case below we turn i16 values into pointers and store them
+; into an array of pointers, so the widest type considered should be i16.
+; int* p[2048][8];
+; short q[2048];
+;   for (int y = 0; y < 8; ++y)
+;     for (int i = 0; i < 1024; ++i) {
+;       p[i][y] = (int*) (1 + q[i]);
+;     }
+; CHECK: test_nonconsecutive_store
+; CHECK: The Smallest and Widest types: 16 / 16 bits.
+define void @test_nonconsecutive_store() nounwind ssp uwtable {
+  br label %1
+
+; <label>:1                                       ; preds = %14, %0
+  %2 = phi i64 [ 0, %0 ], [ %15, %14 ]
+  br label %3
+
+; <label>:3                                       ; preds = %3, %1
+  %4 = phi i64 [ 0, %1 ], [ %11, %3 ]
+  %5 = getelementptr inbounds [2048 x i16], [2048 x i16]* @q, i64 0, i64 %4
+  %6 = load i16, i16* %5, align 2
+  %7 = sext i16 %6 to i64
+  %8 = add i64 %7, 1
+  %9 = inttoptr i64 %8 to i32*
+  %10 = getelementptr inbounds [2048 x [8 x i32*]], [2048 x [8 x i32*]]* @p, i64 0, i64 %4, i64 %2
+  store i32* %9, i32** %10, align 8
+  %11 = add i64 %4, 1
+  %12 = trunc i64 %11 to i32
+  %13 = icmp ne i32 %12, 1024
+  br i1 %13, label %3, label %14
+
+; <label>:14                                      ; preds = %3
+  %15 = add i64 %2, 1
+  %16 = trunc i64 %15 to i32
+  %17 = icmp ne i32 %16, 8
+  br i1 %17, label %1, label %18
+
+; <label>:18                                      ; preds = %14
+  ret void
+}
+
+
+ at ia = global [1024 x i32*] zeroinitializer, align 16
+ at ib = global [1024 x i32] zeroinitializer, align 16
+ at ic = global [1024 x i8] zeroinitializer, align 16
+ at p2 = global [2048 x [8 x i32*]] zeroinitializer, align 16
+ at q2 = global [2048 x i16] zeroinitializer, align 16
+
+;; Now we check the same rules for loads. We should take consecutive loads of
+;; pointer types into account.
+; CHECK: test_consecutive_ptr_load
+; CHECK: The Smallest and Widest types: 8 / 64 bits.
+define i8 @test_consecutive_ptr_load() nounwind readonly ssp uwtable {
+  br label %1
+
+; <label>:1                                       ; preds = %1, %0
+  %2 = phi i64 [ 0, %0 ], [ %10, %1 ]
+  %3 = phi i8 [ 0, %0 ], [ %9, %1 ]
+  %4 = getelementptr inbounds [1024 x i32*], [1024 x i32*]* @ia, i32 0, i64 %2
+  %5 = load i32*, i32** %4, align 4
+  %6 = ptrtoint i32* %5 to i64
+  %7 = trunc i64 %6 to i8
+  %8 = add i8 %3, 1
+  %9 = add i8 %7, %8
+  %10 = add i64 %2, 1
+  %11 = icmp ne i64 %10, 1024
+  br i1 %11, label %1, label %12
+
+; <label>:12                                      ; preds = %1
+  %13 = phi i8 [ %9, %1 ]
+  ret i8 %13
+}
+
+;; However, we should not take non-consecutive loads of pointers into account.
+; CHECK: test_nonconsecutive_ptr_load
+; CHECK: LV: The Smallest and Widest types: 16 / 16 bits.
+define void @test_nonconsecutive_ptr_load() nounwind ssp uwtable {
+  br label %1
+
+; <label>:1                                       ; preds = %13, %0
+  %2 = phi i64 [ 0, %0 ], [ %14, %13 ]
+  br label %3
+
+; <label>:3                                       ; preds = %3, %1
+  %4 = phi i64 [ 0, %1 ], [ %10, %3 ]
+  %5 = getelementptr inbounds [2048 x [8 x i32*]], [2048 x [8 x i32*]]* @p2, i64 0, i64 %4, i64 %2
+  %6 = getelementptr inbounds [2048 x i16], [2048 x i16]* @q2, i64 0, i64 %4
+  %7 = load i32*, i32** %5, align 2
+  %8 = ptrtoint i32* %7 to i64
+  %9 = trunc i64 %8 to i16
+  store i16 %9, i16* %6, align 8
+  %10 = add i64 %4, 1
+  %11 = trunc i64 %10 to i32
+  %12 = icmp ne i32 %11, 1024
+  br i1 %12, label %3, label %13
+
+; <label>:13                                      ; preds = %3
+  %14 = add i64 %2, 1
+  %15 = trunc i64 %14 to i32
+  %16 = icmp ne i32 %15, 8
+  br i1 %16, label %1, label %17
+
+; <label>:17                                      ; preds = %13
+  ret void
+}
+

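The non-consecutive case above quotes its C source; for symmetry, here is a hedged C sketch of test_consecutive_store (parameter names invented; the IR uses opaque struct types). The loop stores one pointer value into consecutive slots, so the 64-bit pointer width enters the widest-type computation:

    typedef struct node node_t;   /* opaque, stands in for %0 in the IR */

    void test_consecutive_store(node_t **begin, node_t **end, node_t **src) {
      node_t *v = *src;
      while (begin != end)
        *begin++ = v;             /* consecutive 64-bit pointer stores */
    }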
Added: llvm/trunk/test/Transforms/LoopVectorize/X86/vectorization-remarks-loopid-dbg.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/X86/vectorization-remarks-loopid-dbg.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/X86/vectorization-remarks-loopid-dbg.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/X86/vectorization-remarks-loopid-dbg.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,74 @@
+; RUN: opt < %s -loop-vectorize -mtriple=x86_64-unknown-linux -S -pass-remarks='loop-vectorize' 2>&1 | FileCheck -check-prefix=VECTORIZED %s
+; RUN: opt < %s -loop-vectorize -force-vector-width=1 -force-vector-interleave=4 -mtriple=x86_64-unknown-linux -S -pass-remarks='loop-vectorize' 2>&1 | FileCheck -check-prefix=UNROLLED %s
+; RUN: opt < %s -loop-vectorize -force-vector-width=1 -force-vector-interleave=1 -mtriple=x86_64-unknown-linux -S -pass-remarks-analysis='loop-vectorize' 2>&1 | FileCheck -check-prefix=NONE %s
+
+; RUN: llc < %s -mtriple x86_64-pc-linux-gnu -o - | FileCheck -check-prefix=DEBUG-OUTPUT %s
+; DEBUG-OUTPUT-NOT: .loc
+; DEBUG-OUTPUT-NOT: {{.*}}.debug_info
+
+; VECTORIZED: remark: vectorization-remarks.c:17:8: vectorized loop (vectorization width: 4, interleaved count: 1)
+; UNROLLED: remark: vectorization-remarks.c:17:8: interleaved loop (interleaved count: 4)
+; NONE: remark: vectorization-remarks.c:17:8: loop not vectorized: vectorization and interleaving are explicitly disabled, or the loop has already been vectorized
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define i32 @foo(i32 %n) #0 !dbg !4 {
+entry:
+  %diff = alloca i32, align 4
+  %cb = alloca [16 x i8], align 16
+  %cc = alloca [16 x i8], align 16
+  store i32 0, i32* %diff, align 4, !tbaa !11
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %add8 = phi i32 [ 0, %entry ], [ %add, %for.body ]
+  %arrayidx = getelementptr inbounds [16 x i8], [16 x i8]* %cb, i64 0, i64 %indvars.iv
+  %0 = load i8, i8* %arrayidx, align 1, !tbaa !21
+  %conv = sext i8 %0 to i32
+  %arrayidx2 = getelementptr inbounds [16 x i8], [16 x i8]* %cc, i64 0, i64 %indvars.iv
+  %1 = load i8, i8* %arrayidx2, align 1, !tbaa !21
+  %conv3 = sext i8 %1 to i32
+  %sub = sub i32 %conv, %conv3
+  %add = add nsw i32 %sub, %add8
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 16
+  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !25
+
+for.end:                                          ; preds = %for.body
+  store i32 %add, i32* %diff, align 4, !tbaa !11
+  call void @ibar(i32* %diff) #2
+  ret i32 0
+}
+
+declare void @ibar(i32*) #1
+
+!llvm.module.flags = !{!7, !8}
+!llvm.ident = !{!9}
+!llvm.dbg.cu = !{!24}
+
+!1 = !DIFile(filename: "vectorization-remarks.c", directory: ".")
+!2 = !{}
+!3 = !{!4}
+!4 = distinct !DISubprogram(name: "foo", line: 5, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !24, scopeLine: 6, file: !1, scope: !5, type: !6, retainedNodes: !2)
+!5 = !DIFile(filename: "vectorization-remarks.c", directory: ".")
+!6 = !DISubroutineType(types: !2)
+!7 = !{i32 2, !"Dwarf Version", i32 4}
+!8 = !{i32 1, !"Debug Info Version", i32 3}
+!9 = !{!"clang version 3.5.0 "}
+!10 = !DILocation(line: 8, column: 3, scope: !4)
+!11 = !{!12, !12, i64 0}
+!12 = !{!"int", !13, i64 0}
+!13 = !{!"omnipotent char", !14, i64 0}
+!14 = !{!"Simple C/C++ TBAA"}
+!15 = !DILocation(line: 17, column: 8, scope: !16)
+!16 = distinct !DILexicalBlock(line: 17, column: 8, file: !1, scope: !17)
+!17 = distinct !DILexicalBlock(line: 17, column: 8, file: !1, scope: !18)
+!18 = distinct !DILexicalBlock(line: 17, column: 3, file: !1, scope: !4)
+!19 = !DILocation(line: 18, column: 5, scope: !20)
+!20 = distinct !DILexicalBlock(line: 17, column: 27, file: !1, scope: !18)
+!21 = !{!13, !13, i64 0}
+!22 = !DILocation(line: 20, column: 3, scope: !4)
+!23 = !DILocation(line: 21, column: 3, scope: !4)
+!24 = distinct !DICompileUnit(language: DW_LANG_C89, file: !1, emissionKind: NoDebug)
+!25 = !{!25, !15}

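For reference, an approximate C reconstruction of @foo from the IR and the vectorization-remarks.c debug locations (a sketch, not the original file); the loop corresponds to line 17, which all three remark variants cite:

    void ibar(int *);

    int foo(int n) {                     /* n is unused, as in the IR */
      int diff = 0;
      char cb[16], cc[16];               /* uninitialized, as in the IR */
      for (int i = 0; i < 16; ++i)       /* line 17 in the remarks */
        diff += cb[i] - cc[i];
      ibar(&diff);
      return 0;
    }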
Added: llvm/trunk/test/Transforms/LoopVectorize/X86/vectorization-remarks-missed.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/X86/vectorization-remarks-missed.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/X86/vectorization-remarks-missed.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/X86/vectorization-remarks-missed.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,313 @@
+; RUN: opt < %s -loop-vectorize -transform-warning -S -pass-remarks-missed='loop-vectorize' -pass-remarks-analysis='loop-vectorize' 2>&1 | FileCheck %s
+; RUN: opt < %s -loop-vectorize -transform-warning -o /dev/null -pass-remarks-output=%t.yaml
+; RUN: cat %t.yaml | FileCheck -check-prefix=YAML %s
+
+; RUN: opt < %s -passes=loop-vectorize,transform-warning -S -pass-remarks-missed='loop-vectorize' -pass-remarks-analysis='loop-vectorize'  2>&1 | FileCheck %s
+; RUN: opt < %s -passes=loop-vectorize,transform-warning -o /dev/null -pass-remarks-output=%t.yaml
+; RUN: cat %t.yaml | FileCheck -check-prefix=YAML %s
+
+; C/C++ code for tests
+; void test(int *A, int Length) {
+; #pragma clang loop vectorize(enable) interleave(enable)
+;   for (int i = 0; i < Length; i++) {
+;     A[i] = i;
+;     if (A[i] > Length)
+;       break;
+;   }
+; }
+; File, line, and column should match those specified in the metadata
+; CHECK: remark: source.cpp:4:5: loop not vectorized: could not determine number of loop iterations
+; CHECK: remark: source.cpp:4:5: loop not vectorized
+
+; void test_disabled(int *A, int Length) {
+; #pragma clang loop vectorize(disable) interleave(disable)
+;   for (int i = 0; i < Length; i++)
+;     A[i] = i;
+; }
+; CHECK: remark: source.cpp:13:5: loop not vectorized: vectorization and interleaving are explicitly disabled, or the loop has already been vectorized
+
+; void test_array_bounds(int *A, int *B, int Length) {
+; #pragma clang loop vectorize(enable)
+;   for (int i = 0; i < Length; i++)
+;     A[i] = A[B[i]];
+; }
+; CHECK: remark: source.cpp:19:5: loop not vectorized: cannot identify array bounds
+; CHECK: remark: source.cpp:19:5: loop not vectorized
+; CHECK: warning: source.cpp:19:5: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering
+
+; int foo();
+; int test_multiple_failures(int *A) {
+;   int k = 0;
+; #pragma clang loop vectorize(enable) interleave(enable)
+;   for (int i = 0; i < 1000; i+=A[i]) {
+;     if (A[i])
+;       k = foo();
+;   }
+;   return k;
+; }
+; CHECK: remark: source.cpp:29:7: loop not vectorized: control flow cannot be substituted for a select
+; CHECK: remark: source.cpp:27:3: loop not vectorized
+
+; YAML:       --- !Analysis
+; YAML-NEXT: Pass:            loop-vectorize
+; YAML-NEXT: Name:            CantComputeNumberOfIterations
+; YAML-NEXT: DebugLoc:        { File: source.cpp, Line: 4, Column: 5 }
+; YAML-NEXT: Function:        _Z4testPii
+; YAML-NEXT: Args:
+; YAML-NEXT:   - String:          'loop not vectorized: '
+; YAML-NEXT:   - String:          could not determine number of loop iterations
+; YAML-NEXT: ...
+; YAML-NEXT: --- !Missed
+; YAML-NEXT: Pass:            loop-vectorize
+; YAML-NEXT: Name:            MissedDetails
+; YAML-NEXT: DebugLoc:        { File: source.cpp, Line: 4, Column: 5 }
+; YAML-NEXT: Function:        _Z4testPii
+; YAML-NEXT: Args:
+; YAML-NEXT:   - String:          loop not vectorized
+; YAML-NEXT: ...
+; YAML-NEXT: --- !Analysis
+; YAML-NEXT: Pass:            loop-vectorize
+; YAML-NEXT: Name:            AllDisabled
+; YAML-NEXT: DebugLoc:        { File: source.cpp, Line: 13, Column: 5 }
+; YAML-NEXT: Function:        _Z13test_disabledPii
+; YAML-NEXT: Args:
+; YAML-NEXT:   - String:          'loop not vectorized: vectorization and interleaving are explicitly disabled, or the loop has already been vectorized'
+; YAML-NEXT: ...
+; YAML-NEXT: --- !Analysis
+; YAML-NEXT: Pass:            ''
+; YAML-NEXT: Name:            CantIdentifyArrayBounds
+; YAML-NEXT: DebugLoc:        { File: source.cpp, Line: 19, Column: 5 }
+; YAML-NEXT: Function:        _Z17test_array_boundsPiS_i
+; YAML-NEXT: Args:
+; YAML-NEXT:   - String:          'loop not vectorized: '
+; YAML-NEXT:   - String:          cannot identify array bounds
+; YAML-NEXT: ...
+; YAML-NEXT: --- !Missed
+; YAML-NEXT: Pass:            loop-vectorize
+; YAML-NEXT: Name:            MissedDetails
+; YAML-NEXT: DebugLoc:        { File: source.cpp, Line: 19, Column: 5 }
+; YAML-NEXT: Function:        _Z17test_array_boundsPiS_i
+; YAML-NEXT: Args:
+; YAML-NEXT:   - String:          loop not vectorized
+; YAML-NEXT:   - String:          ' (Force='
+; YAML-NEXT:   - Force:           'true'
+; YAML-NEXT:   - String:          ')'
+; YAML-NEXT: ...
+; YAML-NEXT: --- !Failure
+; YAML-NEXT: Pass:            transform-warning
+; YAML-NEXT: Name:            FailedRequestedVectorization
+; YAML-NEXT: DebugLoc:        { File: source.cpp, Line: 19, Column: 5 }
+; YAML-NEXT: Function:        _Z17test_array_boundsPiS_i
+; YAML-NEXT: Args:
+; YAML-NEXT:   - String:          'loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering'
+; YAML-NEXT: ...
+; YAML-NEXT: --- !Analysis
+; YAML-NEXT: Pass:            loop-vectorize
+; YAML-NEXT: Name:            NoCFGForSelect
+; YAML-NEXT: DebugLoc:        { File: source.cpp, Line: 29, Column: 7 }
+; YAML-NEXT: Function:        test_multiple_failures
+; YAML-NEXT: Args:
+; YAML-NEXT:   - String:          'loop not vectorized: '
+; YAML-NEXT:   - String:          control flow cannot be substituted for a select
+; YAML-NEXT: ...
+; YAML-NEXT: --- !Analysis
+; YAML-NEXT: Pass:            loop-vectorize
+; YAML-NEXT: Name:            NonReductionValueUsedOutsideLoop
+; YAML-NEXT: DebugLoc:        { File: source.cpp, Line: 27, Column: 3 }
+; YAML-NEXT: Function:        test_multiple_failures
+; YAML-NEXT: Args:
+; YAML-NEXT:   - String:          'loop not vectorized: '
+; YAML-NEXT:   - String:          value that could not be identified as reduction is used outside the loop
+; YAML-NEXT: ...
+; YAML-NEXT: --- !Analysis
+; YAML-NEXT: Pass:            loop-vectorize
+; YAML-NEXT: Name:            CantComputeNumberOfIterations
+; YAML-NEXT: DebugLoc:        { File: source.cpp, Line: 27, Column: 3 }
+; YAML-NEXT: Function:        test_multiple_failures
+; YAML-NEXT: Args:
+; YAML-NEXT:   - String:          'loop not vectorized: '
+; YAML-NEXT:   - String:          could not determine number of loop iterations
+; YAML-NEXT: ...
+; YAML-NEXT: --- !Missed
+; YAML-NEXT: Pass:            loop-vectorize
+; YAML-NEXT: Name:            MissedDetails
+; YAML-NEXT: DebugLoc:        { File: source.cpp, Line: 27, Column: 3 }
+; YAML-NEXT: Function:        test_multiple_failures
+; YAML-NEXT: Args:
+; YAML-NEXT:   - String:          loop not vectorized
+; YAML-NEXT: ...
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+; Function Attrs: nounwind optsize ssp uwtable
+define void @_Z4testPii(i32* nocapture %A, i32 %Length) #0 !dbg !4 {
+entry:
+  %cmp10 = icmp sgt i32 %Length, 0, !dbg !12
+  br i1 %cmp10, label %for.body, label %for.end, !dbg !12, !llvm.loop !14
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv, !dbg !16
+  %0 = trunc i64 %indvars.iv to i32, !dbg !16
+  %ld = load i32, i32* %arrayidx, align 4
+  store i32 %0, i32* %arrayidx, align 4, !dbg !16, !tbaa !18
+  %cmp3 = icmp sle i32 %ld, %Length, !dbg !22
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1, !dbg !12
+  %1 = trunc i64 %indvars.iv.next to i32
+  %cmp = icmp slt i32 %1, %Length, !dbg !12
+  %or.cond = and i1 %cmp3, %cmp, !dbg !22
+  br i1 %or.cond, label %for.body, label %for.end, !dbg !22
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void, !dbg !24
+}
+
+; CHECK: _Z4testPii
+; CHECK-NOT: x i32>
+; CHECK: ret
+
+; Function Attrs: nounwind optsize ssp uwtable
+define void @_Z13test_disabledPii(i32* nocapture %A, i32 %Length) #0 !dbg !7 {
+entry:
+  %cmp4 = icmp sgt i32 %Length, 0, !dbg !25
+  br i1 %cmp4, label %for.body, label %for.end, !dbg !25, !llvm.loop !27
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv, !dbg !30
+  %0 = trunc i64 %indvars.iv to i32, !dbg !30
+  store i32 %0, i32* %arrayidx, align 4, !dbg !30, !tbaa !18
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1, !dbg !25
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32, !dbg !25
+  %exitcond = icmp eq i32 %lftr.wideiv, %Length, !dbg !25
+  br i1 %exitcond, label %for.end, label %for.body, !dbg !25, !llvm.loop !27
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void, !dbg !31
+}
+
+; CHECK: _Z13test_disabledPii
+; CHECK-NOT: x i32>
+; CHECK: ret
+
+; Function Attrs: nounwind optsize ssp uwtable
+define void @_Z17test_array_boundsPiS_i(i32* nocapture %A, i32* nocapture readonly %B, i32 %Length) #0 !dbg !8 {
+entry:
+  %cmp9 = icmp sgt i32 %Length, 0, !dbg !32
+  br i1 %cmp9, label %for.body.preheader, label %for.end, !dbg !32, !llvm.loop !34
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body, !dbg !35
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i32, i32* %B, i64 %indvars.iv, !dbg !35
+  %0 = load i32, i32* %arrayidx, align 4, !dbg !35, !tbaa !18
+  %idxprom1 = sext i32 %0 to i64, !dbg !35
+  %arrayidx2 = getelementptr inbounds i32, i32* %A, i64 %idxprom1, !dbg !35
+  %1 = load i32, i32* %arrayidx2, align 4, !dbg !35, !tbaa !18
+  %arrayidx4 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv, !dbg !35
+  store i32 %1, i32* %arrayidx4, align 4, !dbg !35, !tbaa !18
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1, !dbg !32
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32, !dbg !32
+  %exitcond = icmp eq i32 %lftr.wideiv, %Length, !dbg !32
+  br i1 %exitcond, label %for.end.loopexit, label %for.body, !dbg !32, !llvm.loop !34
+
+for.end.loopexit:                                 ; preds = %for.body
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %entry
+  ret void, !dbg !36
+}
+
+; CHECK: _Z17test_array_boundsPiS_i
+; CHECK-NOT: x i32>
+; CHECK: ret
+
+; Function Attrs: nounwind uwtable
+define i32 @test_multiple_failures(i32* nocapture readonly %A) #0 !dbg !46 {
+entry:
+  br label %for.body, !dbg !38
+
+for.body:                                         ; preds = %entry, %for.inc
+  %i.09 = phi i32 [ 0, %entry ], [ %add, %for.inc ]
+  %k.09 = phi i32 [ 0, %entry ], [ %k.1, %for.inc ]
+  %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i.09, !dbg !40
+  %0 = load i32, i32* %arrayidx, align 4, !dbg !40
+  %tobool = icmp eq i32 %0, 0, !dbg !40
+  br i1 %tobool, label %for.inc, label %if.then, !dbg !40
+
+if.then:                                          ; preds = %for.body
+  %call = tail call i32 (...) @foo(), !dbg !41
+  %.pre = load i32, i32* %arrayidx, align 4
+  br label %for.inc, !dbg !42
+
+for.inc:                                          ; preds = %for.body, %if.then
+  %1 = phi i32 [ %.pre, %if.then ], [ 0, %for.body ], !dbg !43
+  %k.1 = phi i32 [ %call, %if.then ], [ %k.09, %for.body ]
+  %add = add nsw i32 %1, %i.09, !dbg !44
+  %cmp = icmp slt i32 %add, 1000, !dbg !45
+  br i1 %cmp, label %for.body, label %for.cond.cleanup, !dbg !38
+
+for.cond.cleanup:                                 ; preds = %for.inc
+  ret i32 %k.1, !dbg !39
+}
+
+declare i32 @foo(...)
+
+; CHECK: test_multiple_failure
+; CHECK-NOT: x i32>
+; CHECK: ret
+
+attributes #0 = { nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!9, !10}
+!llvm.ident = !{!11}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, producer: "clang version 3.5.0", isOptimized: true, runtimeVersion: 6, emissionKind: LineTablesOnly, file: !1, enums: !2, retainedTypes: !2, globals: !2, imports: !2)
+!1 = !DIFile(filename: "source.cpp", directory: ".")
+!2 = !{}
+!4 = distinct !DISubprogram(name: "test", line: 1, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 1, file: !1, scope: !5, type: !6, retainedNodes: !2)
+!5 = !DIFile(filename: "source.cpp", directory: ".")
+!6 = !DISubroutineType(types: !2)
+!7 = distinct !DISubprogram(name: "test_disabled", line: 10, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 10, file: !1, scope: !5, type: !6, retainedNodes: !2)
+!8 = distinct !DISubprogram(name: "test_array_bounds", line: 16, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 16, file: !1, scope: !5, type: !6, retainedNodes: !2)
+!9 = !{i32 2, !"Dwarf Version", i32 2}
+!10 = !{i32 2, !"Debug Info Version", i32 3}
+!11 = !{!"clang version 3.5.0"}
+!12 = !DILocation(line: 3, column: 8, scope: !13)
+!13 = distinct !DILexicalBlock(line: 3, column: 3, file: !1, scope: !4)
+!14 = !{!14, !15, !15}
+!15 = !{!"llvm.loop.vectorize.enable", i1 true}
+!16 = !DILocation(line: 4, column: 5, scope: !17)
+!17 = distinct !DILexicalBlock(line: 3, column: 36, file: !1, scope: !13)
+!18 = !{!19, !19, i64 0}
+!19 = !{!"int", !20, i64 0}
+!20 = !{!"omnipotent char", !21, i64 0}
+!21 = !{!"Simple C/C++ TBAA"}
+!22 = !DILocation(line: 5, column: 9, scope: !23)
+!23 = distinct !DILexicalBlock(line: 5, column: 9, file: !1, scope: !17)
+!24 = !DILocation(line: 8, column: 1, scope: !4)
+!25 = !DILocation(line: 12, column: 8, scope: !26)
+!26 = distinct !DILexicalBlock(line: 12, column: 3, file: !1, scope: !7)
+!27 = !{!27, !28, !29}
+!28 = !{!"llvm.loop.interleave.count", i32 1}
+!29 = !{!"llvm.loop.vectorize.width", i32 1}
+!30 = !DILocation(line: 13, column: 5, scope: !26)
+!31 = !DILocation(line: 14, column: 1, scope: !7)
+!32 = !DILocation(line: 18, column: 8, scope: !33)
+!33 = distinct !DILexicalBlock(line: 18, column: 3, file: !1, scope: !8)
+!34 = !{!34, !15}
+!35 = !DILocation(line: 19, column: 5, scope: !33)
+!36 = !DILocation(line: 20, column: 1, scope: !8)
+!37 = distinct !DILexicalBlock(line: 24, column: 3, file: !1, scope: !46)
+!38 = !DILocation(line: 27, column: 3, scope: !37)
+!39 = !DILocation(line: 31, column: 3, scope: !37)
+!40 = !DILocation(line: 28, column: 9, scope: !37)
+!41 = !DILocation(line: 29, column: 11, scope: !37)
+!42 = !DILocation(line: 29, column: 7, scope: !37)
+!43 = !DILocation(line: 27, column: 32, scope: !37)
+!44 = !DILocation(line: 27, column: 30, scope: !37)
+!45 = !DILocation(line: 27, column: 21, scope: !37)
+!46 = distinct !DISubprogram(name: "test_multiple_failures", line: 26, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 26, file: !1, scope: !5, type: !6, retainedNodes: !2)

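One detail worth spelling out from the YAML above: the ' (Force=' / 'true' arguments appear only for test_array_bounds, because only its pragma forces vectorization. A hedged sketch of that connection, annotating the source already quoted in the test comments:

    void test_array_bounds(int *A, int *B, int Length) {
      /* vectorize(enable) lowers to the llvm.loop.vectorize.enable
         metadata (!34 referencing !15 above); when legality then fails,
         the missed remark records Force='true' and the transform-warning
         pass emits the warning checked above. */
    #pragma clang loop vectorize(enable)
      for (int i = 0; i < Length; ++i)
        A[i] = A[B[i]];   /* A[B[i]] defeats array-bounds identification */
    }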
Added: llvm/trunk/test/Transforms/LoopVectorize/X86/vectorization-remarks-profitable.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/X86/vectorization-remarks-profitable.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/X86/vectorization-remarks-profitable.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/X86/vectorization-remarks-profitable.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,112 @@
+; RUN: opt < %s -loop-vectorize -pass-remarks-missed='loop-vectorize' -mtriple=x86_64-unknown-linux -S 2>&1 | FileCheck %s
+
+; Verify analysis remarks are generated when interleaving is not beneficial.
+; CHECK: remark: vectorization-remarks-profitable.c:5:17: the cost-model indicates that vectorization is not beneficial
+; CHECK: remark: vectorization-remarks-profitable.c:5:17: the cost-model indicates that interleaving is not beneficial and is explicitly disabled or interleave count is set to 1
+; CHECK: remark: vectorization-remarks-profitable.c:12:17: the cost-model indicates that vectorization is not beneficial
+; CHECK: remark: vectorization-remarks-profitable.c:12:17: the cost-model indicates that interleaving is not beneficial
+
+; First loop.
+;  #pragma clang loop interleave(disable) unroll(disable)
+;  for(int i = 0; i < n; i++) {
+;    out[i] = *in[i];
+;  }
+
+; Second loop.
+;  #pragma clang loop unroll(disable)
+;  for(int i = 0; i < n; i++) {
+;    out[i] = *in[i];
+;  }
+
+; ModuleID = 'vectorization-remarks-profitable.ll'
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.10.0"
+
+; Function Attrs: nounwind uwtable
+define void @do_not_interleave(float** noalias nocapture readonly %in, float* noalias nocapture %out, i32 %size) #0 !dbg !4 {
+entry:
+  %cmp.4 = icmp eq i32 %size, 0, !dbg !10
+  br i1 %cmp.4, label %for.end, label %for.body.preheader, !dbg !11
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body, !dbg !12
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds float*, float** %in, i64 %indvars.iv, !dbg !12
+  %0 = bitcast float** %arrayidx to i32**, !dbg !12
+  %1 = load i32*, i32** %0, align 8, !dbg !12
+  %2 = load i32, i32* %1, align 4, !dbg !13
+  %arrayidx2 = getelementptr inbounds float, float* %out, i64 %indvars.iv, !dbg !14
+  %3 = bitcast float* %arrayidx2 to i32*, !dbg !15
+  store i32 %2, i32* %3, align 4, !dbg !15
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1, !dbg !11
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32, !dbg !11
+  %exitcond = icmp eq i32 %lftr.wideiv, %size, !dbg !11
+  br i1 %exitcond, label %for.end.loopexit, label %for.body, !dbg !11, !llvm.loop !16
+
+for.end.loopexit:                                 ; preds = %for.body
+  br label %for.end, !dbg !19
+
+for.end:                                          ; preds = %for.end.loopexit, %entry
+  ret void, !dbg !19
+}
+
+; Function Attrs: nounwind uwtable
+define void @interleave_not_profitable(float** noalias nocapture readonly %in, float* noalias nocapture %out, i32 %size) #0 !dbg !6 {
+entry:
+  %cmp.4 = icmp eq i32 %size, 0, !dbg !20
+  br i1 %cmp.4, label %for.end, label %for.body, !dbg !21
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds float*, float** %in, i64 %indvars.iv, !dbg !22
+  %0 = bitcast float** %arrayidx to i32**, !dbg !22
+  %1 = load i32*, i32** %0, align 8, !dbg !22
+  %2 = load i32, i32* %1, align 4, !dbg !23
+  %arrayidx2 = getelementptr inbounds float, float* %out, i64 %indvars.iv, !dbg !24
+  %3 = bitcast float* %arrayidx2 to i32*, !dbg !25
+  store i32 %2, i32* %3, align 4, !dbg !25
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1, !dbg !21
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32, !dbg !21
+  %exitcond = icmp eq i32 %lftr.wideiv, %size, !dbg !21
+  br i1 %exitcond, label %for.end, label %for.body, !dbg !21, !llvm.loop !26
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void, !dbg !27
+}
+
+attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!7, !8}
+!llvm.ident = !{!9}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 3.8.0 (trunk 250016)", isOptimized: false, runtimeVersion: 0, emissionKind: LineTablesOnly, enums: !2)
+!1 = !DIFile(filename: "vectorization-remarks-profitable.c", directory: "")
+!2 = !{}
+!4 = distinct !DISubprogram(name: "do_not_interleave", scope: !1, file: !1, line: 1, type: !5, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: false, unit: !0, retainedNodes: !2)
+!5 = !DISubroutineType(types: !2)
+!6 = distinct !DISubprogram(name: "interleave_not_profitable", scope: !1, file: !1, line: 8, type: !5, isLocal: false, isDefinition: true, scopeLine: 8, flags: DIFlagPrototyped, isOptimized: false, unit: !0, retainedNodes: !2)
+!7 = !{i32 2, !"Dwarf Version", i32 4}
+!8 = !{i32 2, !"Debug Info Version", i32 3}
+!9 = !{!"clang version 3.8.0 (trunk 250016)"}
+!10 = !DILocation(line: 4, column: 23, scope: !4)
+!11 = !DILocation(line: 4, column: 3, scope: !4)
+!12 = !DILocation(line: 5, column: 17, scope: !4)
+!13 = !DILocation(line: 5, column: 16, scope: !4)
+!14 = !DILocation(line: 5, column: 7, scope: !4)
+!15 = !DILocation(line: 5, column: 14, scope: !4)
+!16 = distinct !{!16, !17, !18}
+!17 = !{!"llvm.loop.interleave.count", i32 1}
+!18 = !{!"llvm.loop.unroll.disable"}
+!19 = !DILocation(line: 6, column: 1, scope: !4)
+!20 = !DILocation(line: 11, column: 23, scope: !6)
+!21 = !DILocation(line: 11, column: 3, scope: !6)
+!22 = !DILocation(line: 12, column: 17, scope: !6)
+!23 = !DILocation(line: 12, column: 16, scope: !6)
+!24 = !DILocation(line: 12, column: 7, scope: !6)
+!25 = !DILocation(line: 12, column: 14, scope: !6)
+!26 = distinct !{!26, !18}
+!27 = !DILocation(line: 13, column: 1, scope: !6)
+

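A hedged C sketch assembling the first quoted loop into a full definition (the signature is reconstructed from the IR). The load *in[i] goes through a loaded pointer, a gather-like access the cost model does not consider beneficial to vectorize, and the pragma additionally disables interleaving:

    void do_not_interleave(float **restrict in, float *restrict out,
                           unsigned size) {
    #pragma clang loop interleave(disable) unroll(disable)
      for (unsigned i = 0; i < size; ++i)
        out[i] = *in[i];   /* pointer-chasing load */
    }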
Added: llvm/trunk/test/Transforms/LoopVectorize/X86/vectorization-remarks.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/X86/vectorization-remarks.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/X86/vectorization-remarks.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/X86/vectorization-remarks.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,73 @@
+; RUN: opt < %s -loop-vectorize -mtriple=x86_64-unknown-linux -S -pass-remarks='loop-vectorize' 2>&1 | FileCheck -check-prefix=VECTORIZED %s
+; RUN: opt < %s -loop-vectorize -force-vector-width=1 -force-vector-interleave=4 -mtriple=x86_64-unknown-linux -S -pass-remarks='loop-vectorize' 2>&1 | FileCheck -check-prefix=UNROLLED %s
+; RUN: opt < %s -loop-vectorize -force-vector-width=1 -force-vector-interleave=1 -mtriple=x86_64-unknown-linux -S -pass-remarks-analysis='loop-vectorize' 2>&1 | FileCheck -check-prefix=NONE %s
+
+; RUN: llc < %s -mtriple x86_64-pc-linux-gnu -o - | FileCheck -check-prefix=DEBUG-OUTPUT %s
+; DEBUG-OUTPUT-NOT: .loc
+; DEBUG-OUTPUT-NOT: {{.*}}.debug_info
+
+; VECTORIZED: remark: vectorization-remarks.c:17:8: vectorized loop (vectorization width: 4, interleaved count: 1)
+; UNROLLED: remark: vectorization-remarks.c:17:8: interleaved loop (interleaved count: 4)
+; NONE: remark: vectorization-remarks.c:17:8: loop not vectorized: vectorization and interleaving are explicitly disabled, or the loop has already been vectorized
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define i32 @foo(i32 %n) #0 !dbg !4 {
+entry:
+  %diff = alloca i32, align 4
+  %cb = alloca [16 x i8], align 16
+  %cc = alloca [16 x i8], align 16
+  store i32 0, i32* %diff, align 4, !dbg !10, !tbaa !11
+  br label %for.body, !dbg !15
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %add8 = phi i32 [ 0, %entry ], [ %add, %for.body ], !dbg !19
+  %arrayidx = getelementptr inbounds [16 x i8], [16 x i8]* %cb, i64 0, i64 %indvars.iv, !dbg !19
+  %0 = load i8, i8* %arrayidx, align 1, !dbg !19, !tbaa !21
+  %conv = sext i8 %0 to i32, !dbg !19
+  %arrayidx2 = getelementptr inbounds [16 x i8], [16 x i8]* %cc, i64 0, i64 %indvars.iv, !dbg !19
+  %1 = load i8, i8* %arrayidx2, align 1, !dbg !19, !tbaa !21
+  %conv3 = sext i8 %1 to i32, !dbg !19
+  %sub = sub i32 %conv, %conv3, !dbg !19
+  %add = add nsw i32 %sub, %add8, !dbg !19
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1, !dbg !15
+  %exitcond = icmp eq i64 %indvars.iv.next, 16, !dbg !15
+  br i1 %exitcond, label %for.end, label %for.body, !dbg !15
+
+for.end:                                          ; preds = %for.body
+  store i32 %add, i32* %diff, align 4, !dbg !19, !tbaa !11
+  call void @ibar(i32* %diff) #2, !dbg !22
+  ret i32 0, !dbg !23
+}
+
+declare void @ibar(i32*) #1
+
+!llvm.module.flags = !{!7, !8}
+!llvm.ident = !{!9}
+!llvm.dbg.cu = !{!24}
+
+!1 = !DIFile(filename: "vectorization-remarks.c", directory: ".")
+!2 = !{}
+!3 = !{!4}
+!4 = distinct !DISubprogram(name: "foo", line: 5, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !24, scopeLine: 6, file: !1, scope: !5, type: !6, retainedNodes: !2)
+!5 = !DIFile(filename: "vectorization-remarks.c", directory: ".")
+!6 = !DISubroutineType(types: !2)
+!7 = !{i32 2, !"Dwarf Version", i32 4}
+!8 = !{i32 1, !"Debug Info Version", i32 3}
+!9 = !{!"clang version 3.5.0 "}
+!10 = !DILocation(line: 8, column: 3, scope: !4)
+!11 = !{!12, !12, i64 0}
+!12 = !{!"int", !13, i64 0}
+!13 = !{!"omnipotent char", !14, i64 0}
+!14 = !{!"Simple C/C++ TBAA"}
+!15 = !DILocation(line: 17, column: 8, scope: !16)
+!16 = distinct !DILexicalBlock(line: 17, column: 8, file: !1, scope: !17)
+!17 = distinct !DILexicalBlock(line: 17, column: 8, file: !1, scope: !18)
+!18 = distinct !DILexicalBlock(line: 17, column: 3, file: !1, scope: !4)
+!19 = !DILocation(line: 18, column: 5, scope: !20)
+!20 = distinct !DILexicalBlock(line: 17, column: 27, file: !1, scope: !18)
+!21 = !{!13, !13, i64 0}
+!22 = !DILocation(line: 20, column: 3, scope: !4)
+!23 = !DILocation(line: 21, column: 3, scope: !4)
+!24 = distinct !DICompileUnit(language: DW_LANG_C89, file: !1, emissionKind: NoDebug)

Added: llvm/trunk/test/Transforms/LoopVectorize/X86/vectorize-only-for-real.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/X86/vectorize-only-for-real.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/X86/vectorize-only-for-real.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/X86/vectorize-only-for-real.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,39 @@
+; RUN: opt -S -basicaa -loop-vectorize < %s | FileCheck %s
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.11.0"
+
+define i32 @accum(i32* nocapture readonly %x, i32 %N) #0 {
+entry:
+; CHECK-LABEL: @accum
+; CHECK-NOT: x i32>
+
+  %cmp1 = icmp sgt i32 %N, 0
+  br i1 %cmp1, label %for.inc.preheader, label %for.end
+
+for.inc.preheader:
+  br label %for.inc
+
+for.inc:
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %for.inc.preheader ]
+  %sum.02 = phi i32 [ %add, %for.inc ], [ 0, %for.inc.preheader ]
+  %arrayidx = getelementptr inbounds i32, i32* %x, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %add = add nsw i32 %0, %sum.02
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %N
+  br i1 %exitcond, label %for.end.loopexit, label %for.inc
+
+for.end.loopexit:
+  %add.lcssa = phi i32 [ %add, %for.inc ]
+  br label %for.end
+
+for.end:
+  %sum.0.lcssa = phi i32 [ 0, %entry ], [ %add.lcssa, %for.end.loopexit ]
+  ret i32 %sum.0.lcssa
+
+; CHECK: ret i32
+}
+
+attributes #0 = { "target-cpu"="core2" "target-features"="+sse,-avx,-avx2,-sse2" }
+

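What the attributes line above is doing: core2 normally implies SSE2, but the feature string strips it (-sse2, along with -avx and -avx2), leaving only SSE1, which has no integer vector operations. With no legal <N x i32> add, the reduction must stay scalar, hence the CHECK-NOT on "x i32>". A C sketch of @accum (a reconstruction, not the original source):

    int accum(const int *x, int N) {
      int sum = 0;
      for (int i = 0; i < N; ++i)
        sum += x[i];   /* i32 add reduction: no vector form without SSE2 */
      return sum;
    }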
Added: llvm/trunk/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,826 @@
+; RUN: opt -mcpu=skx -S -loop-vectorize -instcombine -simplifycfg -force-vector-width=8 -force-vector-interleave=1 -enable-interleaved-mem-accesses < %s | FileCheck %s -check-prefix=DISABLED_MASKED_STRIDED 
+; RUN: opt -mcpu=skx -S -loop-vectorize -instcombine -simplifycfg -force-vector-width=8 -force-vector-interleave=1 -enable-interleaved-mem-accesses  -enable-masked-interleaved-mem-accesses < %s | FileCheck %s -check-prefix=ENABLED_MASKED_STRIDED 
+
+target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
+target triple = "i386-unknown-linux-gnu"
+
+; When masked-interleaved-groups are disabled:
+; Check that the predicated load is not vectorized as an
+; interleaved group, but rather as scalarized accesses.
+; (For SKX, gather is not supported by the compiler for chars, so the
+;  only remaining alternative is to scalarize.)
+; In this case a scalar epilogue is not needed.
+;
+; When masked-interleave-group is enabled we expect to find the proper mask
+; shuffling code, feeding the wide masked load for an interleave-group (with
+; a single member).
+; Since the last (second) member of the load-group is a gap, peeling is used,
+; so we also expect to find a scalar epilogue loop.
+;
+; void masked_strided1(const unsigned char* restrict p,
+;                      unsigned char* restrict q,
+;                      unsigned char guard) {
+;   for (int ix = 0; ix < 1024; ++ix) {
+;     if (ix > guard) {
+;         char t = p[2*ix];
+;         q[ix] = t;
+;     }
+;   }
+; }
+
+;DISABLED_MASKED_STRIDED-LABEL: @masked_strided1(
+;DISABLED_MASKED_STRIDED: vector.body:
+;DISABLED_MASKED_STRIDED-NEXT:  %index = phi i32 
+;DISABLED_MASKED_STRIDED-NEXT:  %[[VECIND:.+]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+;DISABLED_MASKED_STRIDED-NOT:   %interleaved.mask =
+;DISABLED_MASKED_STRIDED-NOT:   call void @llvm.masked.load.
+;DISABLED_MASKED_STRIDED-NOT:   %{{.*}} = shufflevector <16 x i8> %[[WIDEVEC]], <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+;DISABLED_MASKED_STRIDED:       %[[VMASK:.+]] = icmp ugt <8 x i32> %[[VECIND]], %{{broadcast.splat*}}
+;DISABLED_MASKED_STRIDED-NEXT:  %{{.*}} = shl nuw nsw <8 x i32> %[[VECIND]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+;DISABLED_MASKED_STRIDED-NEXT:  %[[M:.+]] = extractelement <8 x i1> %[[VMASK]], i32 0
+;DISABLED_MASKED_STRIDED-NEXT:  br i1 %[[M]], label %pred.load.if, label %pred.load.continue
+;DISABLED_MASKED_STRIDED-NOT:   %interleaved.mask =
+;DISABLED_MASKED_STRIDED-NOT:   call void @llvm.masked.load.
+;DISABLED_MASKED_STRIDED-NOT:   %{{.*}} = shufflevector <16 x i8> %{{.*}}, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+;DISABLED_MASKED_STRIDED-NOT: for.body:
+;DISABLED_MASKED_STRIDED:     for.end:
+
+;ENABLED_MASKED_STRIDED-LABEL: @masked_strided1(
+;ENABLED_MASKED_STRIDED: vector.body:
+;ENABLED_MASKED_STRIDED-NEXT:  %index = phi i32 
+;ENABLED_MASKED_STRIDED-NEXT:  %[[VECIND:.+]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+;ENABLED_MASKED_STRIDED:       %[[VMASK:.+]] = icmp ugt <8 x i32> %[[VECIND]], %{{broadcast.splat*}}
+;ENABLED_MASKED_STRIDED:       %interleaved.mask = shufflevector <8 x i1> %[[VMASK]], <8 x i1> undef, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
+;ENABLED_MASKED_STRIDED-NEXT:  %[[WIDEMASKEDLOAD:.+]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %{{.*}}, i32 1, <16 x i1> %interleaved.mask, <16 x i8> undef)
+;ENABLED_MASKED_STRIDED-NEXT:  %[[STRIDEDVEC:.+]] = shufflevector <16 x i8> %[[WIDEMASKEDLOAD]], <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+;ENABLED_MASKED_STRIDED: for.body:
+
+define dso_local void @masked_strided1(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 zeroext %guard) local_unnamed_addr {
+entry:
+  %conv = zext i8 %guard to i32
+  br label %for.body
+
+for.body:
+  %ix.09 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+  %cmp1 = icmp ugt i32 %ix.09, %conv
+  br i1 %cmp1, label %if.then, label %for.inc
+
+if.then:
+  %mul = shl nuw nsw i32 %ix.09, 1
+  %arrayidx = getelementptr inbounds i8, i8* %p, i32 %mul
+  %0 = load i8, i8* %arrayidx, align 1
+  %arrayidx3 = getelementptr inbounds i8, i8* %q, i32 %ix.09
+  store i8 %0, i8* %arrayidx3, align 1
+  br label %for.inc
+
+for.inc:
+  %inc = add nuw nsw i32 %ix.09, 1
+  %exitcond = icmp eq i32 %inc, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+; Exactly the same scenario, except we are now optimizing for size, so we
+; check that no scalar epilogue is created. Since we can't create an
+; epilogue, we need the ability to mask out the gaps.
+; When enable-masked-interleaved-access is enabled, the interleave-groups
+; will be vectorized with masked wide-loads, with the mask properly
+; shuffled and AND-ed with the gaps mask.
+
+;ENABLED_MASKED_STRIDED-LABEL: @masked_strided1_optsize(
+;ENABLED_MASKED_STRIDED-NEXT:  entry:
+;ENABLED_MASKED_STRIDED-NEXT:    [[CONV:%.*]] = zext i8 [[GUARD:%.*]] to i32
+;ENABLED_MASKED_STRIDED-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i32> undef, i32 [[CONV]], i32 0
+;ENABLED_MASKED_STRIDED-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT]], <8 x i32> undef, <8 x i32> zeroinitializer
+;ENABLED_MASKED_STRIDED-NEXT:    br label [[VECTOR_BODY:%.*]]
+;ENABLED_MASKED_STRIDED:       vector.body:
+;ENABLED_MASKED_STRIDED-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+;ENABLED_MASKED_STRIDED-NEXT:    [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+;ENABLED_MASKED_STRIDED-NEXT:    [[TMP0:%.*]] = icmp ugt <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
+;ENABLED_MASKED_STRIDED-NEXT:    [[TMP1:%.*]] = shl nuw nsw i32 [[INDEX]], 1
+;ENABLED_MASKED_STRIDED-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP1]]
+;ENABLED_MASKED_STRIDED-NEXT:    [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <16 x i8>*
+;ENABLED_MASKED_STRIDED-NEXT:    [[INTERLEAVED_MASK:%.*]] = shufflevector <8 x i1> [[TMP0]], <8 x i1> undef, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
+;ENABLED_MASKED_STRIDED-NEXT:    [[TMP4:%.*]] = and <16 x i1> [[INTERLEAVED_MASK]], <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false>
+;ENABLED_MASKED_STRIDED-NEXT:    [[WIDE_MASKED_VEC:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP3]], i32 1, <16 x i1> [[TMP4]], <16 x i8> undef)
+;ENABLED_MASKED_STRIDED-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+;ENABLED_MASKED_STRIDED-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[Q:%.*]], i32 [[INDEX]]
+;ENABLED_MASKED_STRIDED-NEXT:    [[TMP6:%.*]] = bitcast i8* [[TMP5]] to <8 x i8>*
+;ENABLED_MASKED_STRIDED-NEXT:    call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> [[STRIDED_VEC]], <8 x i8>* [[TMP6]], i32 1, <8 x i1> [[TMP0]])
+;ENABLED_MASKED_STRIDED-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 8
+;ENABLED_MASKED_STRIDED-NEXT:    [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
+;ENABLED_MASKED_STRIDED-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024
+;ENABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP7]]
+;ENABLED_MASKED_STRIDED-NOT:   for.body:
+;ENABLED_MASKED_STRIDED:       for.end:
+;ENABLED_MASKED_STRIDED-NEXT:    ret void
+
+
+define dso_local void @masked_strided1_optsize(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 zeroext %guard) local_unnamed_addr optsize {
+entry:
+  %conv = zext i8 %guard to i32
+  br label %for.body
+
+for.body:
+  %ix.09 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+  %cmp1 = icmp ugt i32 %ix.09, %conv
+  br i1 %cmp1, label %if.then, label %for.inc
+
+if.then:
+  %mul = shl nuw nsw i32 %ix.09, 1
+  %arrayidx = getelementptr inbounds i8, i8* %p, i32 %mul
+  %0 = load i8, i8* %arrayidx, align 1
+  %arrayidx3 = getelementptr inbounds i8, i8* %q, i32 %ix.09
+  store i8 %0, i8* %arrayidx3, align 1
+  br label %for.inc
+
+for.inc:
+  %inc = add nuw nsw i32 %ix.09, 1
+  %exitcond = icmp eq i32 %inc, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+
+; Accesses with gaps under the Optsize scenario again, with an unknown
+; trip-count this time, in order to check the behavior of folding-the-tail
+; remainder loop into the main loop using masking) together with interleaved-
+; groups.
+; When masked-interleave-group is disabled the interleave-groups will be
+; invalidated during Legality checks, so there we check for no epilogue
+; and for scalarized conditional accesses.
+; When masked-interleave-group is enabled we check that there is no epilogue,
+; and that the interleave-groups are vectorized using proper masking (with
+; shuffling of the mask feeding the wide masked load/store).
+; The mask itself is an And of two masks: one that masks away the remainder
+; iterations, and one that masks away the 'else' of the 'if' statement.
+; The shuffled mask is also And-ed with the gaps mask.
+;
+; void masked_strided1_optsize_unknown_tc(const unsigned char* restrict p,
+;                      unsigned char* restrict q,
+;                      unsigned char guard,
+;                      int n) {
+;   for(ix=0; ix < n; ++ix) {
+;     if (ix > guard) {
+;         char t = p[2*ix];
+;         q[ix] = t;
+;     }
+;   }
+; }
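+;
+; A hedged scalar sketch of how the three masks compose here (VF=8, stride 2):
+;
+;   for (int i = 0; i < 8; ++i)
+;     lane[i] = (ix[i] > guard)                 // the 'if' mask
+;               && (ix[i] <= n - 1);            // the remainder (tail-fold) mask
+;   for (int j = 0; j < 16; ++j)
+;     wide[j] = lane[j / 2] && (j % 2 == 0);    // shuffled, then And-ed with gaps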
+
+; DISABLED_MASKED_STRIDED-LABEL: @masked_strided1_optsize_unknown_tc(
+; DISABLED_MASKED_STRIDED:       vector.body:
+; DISABLED_MASKED_STRIDED-NEXT:    [[INDEX:%.*]] = phi i32 
+; DISABLED_MASKED_STRIDED-NEXT:    [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP0:%.*]] = icmp ugt <8 x i32> [[VEC_IND]], {{.*}}
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP1:%.*]] = shl nuw nsw <8 x i32> [[VEC_IND]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP2:%.*]] = icmp ule <8 x i32> [[VEC_IND]], {{.*}}
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP3:%.*]] = and <8 x i1> [[TMP0]], [[TMP2]]
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP4:%.*]] = extractelement <8 x i1> [[TMP3]], i32 0
+; DISABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP4]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
+; DISABLED_MASKED_STRIDED:       pred.load.if:
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP5:%.*]] = extractelement <8 x i32> [[TMP1]], i32 0
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP5]]
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP7:%.*]] = load i8, i8* [[TMP6]], align 1
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP8:%.*]] = insertelement <8 x i8> undef, i8 [[TMP7]], i32 0
+; DISABLED_MASKED_STRIDED-NEXT:    br label [[PRED_LOAD_CONTINUE]]
+; DISABLED_MASKED_STRIDED-NOT:   for.body:
+; DISABLED_MASKED_STRIDED:       for.end:
+; DISABLED_MASKED_STRIDED-NEXT:    ret void
+
+
+; ENABLED_MASKED_STRIDED-LABEL: @masked_strided1_optsize_unknown_tc(
+; ENABLED_MASKED_STRIDED-NEXT:  entry:
+; ENABLED_MASKED_STRIDED-NEXT:    [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; ENABLED_MASKED_STRIDED-NEXT:    br i1 [[CMP9]], label [[VECTOR_PH:%.*]], label [[FOR_END:%.*]]
+; ENABLED_MASKED_STRIDED:       vector.ph:
+; ENABLED_MASKED_STRIDED-NEXT:    [[CONV:%.*]] = zext i8 [[GUARD:%.*]] to i32
+; ENABLED_MASKED_STRIDED-NEXT:    [[N_RND_UP:%.*]] = add i32 [[N]], 7
+; ENABLED_MASKED_STRIDED-NEXT:    [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -8
+; ENABLED_MASKED_STRIDED-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = add i32 [[N]], -1
+; ENABLED_MASKED_STRIDED-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i32> undef, i32 [[CONV]], i32 0
+; ENABLED_MASKED_STRIDED-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT]], <8 x i32> undef, <8 x i32> zeroinitializer
+; ENABLED_MASKED_STRIDED-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i32> undef, i32 [[TRIP_COUNT_MINUS_1]], i32 0
+; ENABLED_MASKED_STRIDED-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT1]], <8 x i32> undef, <8 x i32> zeroinitializer
+; ENABLED_MASKED_STRIDED-NEXT:    br label [[VECTOR_BODY:%.*]]
+; ENABLED_MASKED_STRIDED:       vector.body:
+; ENABLED_MASKED_STRIDED-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; ENABLED_MASKED_STRIDED-NEXT:    [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP0:%.*]] = icmp ugt <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP1:%.*]] = shl nuw nsw i32 [[INDEX]], 1
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP1]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP3:%.*]] = icmp ule <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP4:%.*]] = and <8 x i1> [[TMP0]], [[TMP3]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP2]] to <16 x i8>*
+; ENABLED_MASKED_STRIDED-NEXT:    [[INTERLEAVED_MASK:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> undef, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP6:%.*]] = and <16 x i1> [[INTERLEAVED_MASK]], <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false>
+; ENABLED_MASKED_STRIDED-NEXT:    [[WIDE_MASKED_VEC:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP5]], i32 1, <16 x i1> [[TMP6]], <16 x i8> undef)
+; ENABLED_MASKED_STRIDED-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[Q:%.*]], i32 [[INDEX]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP8:%.*]] = bitcast i8* [[TMP7]] to <8 x i8>*
+; ENABLED_MASKED_STRIDED-NEXT:    call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> [[STRIDED_VEC]], <8 x i8>* [[TMP8]], i32 1, <8 x i1> [[TMP4]])
+; ENABLED_MASKED_STRIDED-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 8
+; ENABLED_MASKED_STRIDED-NEXT:    [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; ENABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP9]], label [[FOR_END]], label [[VECTOR_BODY]]
+; ENABLED_MASKED_STRIDED-NOT:   for.body:
+; ENABLED_MASKED_STRIDED:       for.end:
+; ENABLED_MASKED_STRIDED-NEXT:    ret void
+
+define dso_local void @masked_strided1_optsize_unknown_tc(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 zeroext %guard, i32 %n) local_unnamed_addr optsize {
+entry:
+  %cmp9 = icmp sgt i32 %n, 0
+  br i1 %cmp9, label %for.body.lr.ph, label %for.end
+
+for.body.lr.ph:
+  %conv = zext i8 %guard to i32
+  br label %for.body
+
+for.body:
+  %ix.010 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.inc ]
+  %cmp1 = icmp ugt i32 %ix.010, %conv
+  br i1 %cmp1, label %if.then, label %for.inc
+
+if.then:
+  %mul = shl nuw nsw i32 %ix.010, 1
+  %arrayidx = getelementptr inbounds i8, i8* %p, i32 %mul
+  %0 = load i8, i8* %arrayidx, align 1
+  %arrayidx3 = getelementptr inbounds i8, i8* %q, i32 %ix.010
+  store i8 %0, i8* %arrayidx3, align 1
+  br label %for.inc
+
+for.inc:
+  %inc = add nuw nsw i32 %ix.010, 1
+  %exitcond = icmp eq i32 %inc, %n
+  br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:
+  br label %for.end
+
+for.end:
+  ret void
+}
+
+; Same, with stride 3. This is to check the gaps-mask and the shuffled mask
+; with a different stride.
+; So these are again accesses with gaps under the Optsize scenario, with an
+; unknown trip-count, in order to check the behavior of folding-the-tail
+; (folding the remainder loop into the main loop using masking) together
+; with interleaved-groups.
+; When masked-interleave-group is enabled we check that there is no epilogue,
+; and that the interleave-groups are vectorized using proper masking (with
+; shuffling of the mask feeding the wide masked load/store).
+; The mask itself is an And of two masks: one that masks away the remainder
+; iterations, and one that masks away the 'else' of the 'if' statement.
+; The shuffled mask is also And-ed with the gaps mask.
+;
+; void masked_strided3_optsize_unknown_tc(const unsigned char* restrict p,
+;                      unsigned char* restrict q,
+;                      unsigned char guard,
+;                      int n) {
+;   for(ix=0; ix < n; ++ix) {
+;     if (ix > guard) {
+;         char t = p[3*ix];
+;         q[ix] = t;
+;     }
+;   }
+; }
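+;
+; Hedged sketch for stride 3: each of the 8 lane predicates is replicated
+; three times (24 lanes) and only every third byte survives the gaps mask:
+;
+;   for (int j = 0; j < 24; ++j)                // 24 = VF(8) * stride(3)
+;     wide[j] = lane[j / 3] && (j % 3 == 0);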
+
+
+; ENABLED_MASKED_STRIDED-LABEL: @masked_strided3_optsize_unknown_tc(
+; ENABLED_MASKED_STRIDED-NEXT:  entry:
+; ENABLED_MASKED_STRIDED-NEXT:    [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; ENABLED_MASKED_STRIDED-NEXT:    br i1 [[CMP9]], label [[VECTOR_PH:%.*]], label [[FOR_END:%.*]]
+; ENABLED_MASKED_STRIDED:       vector.ph:
+; ENABLED_MASKED_STRIDED-NEXT:    [[CONV:%.*]] = zext i8 [[GUARD:%.*]] to i32
+; ENABLED_MASKED_STRIDED-NEXT:    [[N_RND_UP:%.*]] = add i32 [[N]], 7
+; ENABLED_MASKED_STRIDED-NEXT:    [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -8
+; ENABLED_MASKED_STRIDED-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = add i32 [[N]], -1
+; ENABLED_MASKED_STRIDED-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i32> undef, i32 [[CONV]], i32 0
+; ENABLED_MASKED_STRIDED-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT]], <8 x i32> undef, <8 x i32> zeroinitializer
+; ENABLED_MASKED_STRIDED-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i32> undef, i32 [[TRIP_COUNT_MINUS_1]], i32 0
+; ENABLED_MASKED_STRIDED-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT1]], <8 x i32> undef, <8 x i32> zeroinitializer
+; ENABLED_MASKED_STRIDED-NEXT:    br label [[VECTOR_BODY:%.*]]
+; ENABLED_MASKED_STRIDED:       vector.body:
+; ENABLED_MASKED_STRIDED-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; ENABLED_MASKED_STRIDED-NEXT:    [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP0:%.*]] = icmp ugt <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP1:%.*]] = mul nsw i32 [[INDEX]], 3
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP1]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP3:%.*]] = icmp ule <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP4:%.*]] = and <8 x i1> [[TMP0]], [[TMP3]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP2]] to <24 x i8>*
+; ENABLED_MASKED_STRIDED-NEXT:    [[INTERLEAVED_MASK:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> undef, <24 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7>
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP6:%.*]] = and <24 x i1> [[INTERLEAVED_MASK]], <i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false>
+; ENABLED_MASKED_STRIDED-NEXT:    [[WIDE_MASKED_VEC:%.*]] = call <24 x i8> @llvm.masked.load.v24i8.p0v24i8(<24 x i8>* [[TMP5]], i32 1, <24 x i1> [[TMP6]], <24 x i8> undef)
+; ENABLED_MASKED_STRIDED-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <24 x i8> [[WIDE_MASKED_VEC]], <24 x i8> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[Q:%.*]], i32 [[INDEX]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP8:%.*]] = bitcast i8* [[TMP7]] to <8 x i8>*
+; ENABLED_MASKED_STRIDED-NEXT:    call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> [[STRIDED_VEC]], <8 x i8>* [[TMP8]], i32 1, <8 x i1> [[TMP4]])
+; ENABLED_MASKED_STRIDED-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 8
+; ENABLED_MASKED_STRIDED-NEXT:    [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; ENABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP9]], label [[FOR_END]], label [[VECTOR_BODY]]
+; ENABLED_MASKED_STRIDED:       for.end:
+; ENABLED_MASKED_STRIDED-NEXT:    ret void
+;
+define dso_local void @masked_strided3_optsize_unknown_tc(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 zeroext %guard, i32 %n) local_unnamed_addr optsize {
+entry:
+  %cmp9 = icmp sgt i32 %n, 0
+  br i1 %cmp9, label %for.body.lr.ph, label %for.end
+
+for.body.lr.ph:
+  %conv = zext i8 %guard to i32
+  br label %for.body
+
+for.body:
+  %ix.010 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.inc ]
+  %cmp1 = icmp ugt i32 %ix.010, %conv
+  br i1 %cmp1, label %if.then, label %for.inc
+
+if.then:
+  %mul = mul nsw i32 %ix.010, 3
+  %arrayidx = getelementptr inbounds i8, i8* %p, i32 %mul
+  %0 = load i8, i8* %arrayidx, align 1
+  %arrayidx3 = getelementptr inbounds i8, i8* %q, i32 %ix.010
+  store i8 %0, i8* %arrayidx3, align 1
+  br label %for.inc
+
+for.inc:
+  %inc = add nuw nsw i32 %ix.010, 1
+  %exitcond = icmp eq i32 %inc, %n
+  br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:
+  br label %for.end
+
+for.end:
+  ret void
+}
+
+
+; Back to stride 2 with gaps with a known trip count under opt for size,
+; but this time the load/store accesses are not predicated.
+; When enable-masked-interleaved-access is disabled, the interleave-groups will
+; be invalidated during cost-model checks because we have gaps and we can't
+; create an epilog. The access is thus scalarized.
+; (Before the fix that this test checks, we used to create an epilogue despite
+; optsize, and vectorized the access as an interleaved-group. This is now fixed,
+; and we make sure that a scalar epilogue does not exist).
+; When enable-masked-interleaved-access is enabled, the interleave-groups will
+; be vectorized with masked wide-loads (masking away the gaps).
+;
+; void unconditional_strided1_optsize(const unsigned char* restrict p,
+;                                unsigned char* restrict q,
+;                                unsigned char guard) {
+;   for(ix=0; ix < 1024; ++ix) {
+;         char t = p[2*ix];
+;         q[ix] = t;
+;   }
+; }
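+;
+; Illustrative sketch: with no condition and a trip count (1024) divisible by
+; VF, no per-lane predicate is needed, so the wide load is masked by the
+; constant gaps mask alone:
+;
+;   for (int j = 0; j < 16; ++j)
+;     wide[j] = (j % 2 == 0);                   // <1,0,1,0,...>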
+
+;DISABLED_MASKED_STRIDED-LABEL: @unconditional_strided1_optsize(
+;DISABLED_MASKED_STRIDED: vector.body:
+;DISABLED_MASKED_STRIDED-NOT: call <16 x i8> @llvm.masked.load.v16i8.p0v16i8
+;DISABLED_MASKED_STRIDED:     %{{.*}} = extractelement <8 x i32> %{{.*}}, i32 0       
+;DISABLED_MASKED_STRIDED-NOT: for.body:
+;DISABLED_MASKED_STRIDED:     for.end:
+
+;ENABLED_MASKED_STRIDED-LABEL: @unconditional_strided1_optsize(
+;ENABLED_MASKED_STRIDED-NEXT:  entry:
+;ENABLED_MASKED_STRIDED-NEXT:    br label [[VECTOR_BODY:%.*]]
+;ENABLED_MASKED_STRIDED:       vector.body:
+;ENABLED_MASKED_STRIDED-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+;ENABLED_MASKED_STRIDED-NEXT:    [[TMP0:%.*]] = shl nuw nsw i32 [[INDEX]], 1
+;ENABLED_MASKED_STRIDED-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP0]]
+;ENABLED_MASKED_STRIDED-NEXT:    [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <16 x i8>*
+;ENABLED_MASKED_STRIDED-NEXT:    [[WIDE_MASKED_VEC:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP2]], i32 1, <16 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false>, <16 x i8> undef)
+;ENABLED_MASKED_STRIDED-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+;ENABLED_MASKED_STRIDED-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[Q:%.*]], i32 [[INDEX]]
+;ENABLED_MASKED_STRIDED-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to <8 x i8>*
+;ENABLED_MASKED_STRIDED-NEXT:    store <8 x i8> [[STRIDED_VEC]], <8 x i8>* [[TMP4]], align 1
+;ENABLED_MASKED_STRIDED-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 8
+;ENABLED_MASKED_STRIDED-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024
+;ENABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP5]], label [[FOR_END:%.*]], label [[VECTOR_BODY]]
+;ENABLED_MASKED_STRIDED-NOT:   for.body:
+;ENABLED_MASKED_STRIDED:       for.end:
+;ENABLED_MASKED_STRIDED-NEXT:    ret void
+
+
+define dso_local void @unconditional_strided1_optsize(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 zeroext %guard) local_unnamed_addr optsize {
+entry:
+  br label %for.body
+
+for.body:
+  %ix.06 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %mul = shl nuw nsw i32 %ix.06, 1
+  %arrayidx = getelementptr inbounds i8, i8* %p, i32 %mul
+  %0 = load i8, i8* %arrayidx, align 1
+  %arrayidx1 = getelementptr inbounds i8, i8* %q, i32 %ix.06
+  store i8 %0, i8* %arrayidx1, align 1
+  %inc = add nuw nsw i32 %ix.06, 1
+  %exitcond = icmp eq i32 %inc, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+
+
+; Unconditional accesses with gaps under the Optsize scenario again, with an
+; unknown trip-count this time, in order to check the behavior of
+; folding-the-tail (folding the remainder loop into the main loop using
+; masking) together with interleaved-groups. Folding-the-tail turns the
+; accesses into conditional accesses, which requires proper masking. In
+; addition we need to mask out the gaps (all because we are not allowed to
+; use an epilog due to optsize).
+; When enable-masked-interleaved-access is disabled, the interleave-groups will
+; be invalidated during cost-model checks. So there we check for no epilogue
+; and for scalarized conditional accesses.
+; When masked-interleave-group is enabled we check that there is no epilogue,
+; and that the interleave-groups are vectorized using proper masking (with
+; shuffling of the mask feeding the wide masked load/store).
+; The shuffled mask is also And-ed with the gaps mask.
+;
+;   for(ix=0; ix < n; ++ix) {
+;         char t = p[2*ix];
+;         q[ix] = t;
+;   }
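+;
+; Hedged sketch: here the only per-lane predicate is the tail-fold mask:
+;
+;   for (int i = 0; i < 8; ++i)
+;     lane[i] = (ix[i] <= n - 1);               // remainder mask
+;   for (int j = 0; j < 16; ++j)
+;     wide[j] = lane[j / 2] && (j % 2 == 0);    // shuffled, And-ed with gaps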
+
+; DISABLED_MASKED_STRIDED-LABEL: @unconditional_strided1_optsize_unknown_tc(
+; DISABLED_MASKED_STRIDED:       vector.body:
+; DISABLED_MASKED_STRIDED-NEXT:    [[INDEX:%.*]] = phi i32 
+; DISABLED_MASKED_STRIDED-NEXT:    [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP0:%.*]] = shl nuw nsw <8 x i32> [[VEC_IND]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP1:%.*]] = icmp ule <8 x i32> [[VEC_IND]], {{.*}} 
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP2:%.*]] = extractelement <8 x i1> [[TMP1]], i32 0
+; DISABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP2]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
+; DISABLED_MASKED_STRIDED:       pred.load.if:
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP3:%.*]] = extractelement <8 x i32> [[TMP0]], i32 0
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP3]]
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP5:%.*]] = load i8, i8* [[TMP4]], align 1
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP6:%.*]] = insertelement <8 x i8> undef, i8 [[TMP5]], i32 0
+; DISABLED_MASKED_STRIDED-NEXT:    br label [[PRED_LOAD_CONTINUE]]
+; DISABLED_MASKED_STRIDED-NOT:   for.body:
+; DISABLED_MASKED_STRIDED:       for.end:
+; DISABLED_MASKED_STRIDED-NEXT:    ret void
+
+; ENABLED_MASKED_STRIDED-LABEL: @unconditional_strided1_optsize_unknown_tc(
+; ENABLED_MASKED_STRIDED-NEXT:  entry:
+; ENABLED_MASKED_STRIDED-NEXT:    [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; ENABLED_MASKED_STRIDED-NEXT:    br i1 [[CMP6]], label [[VECTOR_PH:%.*]], label [[FOR_END:%.*]]
+; ENABLED_MASKED_STRIDED:       vector.ph:
+; ENABLED_MASKED_STRIDED-NEXT:    [[N_RND_UP:%.*]] = add i32 [[N]], 7
+; ENABLED_MASKED_STRIDED-NEXT:    [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -8
+; ENABLED_MASKED_STRIDED-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = add i32 [[N]], -1
+; ENABLED_MASKED_STRIDED-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i32> undef, i32 [[TRIP_COUNT_MINUS_1]], i32 0
+; ENABLED_MASKED_STRIDED-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT1]], <8 x i32> undef, <8 x i32> zeroinitializer
+; ENABLED_MASKED_STRIDED-NEXT:    br label [[VECTOR_BODY:%.*]]
+; ENABLED_MASKED_STRIDED:       vector.body:
+; ENABLED_MASKED_STRIDED-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; ENABLED_MASKED_STRIDED-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i32> undef, i32 [[INDEX]], i32 0
+; ENABLED_MASKED_STRIDED-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT]], <8 x i32> undef, <8 x i32> zeroinitializer
+; ENABLED_MASKED_STRIDED-NEXT:    [[INDUCTION:%.*]] = add <8 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP0:%.*]] = shl nuw nsw i32 [[INDEX]], 1
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP0]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP2:%.*]] = icmp ule <8 x i32> [[INDUCTION]], [[BROADCAST_SPLAT2]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP3:%.*]] = bitcast i8* [[TMP1]] to <16 x i8>*
+; ENABLED_MASKED_STRIDED-NEXT:    [[INTERLEAVED_MASK:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> undef, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP4:%.*]] = and <16 x i1> [[INTERLEAVED_MASK]], <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false>
+; ENABLED_MASKED_STRIDED-NEXT:    [[WIDE_MASKED_VEC:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP3]], i32 1, <16 x i1> [[TMP4]], <16 x i8> undef)
+; ENABLED_MASKED_STRIDED-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[Q:%.*]], i32 [[INDEX]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP6:%.*]] = bitcast i8* [[TMP5]] to <8 x i8>*
+; ENABLED_MASKED_STRIDED-NEXT:    call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> [[STRIDED_VEC]], <8 x i8>* [[TMP6]], i32 1, <8 x i1> [[TMP2]])
+; ENABLED_MASKED_STRIDED-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 8
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; ENABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP7]], label [[FOR_END]], label [[VECTOR_BODY]]
+; ENABLED_MASKED_STRIDED-NOT:   for.body:
+; ENABLED_MASKED_STRIDED:       for.end:
+; ENABLED_MASKED_STRIDED-NEXT:    ret void
+
+define dso_local void @unconditional_strided1_optsize_unknown_tc(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i32 %n) local_unnamed_addr optsize {
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body.preheader, label %for.end
+
+for.body.preheader:
+  br label %for.body
+
+for.body:
+  %ix.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+  %mul = shl nuw nsw i32 %ix.07, 1
+  %arrayidx = getelementptr inbounds i8, i8* %p, i32 %mul
+  %0 = load i8, i8* %arrayidx, align 1
+  %arrayidx1 = getelementptr inbounds i8, i8* %q, i32 %ix.07
+  store i8 %0, i8* %arrayidx1, align 1
+  %inc = add nuw nsw i32 %ix.07, 1
+  %exitcond = icmp eq i32 %inc, %n
+  br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:
+  br label %for.end
+
+for.end:
+  ret void
+}
+
+
+; Check also a scenario with full interleave-groups (no gaps) as well as both
+; load and store groups. We check that when masked-interleave-group is disabled
+; the predicated loads (and stores) are not vectorized as an
+; interleaved-group but rather as four separate scalarized accesses.
+; (For SKX, gather/scatter is not supported by the compiler for chars, therefore
+; the only remaining alternative is to scalarize).
+; When masked-interleave-group is enabled we expect to find the proper mask
+; shuffling code, feeding the wide masked load/store for the two interleave-
+; groups.
+;
+; void masked_strided2(const unsigned char* restrict p,
+;                     unsigned char* restrict q,
+;                     unsigned char guard) {
+; for(ix=0; ix < 1024; ++ix) {
+;     if (ix > guard) {
+;         char left = p[2*ix];
+;         char right = p[2*ix + 1];
+;         char max = max(left, right);
+;         q[2*ix] = max;
+;         q[2*ix+1] = 0 - max;
+;     }
+; }
+;}
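+;
+; For the store group the two 8-lane results (max and 0 - max) are interleaved
+; back into one 16-lane vector before the masked store; a hedged sketch:
+;
+;   for (int i = 0; i < 8; ++i) {
+;     interleaved[2 * i]     = max_vec[i];      // q[2*ix]
+;     interleaved[2 * i + 1] = neg_vec[i];      // q[2*ix+1]
+;   }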
+
+;DISABLED_MASKED_STRIDED-LABEL: @masked_strided2(
+;DISABLED_MASKED_STRIDED: vector.body:
+;DISABLED_MASKED_STRIDED-NEXT:  %index = phi i32 
+;DISABLED_MASKED_STRIDED-NEXT:  %[[VECIND:.+]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+;DISABLED_MASKED_STRIDED-NOT:   %interleaved.mask =
+;DISABLED_MASKED_STRIDED-NOT:   call void @llvm.masked.load.
+;DISABLED_MASKED_STRIDED-NOT:   call void @llvm.masked.store.
+;DISABLED_MASKED_STRIDED-NOT:   %{{.*}} = shufflevector <16 x i8> %{{.*}}, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+;DISABLED_MASKED_STRIDED:        %[[VMASK:.+]] = icmp ugt <8 x i32> %[[VECIND]], %{{broadcast.splat*}}
+;DISABLED_MASKED_STRIDED-NEXT:  %{{.*}} = shl nuw nsw <8 x i32> %[[VECIND]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+;DISABLED_MASKED_STRIDED-NEXT:  %[[M:.+]] = extractelement <8 x i1> %[[VMASK]], i32 0
+;DISABLED_MASKED_STRIDED-NEXT:  br i1 %[[M]], label %pred.load.if, label %pred.load.continue
+;DISABLED_MASKED_STRIDED-NOT:   %interleaved.mask =
+;DISABLED_MASKED_STRIDED-NOT:   call void @llvm.masked.load.
+;DISABLED_MASKED_STRIDED-NOT:   call void @llvm.masked.store.
+;DISABLED_MASKED_STRIDED-NOT:   %{{.*}} = shufflevector <16 x i8> %{{.*}}, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+
+;ENABLED_MASKED_STRIDED-LABEL: @masked_strided2(
+;ENABLED_MASKED_STRIDED: vector.body:
+;ENABLED_MASKED_STRIDED-NEXT:  %index = phi i32
+;ENABLED_MASKED_STRIDED-NEXT:  %[[VECIND:.+]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+;ENABLED_MASKED_STRIDED:       %[[VMASK:.+]] = icmp ugt <8 x i32> %[[VECIND]], %{{broadcast.splat*}}
+;ENABLED_MASKED_STRIDED:       %interleaved.mask = shufflevector <8 x i1> %[[VMASK]], <8 x i1> undef, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
+;ENABLED_MASKED_STRIDED-NEXT:  %[[WIDEMASKEDLOAD:.+]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %{{.*}}, i32 1, <16 x i1> %interleaved.mask, <16 x i8> undef)
+;ENABLED_MASKED_STRIDED-NEXT:  %{{.*}} = shufflevector <16 x i8> %[[WIDEMASKEDLOAD]], <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+;ENABLED_MASKED_STRIDED-NEXT:  %{{.*}} = shufflevector <16 x i8> %[[WIDEMASKEDLOAD]], <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+;ENABLED_MASKED_STRIDED:       call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %{{.*}}, <16 x i8>* %{{.*}}, i32 1, <16 x i1> %interleaved.mask)
+
+; Function Attrs: norecurse nounwind
+define dso_local void @masked_strided2(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 zeroext %guard) local_unnamed_addr  {
+entry:
+  %conv = zext i8 %guard to i32
+  br label %for.body
+
+for.body:
+  %ix.024 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+  %cmp1 = icmp ugt i32 %ix.024, %conv
+  br i1 %cmp1, label %if.then, label %for.inc
+
+if.then:
+  %mul = shl nuw nsw i32 %ix.024, 1
+  %arrayidx = getelementptr inbounds i8, i8* %p, i32 %mul
+  %0 = load i8, i8* %arrayidx, align 1
+  %add = or i32 %mul, 1
+  %arrayidx4 = getelementptr inbounds i8, i8* %p, i32 %add
+  %1 = load i8, i8* %arrayidx4, align 1
+  %cmp.i = icmp slt i8 %0, %1
+  %spec.select.i = select i1 %cmp.i, i8 %1, i8 %0
+  %arrayidx6 = getelementptr inbounds i8, i8* %q, i32 %mul
+  store i8 %spec.select.i, i8* %arrayidx6, align 1
+  %sub = sub i8 0, %spec.select.i
+  %arrayidx11 = getelementptr inbounds i8, i8* %q, i32 %add
+  store i8 %sub, i8* %arrayidx11, align 1
+  br label %for.inc
+
+for.inc:
+  %inc = add nuw nsw i32 %ix.024, 1
+  %exitcond = icmp eq i32 %inc, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+; Full groups again, this time under an Optsize scenario with an unknown
+; trip-count, to check the behavior of folding-the-tail (folding the remainder loop
+; into the main loop using masking) together with interleaved-groups.
+; When masked-interleave-group is disabled the interleave-groups will be
+; invalidated during Legality check, so nothing to check here.
+; When masked-interleave-group is enabled we check that there is no epilogue,
+; and that the interleave-groups are vectorized using proper masking (with
+; shuffling of the mask feeding the wide masked load/store).
+; The mask itself is an And of two masks: one that masks away the remainder
+; iterations, and one that masks away the 'else' of the 'if' statement.
+;
+; void masked_strided2_unknown_tc(const unsigned char* restrict p,
+;                     unsigned char* restrict q,
+;                     unsigned char guard,
+;                     int n) {
+; for(ix=0; ix < n; ++ix) {
+;     if (ix > guard) {
+;         char left = p[2*ix];
+;         char right = p[2*ix + 1];
+;         char max = max(left, right);
+;         q[2*ix] = max;
+;         q[2*ix+1] = 0 - max;
+;     }
+; }
+;}
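+;
+; Hedged sketch of the combined mask; no gaps mask is needed since the group
+; is full:
+;
+;   for (int i = 0; i < 8; ++i)
+;     lane[i] = (ix[i] > guard) && (ix[i] <= n - 1);
+;   for (int j = 0; j < 16; ++j)
+;     wide[j] = lane[j / 2];                    // shuffled only, no gaps And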
+
+; ENABLED_MASKED_STRIDED-LABEL: @masked_strided2_unknown_tc(
+; ENABLED_MASKED_STRIDED:       vector.body:
+; ENABLED_MASKED_STRIDED-NEXT:    [[INDEX:%.*]] = phi i32 
+; ENABLED_MASKED_STRIDED-NEXT:    [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP0:%.*]] = icmp sgt <8 x i32> [[VEC_IND]], {{.*}} 
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP1:%.*]] = shl nuw nsw i32 [[INDEX]], 1
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP1]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP3:%.*]] = icmp ule <8 x i32> [[VEC_IND]], {{.*}} 
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP4:%.*]] = and <8 x i1> [[TMP0]], [[TMP3]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP2]] to <16 x i8>*
+; ENABLED_MASKED_STRIDED-NEXT:    [[INTERLEAVED_MASK:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> undef, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
+; ENABLED_MASKED_STRIDED-NEXT:    [[WIDE_MASKED_VEC:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP5]], i32 1, <16 x i1> [[INTERLEAVED_MASK]], <16 x i8> undef)
+; ENABLED_MASKED_STRIDED-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; ENABLED_MASKED_STRIDED-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP6:%.*]] = or i32 [[TMP1]], 1
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP7:%.*]] = icmp slt <8 x i8> [[STRIDED_VEC]], [[STRIDED_VEC3]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i8> [[STRIDED_VEC3]], <8 x i8> [[STRIDED_VEC]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP9:%.*]] = sub <8 x i8> zeroinitializer, [[TMP8]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, i8* [[Q:%.*]], i32 -1
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, i8* [[TMP10]], i32 [[TMP6]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP12:%.*]] = bitcast i8* [[TMP11]] to <16 x i8>*
+; ENABLED_MASKED_STRIDED-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i8> [[TMP8]], <8 x i8> [[TMP9]], <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+; ENABLED_MASKED_STRIDED-NEXT:    call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> [[INTERLEAVED_VEC]], <16 x i8>* [[TMP12]], i32 1, <16 x i1> [[INTERLEAVED_MASK]])
+; ENABLED_MASKED_STRIDED-NEXT:    {{.*}} = add i32 [[INDEX]], 8
+; ENABLED_MASKED_STRIDED-NEXT:    {{.*}} = add <8 x i32> {{.*}}, <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP13:%.*]] = icmp eq i32 {{.*}}, {{.*}} 
+; ENABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP13]], 
+; ENABLED_MASKED_STRIDED:       for.end:
+; ENABLED_MASKED_STRIDED-NEXT:    ret void
+
+define dso_local void @masked_strided2_unknown_tc(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i32 %guard, i32 %n) local_unnamed_addr optsize {
+entry:
+  %cmp22 = icmp sgt i32 %n, 0
+  br i1 %cmp22, label %for.body.preheader, label %for.end
+
+for.body.preheader:
+  br label %for.body
+
+for.body:
+  %ix.023 = phi i32 [ %inc, %for.inc ], [ 0, %for.body.preheader ]
+  %cmp1 = icmp sgt i32 %ix.023, %guard
+  br i1 %cmp1, label %if.then, label %for.inc
+
+if.then:
+  %mul = shl nuw nsw i32 %ix.023, 1
+  %arrayidx = getelementptr inbounds i8, i8* %p, i32 %mul
+  %0 = load i8, i8* %arrayidx, align 1
+  %add = or i32 %mul, 1
+  %arrayidx3 = getelementptr inbounds i8, i8* %p, i32 %add
+  %1 = load i8, i8* %arrayidx3, align 1
+  %cmp.i = icmp slt i8 %0, %1
+  %spec.select.i = select i1 %cmp.i, i8 %1, i8 %0
+  %arrayidx5 = getelementptr inbounds i8, i8* %q, i32 %mul
+  store i8 %spec.select.i, i8* %arrayidx5, align 1
+  %sub = sub i8 0, %spec.select.i
+  %arrayidx9 = getelementptr inbounds i8, i8* %q, i32 %add
+  store i8 %sub, i8* %arrayidx9, align 1
+  br label %for.inc
+
+for.inc:
+  %inc = add nuw nsw i32 %ix.023, 1
+  %exitcond = icmp eq i32 %inc, %n
+  br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:
+  br label %for.end
+
+for.end:
+  ret void
+}
+
+; Full groups under the Optsize scenario again, with an unknown trip-count,
+; in order to check the behavior of folding-the-tail (folding the remainder loop
+; into the main loop using masking) together with interleaved-groups.
+; This time the accesses are not conditional, they become conditional only
+; due to tail folding.
+; When masked-interleave-group is disabled the interleave-groups will be
+; invalidated during cost-model checks, so we check for no epilogue and
+; scalarized conditional accesses.
+; When masked-interleave-group is enabled we check for no epilogue,
+; and interleave-groups vectorized using proper masking (with
+; shuffling of the mask feeding the wide masked load/store).
+; (Same vectorization scheme as for the previous loop with conditional accesses
+; except here the mask only masks away the remainder iterations.)
+;
+; void unconditional_masked_strided2_unknown_tc(const unsigned char* restrict p,
+;                     unsigned char* restrict q,
+;                     int n) {
+; for(ix=0; ix < n; ++ix) {
+;         char left = p[2*ix];
+;         char right = p[2*ix + 1];
+;         char max = max(left, right);
+;         q[2*ix] = max;
+;         q[2*ix+1] = 0 - max;
+; }
+;}
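+;
+; Hedged sketch: the replicated mask here derives from the tail-fold
+; predicate alone:
+;
+;   for (int j = 0; j < 16; ++j)
+;     wide[j] = (ix[j / 2] <= n - 1);           // remainder mask, replicated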
+
+; DISABLED_MASKED_STRIDED-LABEL: @unconditional_masked_strided2_unknown_tc(
+; DISABLED_MASKED_STRIDED:       vector.body:
+; DISABLED_MASKED_STRIDED-NEXT:    [[INDEX:%.*]] = phi i32 
+; DISABLED_MASKED_STRIDED-NEXT:    [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP0:%.*]] = shl nuw nsw <8 x i32> [[VEC_IND]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP1:%.*]] = icmp ule <8 x i32> [[VEC_IND]], {{.*}}
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP2:%.*]] = extractelement <8 x i1> [[TMP1]], i32 0
+; DISABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP2]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
+; DISABLED_MASKED_STRIDED:       pred.load.if:
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP3:%.*]] = extractelement <8 x i32> [[TMP0]], i32 0
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP3]]
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP5:%.*]] = load i8, i8* [[TMP4]], align 1
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP6:%.*]] = insertelement <8 x i8> undef, i8 [[TMP5]], i32 0
+; DISABLED_MASKED_STRIDED-NEXT:    br label [[PRED_LOAD_CONTINUE]]
+; DISABLED_MASKED_STRIDED-NOT:   for.body:
+; DISABLED_MASKED_STRIDED:       for.end:
+; DISABLED_MASKED_STRIDED-NEXT:    ret void
+
+
+
+; ENABLED_MASKED_STRIDED-LABEL: @unconditional_masked_strided2_unknown_tc(
+; ENABLED_MASKED_STRIDED:       vector.body:
+; ENABLED_MASKED_STRIDED-NEXT:    [[INDEX:%.*]] = phi i32 
+; ENABLED_MASKED_STRIDED-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i32> undef, i32 [[INDEX]], i32 0
+; ENABLED_MASKED_STRIDED-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT]], <8 x i32> undef, <8 x i32> zeroinitializer
+; ENABLED_MASKED_STRIDED-NEXT:    [[INDUCTION:%.*]] = add <8 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP0:%.*]] = shl nuw nsw i32 [[INDEX]], 1
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP0]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP2:%.*]] = icmp ule <8 x i32> {{.*}}, {{.*}}
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP3:%.*]] = bitcast i8* [[TMP1]] to <16 x i8>*
+; ENABLED_MASKED_STRIDED-NEXT:    [[INTERLEAVED_MASK:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> undef, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
+; ENABLED_MASKED_STRIDED-NEXT:    [[WIDE_MASKED_VEC:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP3]], i32 1, <16 x i1> [[INTERLEAVED_MASK]], <16 x i8> undef)
+; ENABLED_MASKED_STRIDED-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; ENABLED_MASKED_STRIDED-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP4:%.*]] = or i32 [[TMP0]], 1
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP5:%.*]] = icmp slt <8 x i8> [[STRIDED_VEC]], [[STRIDED_VEC3]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP6:%.*]] = select <8 x i1> [[TMP5]], <8 x i8> [[STRIDED_VEC3]], <8 x i8> [[STRIDED_VEC]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP7:%.*]] = sub <8 x i8> zeroinitializer, [[TMP6]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, i8* [[Q:%.*]], i32 -1
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, i8* [[TMP8]], i32 [[TMP4]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP10:%.*]] = bitcast i8* [[TMP9]] to <16 x i8>*
+; ENABLED_MASKED_STRIDED-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i8> [[TMP6]], <8 x i8> [[TMP7]], <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+; ENABLED_MASKED_STRIDED-NEXT:    call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> [[INTERLEAVED_VEC]], <16 x i8>* [[TMP10]], i32 1, <16 x i1> [[INTERLEAVED_MASK]])
+; ENABLED_MASKED_STRIDED-NEXT:    {{.*}} = add i32 [[INDEX]], 8
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP11:%.*]] = icmp eq i32 {{.*}}, {{.*}}
+; ENABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP11]]
+; ENABLED_MASKED_STRIDED:       for.end:
+; ENABLED_MASKED_STRIDED-NEXT:    ret void
+
+define dso_local void @unconditional_masked_strided2_unknown_tc(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i32 %n) local_unnamed_addr optsize {
+entry:
+  %cmp20 = icmp sgt i32 %n, 0
+  br i1 %cmp20, label %for.body.preheader, label %for.end
+
+for.body.preheader:
+  br label %for.body
+
+for.body:
+  %ix.021 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+  %mul = shl nuw nsw i32 %ix.021, 1
+  %arrayidx = getelementptr inbounds i8, i8* %p, i32 %mul
+  %0 = load i8, i8* %arrayidx, align 1
+  %add = or i32 %mul, 1
+  %arrayidx2 = getelementptr inbounds i8, i8* %p, i32 %add
+  %1 = load i8, i8* %arrayidx2, align 1
+  %cmp.i = icmp slt i8 %0, %1
+  %spec.select.i = select i1 %cmp.i, i8 %1, i8 %0
+  %arrayidx4 = getelementptr inbounds i8, i8* %q, i32 %mul
+  store i8 %spec.select.i, i8* %arrayidx4, align 1
+  %sub = sub i8 0, %spec.select.i
+  %arrayidx8 = getelementptr inbounds i8, i8* %q, i32 %add
+  store i8 %sub, i8* %arrayidx8, align 1
+  %inc = add nuw nsw i32 %ix.021, 1
+  %exitcond = icmp eq i32 %inc, %n
+  br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:
+  br label %for.end
+
+for.end:
+  ret void
+}
+

Added: llvm/trunk/test/Transforms/LoopVectorize/X86/x86-pr39099.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/X86/x86-pr39099.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/X86/x86-pr39099.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/X86/x86-pr39099.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,60 @@
+; RUN: opt -mcpu=skx -S -loop-vectorize -force-vector-width=8 -force-vector-interleave=1 -enable-interleaved-mem-accesses < %s | FileCheck %s 
+
+target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
+
+; This test checks the fix for PR39099.
+;
+; Check that the predicated load is not vectorized as an
+; interleaved-group (which requires proper masking, currently unsupported)
+; but rather as scalarized accesses.
+; (For SKX, gather is not supported by the compiler for chars, therefore
+;  the only remaining alternative is to scalarize).
+;
+; void masked_strided(const unsigned char* restrict p,
+;                     unsigned char* restrict q,
+;                     unsigned char guard) {
+;   for(ix=0; ix < 1024; ++ix) {
+;     if (ix > guard) {
+;         char t = p[2*ix];
+;         q[ix] = t;
+;     }
+;   }
+; }
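+;
+; A hedged sketch of the scalarized form expected instead: each lane's load
+; is guarded by its extracted mask bit (the pred.load.if/continue pattern):
+;
+;   if (mask[0]) tmp[0] = p[2 * ix[0]];         // lane 0
+;   ...                                         // likewise for lanes 1..7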
+
+;CHECK-LABEL: @masked_strided(
+;CHECK: vector.body:
+;CHECK-NEXT:  %index = phi i32 
+;CHECK-NEXT:  %[[VECIND:.+]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+;CHECK-NEXT:  %[[VMASK:.+]] = icmp ugt <8 x i32> %[[VECIND]], %{{broadcast.splat*}}
+;CHECK-NEXT:  %{{.*}} = shl nuw nsw <8 x i32> %[[VECIND]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+;CHECK-NEXT:  %[[M:.+]] = extractelement <8 x i1> %[[VMASK]], i32 0
+;CHECK-NEXT:  br i1 %[[M]], label %pred.load.if, label %pred.load.continue
+;CHECK-NOT:   %[[WIDEVEC:.+]] = load <16 x i8>, <16 x i8>* %{{.*}}, align 1
+;CHECK-NOT:   %{{.*}} = shufflevector <16 x i8> %[[WIDEVEC]], <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+
+define dso_local void @masked_strided(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 zeroext %guard) local_unnamed_addr {
+entry:
+  %conv = zext i8 %guard to i32
+  br label %for.body
+
+for.body:
+  %ix.09 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+  %cmp1 = icmp ugt i32 %ix.09, %conv
+  br i1 %cmp1, label %if.then, label %for.inc
+
+if.then:
+  %mul = shl nuw nsw i32 %ix.09, 1
+  %arrayidx = getelementptr inbounds i8, i8* %p, i32 %mul
+  %0 = load i8, i8* %arrayidx, align 1
+  %arrayidx3 = getelementptr inbounds i8, i8* %q, i32 %ix.09
+  store i8 %0, i8* %arrayidx3, align 1
+  br label %for.inc
+
+for.inc:
+  %inc = add nuw nsw i32 %ix.09, 1
+  %exitcond = icmp eq i32 %inc, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}

Added: llvm/trunk/test/Transforms/LoopVectorize/X86/x86-predication.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/X86/x86-predication.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/X86/x86-predication.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/X86/x86-predication.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,98 @@
+; RUN: opt < %s -mattr=avx -force-vector-width=2 -force-vector-interleave=1 -loop-vectorize -simplifycfg -S | FileCheck %s
+; RUN: opt -mcpu=skylake-avx512 -S -force-vector-width=8 -force-vector-interleave=1 -loop-vectorize < %s | FileCheck %s --check-prefix=SINK-GATHER
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+; CHECK-LABEL: predicated_sdiv_masked_load
+;
+; This test ensures that we don't scalarize the predicated load. Since the load
+; can be vectorized with predication, scalarizing it would cause its pointer
+; operand to become non-uniform.
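+;
+; As a hedged illustration, widening keeps one consecutive vector pointer for
+; all lanes, whereas scalarizing the load would need one pointer per lane:
+;
+;   v = masked_load(&b[i], mask);               // single consecutive base
+;   // scalarized: if (mask[k]) v[k] = b[i + k];   per-lane pointers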
+;
+; CHECK: vector.body:
+; CHECK:   %wide.masked.load = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32
+; CHECK:   br i1 {{.*}}, label %[[IF0:.+]], label %[[CONT0:.+]]
+; CHECK: [[IF0]]:
+; CHECK:   %[[T0:.+]] = extractelement <2 x i32> %wide.masked.load, i32 0
+; CHECK:   %[[T1:.+]] = sdiv i32 %[[T0]], %x
+; CHECK:   %[[T2:.+]] = insertelement <2 x i32> undef, i32 %[[T1]], i32 0
+; CHECK:   br label %[[CONT0]]
+; CHECK: [[CONT0]]:
+; CHECK:   %[[T3:.+]] = phi <2 x i32> [ undef, %vector.body ], [ %[[T2]], %[[IF0]] ]
+; CHECK:   br i1 {{.*}}, label %[[IF1:.+]], label %[[CONT1:.+]]
+; CHECK: [[IF1]]:
+; CHECK:   %[[T4:.+]] = extractelement <2 x i32> %wide.masked.load, i32 1
+; CHECK:   %[[T5:.+]] = sdiv i32 %[[T4]], %x
+; CHECK:   %[[T6:.+]] = insertelement <2 x i32> %[[T3]], i32 %[[T5]], i32 1
+; CHECK:   br label %[[CONT1]]
+; CHECK: [[CONT1]]:
+; CHECK:   phi <2 x i32> [ %[[T3]], %[[CONT0]] ], [ %[[T6]], %[[IF1]] ]
+; CHECK:   br i1 {{.*}}, label %middle.block, label %vector.body
+
+define i32 @predicated_sdiv_masked_load(i32* %a, i32* %b, i32 %x, i1 %c) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.inc ]
+  %r = phi i32 [ 0, %entry ], [ %tmp7, %for.inc ]
+  %tmp0 = getelementptr inbounds i32, i32* %a, i64 %i
+  %tmp1 = load i32, i32* %tmp0, align 4
+  br i1 %c, label %if.then, label %for.inc
+
+if.then:
+  %tmp2 = getelementptr inbounds i32, i32* %b, i64 %i
+  %tmp3 = load i32, i32* %tmp2, align 4
+  %tmp4 = sdiv i32 %tmp3, %x
+  %tmp5 = add nsw i32 %tmp4, %tmp1
+  br label %for.inc
+
+for.inc:
+  %tmp6 = phi i32 [ %tmp1, %for.body ], [ %tmp5, %if.then]
+  %tmp7 = add i32 %r, %tmp6
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp eq i64 %i.next, 10000
+  br i1 %cond, label %for.end, label %for.body
+
+for.end:
+  %tmp8 = phi i32 [ %tmp7, %for.inc ]
+  ret i32 %tmp8
+}
+
+; This test ensures that a load that would otherwise have been widened is
+; instead scalarized if the cost model so decides, as part of its
+; sink-scalar-operands optimization for predicated instructions.
+;
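+; A hedged per-lane sketch of the sunk, scalarized form (the strided address
+; a[777*i] would otherwise require a gather on this target):
+;
+;   if (c) { t = a[777 * i]; t = t / x; }       // see pred.udiv.if below
+;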
+; SINK-GATHER: vector.body:
+; SINK-GATHER: pred.udiv.if:
+; SINK-GATHER:   %[[T0:.+]] = load i32, i32* %{{.*}}, align 4
+; SINK-GATHER:   %{{.*}} = udiv i32 %[[T0]], %{{.*}}
+; SINK-GATHER: pred.udiv.continue:
+define i32 @scalarize_and_sink_gather(i32* %a, i1 %c, i32 %x, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.inc ]
+  %r = phi i32 [ 0, %entry ], [ %tmp6, %for.inc ]
+  %i7 = mul i64 %i, 777
+  br i1 %c, label %if.then, label %for.inc
+
+if.then:
+  %tmp0 = getelementptr inbounds i32, i32* %a, i64 %i7
+  %tmp2 = load i32, i32* %tmp0, align 4
+  %tmp4 = udiv i32 %tmp2, %x
+  br label %for.inc
+
+for.inc:
+  %tmp5 = phi i32 [ %x, %for.body ], [ %tmp4, %if.then]
+  %tmp6 = add i32 %r, %tmp5
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  %tmp7 = phi i32 [ %tmp6, %for.inc ]
+  ret i32 %tmp7
+}

Added: llvm/trunk/test/Transforms/LoopVectorize/X86/x86_fp80-vector-store.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/X86/x86_fp80-vector-store.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/X86/x86_fp80-vector-store.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/X86/x86_fp80-vector-store.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,29 @@
+; RUN: opt -O3 -loop-vectorize -force-vector-interleave=1 -force-vector-width=2 -S < %s | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.7.0"
+
+ at x = common global [1024 x x86_fp80] zeroinitializer, align 16
+
+;CHECK-LABEL: @example(
+;CHECK-NOT: bitcast x86_fp80* {{%[^ ]+}} to <{{[2-9][0-9]*}} x x86_fp80>*
+;CHECK: store
+;CHECK: ret void
+
+define void @example() nounwind ssp uwtable {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %conv = sitofp i32 1 to x86_fp80
+  %arrayidx = getelementptr inbounds [1024 x x86_fp80], [1024 x x86_fp80]* @x, i64 0, i64 %indvars.iv
+  store x86_fp80 %conv, x86_fp80* %arrayidx, align 16
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}

Added: llvm/trunk/test/Transforms/LoopVectorize/XCore/lit.local.cfg
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/XCore/lit.local.cfg?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/XCore/lit.local.cfg (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/XCore/lit.local.cfg Tue Apr 16 21:52:47 2019
@@ -0,0 +1,2 @@
+if 'XCore' not in config.root.targets:
+    config.unsupported = True

Added: llvm/trunk/test/Transforms/LoopVectorize/XCore/no-vector-registers.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/XCore/no-vector-registers.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/XCore/no-vector-registers.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/XCore/no-vector-registers.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,23 @@
+; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-interleave=2 -S -mtriple=xcore | FileCheck %s
+
+target datalayout = "e-p:32:32:32-a0:0:32-n32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f16:16:32-f32:32:32-f64:32:32"
+target triple = "xcore"
+; The xcore target has no vector registers, so the loop should not be vectorized.
+;CHECK-LABEL: @f(
+;CHECK: entry:
+;CHECK-NOT: vector.body
+;CHECK-NEXT: br label %do.body
+define void @f(i8* nocapture %ptr, i32 %len) {
+entry:
+  br label %do.body
+do.body:
+  %ptr.addr.0 = phi i8* [ %ptr, %entry ], [ %incdec.ptr, %do.body ]
+  %len.addr.0 = phi i32 [ %len, %entry ], [ %dec, %do.body ]
+  %incdec.ptr = getelementptr inbounds i8, i8* %ptr.addr.0, i32 1
+  store i8 0, i8* %ptr.addr.0, align 1
+  %dec = add nsw i32 %len.addr.0, -1
+  %tobool = icmp eq i32 %len.addr.0, 0
+  br i1 %tobool, label %do.end, label %do.body
+do.end:
+  ret void
+}

Added: llvm/trunk/test/Transforms/LoopVectorize/align.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/align.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/align.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/align.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,32 @@
+; RUN: opt < %s  -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+; Make sure we output the ABI alignment if no alignment is specified.
+
+;CHECK-LABEL: @align
+;CHECK: load <4 x i32>, <4 x i32>* {{.*}} align  4
+;CHECK: load <4 x i32>, <4 x i32>* {{.*}} align  4
+;CHECK: store <4 x i32> {{.*}} align  4
+
+define void @align(i32* %a, i32* %b, i32* %c) nounwind uwtable ssp {
+  br label %1
+
+; <label>:1                                       ; preds = %1, %0
+  %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ]
+  %2 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv
+  %3 = load i32, i32* %2
+  %4 = getelementptr inbounds i32, i32* %c, i64 %indvars.iv
+  %5 = load i32, i32* %4
+  %6 = add nsw i32 %5, %3
+  %7 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+  store i32 %6, i32* %7
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 128 
+  br i1 %exitcond, label %8, label %1
+
+; <label>:8                                       ; preds = %1
+  ret void
+}
+

Added: llvm/trunk/test/Transforms/LoopVectorize/bsd_regex.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/bsd_regex.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/bsd_regex.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/bsd_regex.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,38 @@
+; RUN: opt -S -loop-vectorize -dce -instcombine -force-vector-width=2 -force-vector-interleave=2 < %s | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+;PR 15830.
+
+;CHECK-LABEL: @foo(
+; When scalarizing stores we need to preserve the original order.
+; Make sure that we are extracting in the correct order (0101, and not 0011).
+;CHECK: extractelement <2 x i64> {{.*}}, i32 0
+;CHECK: extractelement <2 x i64> {{.*}}, i32 1
+;CHECK: extractelement <2 x i64> {{.*}}, i32 0
+;CHECK: extractelement <2 x i64> {{.*}}, i32 1
+;CHECK: store
+;CHECK: store
+;CHECK: store
+;CHECK: store
+;CHECK: ret
+
+define i32 @foo(i32* nocapture %A) {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %0 = shl nsw i64 %indvars.iv, 2
+  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %0
+  store i32 4, i32* %arrayidx, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 10000
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret i32 undef
+}
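+
+; A hypothetical C source for the loop above; the stride-4 store is
+; scalarized, and with VF=2 and IC=2 the extracts must interleave the lanes
+; of each unrolled part (0101) rather than grouping them (0011):
+;
+;   for (long i = 0; i < 10000; i++)
+;     A[i * 4] = 4;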
+
+

Added: llvm/trunk/test/Transforms/LoopVectorize/bzip_reverse_loops.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/bzip_reverse_loops.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/bzip_reverse_loops.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/bzip_reverse_loops.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,65 @@
+; RUN: opt < %s  -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine -S -enable-if-conversion | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+;CHECK: fc
+;CHECK: load <4 x i16>
+;CHECK-NEXT: shufflevector <4 x i16>
+;CHECK: select <4 x i1>
+;CHECK: store <4 x i16>
+;CHECK: ret
+define void @fc(i16* nocapture %p, i32 %n, i32 %size) nounwind uwtable ssp {
+entry:
+  br label %do.body
+
+do.body:                                          ; preds = %cond.end, %entry
+  %n.addr.0 = phi i32 [ %n, %entry ], [ %dec, %cond.end ]
+  %p.addr.0 = phi i16* [ %p, %entry ], [ %incdec.ptr, %cond.end ]
+  %incdec.ptr = getelementptr inbounds i16, i16* %p.addr.0, i64 -1
+  %0 = load i16, i16* %incdec.ptr, align 2
+  %conv = zext i16 %0 to i32
+  %cmp = icmp ult i32 %conv, %size
+  br i1 %cmp, label %cond.end, label %cond.true
+
+cond.true:                                        ; preds = %do.body
+  %sub = sub i32 %conv, %size
+  %phitmp = trunc i32 %sub to i16
+  br label %cond.end
+
+cond.end:                                         ; preds = %do.body, %cond.true
+  %cond = phi i16 [ %phitmp, %cond.true ], [ 0, %do.body ]
+  store i16 %cond, i16* %incdec.ptr, align 2
+  %dec = add i32 %n.addr.0, -1
+  %tobool = icmp eq i32 %dec, 0
+  br i1 %tobool, label %do.end, label %do.body
+
+do.end:                                           ; preds = %cond.end
+  ret void
+}
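+
+; A hypothetical C reconstruction of @fc; example1 below is the same pattern
+; on i32. The buffer is walked backwards and each element is conditionally
+; rebased, which if-conversion turns into the select checked above:
+;
+;   do {
+;     unsigned v = *--p;
+;     *p = (v < size) ? 0 : (unsigned short)(v - size);
+;   } while (--n != 0);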
+
+;CHECK: example1
+;CHECK: load <4 x i32>
+;CHECK-NEXT: shufflevector <4 x i32>
+;CHECK: select <4 x i1>
+;CHECK: store <4 x i32>
+;CHECK: ret
+define void @example1(i32* nocapture %a, i32 %n, i32 %wsize) nounwind uwtable ssp {
+entry:
+  br label %do.body
+
+do.body:                                          ; preds = %do.body, %entry
+  %n.addr.0 = phi i32 [ %n, %entry ], [ %dec, %do.body ]
+  %p.0 = phi i32* [ %a, %entry ], [ %incdec.ptr, %do.body ]
+  %incdec.ptr = getelementptr inbounds i32, i32* %p.0, i64 -1
+  %0 = load i32, i32* %incdec.ptr, align 4
+  %cmp = icmp slt i32 %0, %wsize
+  %sub = sub nsw i32 %0, %wsize
+  %cond = select i1 %cmp, i32 0, i32 %sub
+  store i32 %cond, i32* %incdec.ptr, align 4
+  %dec = add nsw i32 %n.addr.0, -1
+  %tobool = icmp eq i32 %dec, 0
+  br i1 %tobool, label %do.end, label %do.body
+
+do.end:                                           ; preds = %do.body
+  ret void
+}

Added: llvm/trunk/test/Transforms/LoopVectorize/calloc.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/calloc.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/calloc.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/calloc.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,49 @@
+; RUN: opt < %s  -basicaa -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+;CHECK: hexit
+;CHECK: zext <4 x i8>
+;CHECK: ret
+
+define noalias i8* @hexit(i8* nocapture %bytes, i64 %length) nounwind uwtable ssp {
+entry:
+  %shl = shl i64 %length, 1
+  %add28 = or i64 %shl, 1
+  %call = tail call i8* @calloc(i64 1, i64 %add28) nounwind
+  %cmp29 = icmp eq i64 %shl, 0
+  br i1 %cmp29, label %for.end, label %for.body.lr.ph
+
+for.body.lr.ph:                                   ; preds = %entry
+  %0 = shl i64 %length, 1
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %for.body.lr.ph
+  %i.030 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
+  %shr = lshr i64 %i.030, 1
+  %arrayidx = getelementptr inbounds i8, i8* %bytes, i64 %shr
+  %1 = load i8, i8* %arrayidx, align 1
+  %conv = zext i8 %1 to i32
+  %and = shl i64 %i.030, 2
+  %neg = and i64 %and, 4
+  %and3 = xor i64 %neg, 4
+  %sh_prom = trunc i64 %and3 to i32
+  %shl4 = shl i32 15, %sh_prom
+  %and5 = and i32 %conv, %shl4
+  %shr11 = lshr i32 %and5, %sh_prom
+  %conv13 = and i32 %shr11, 254
+  %cmp15 = icmp ugt i32 %conv13, 9
+  %cond = select i1 %cmp15, i32 87, i32 48
+  %add17 = add nsw i32 %cond, %shr11
+  %conv18 = trunc i32 %add17 to i8
+  %arrayidx19 = getelementptr inbounds i8, i8* %call, i64 %i.030
+  store i8 %conv18, i8* %arrayidx19, align 1
+  %inc = add i64 %i.030, 1
+  %exitcond = icmp eq i64 %inc, %0
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret i8* %call
+}
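+
+; A rough C sketch of @hexit (the IR expresses the nibble masking slightly
+; differently): iteration i emits the high or low nibble of bytes[i/2] as a
+; hex digit, depending on the parity of i.
+;
+;   char *out = calloc(1, 2 * length + 1);
+;   for (unsigned long i = 0; i < 2 * length; i++) {
+;     unsigned nib = (bytes[i / 2] >> ((i & 1) ? 0 : 4)) & 0xF;
+;     out[i] = (char)(nib > 9 ? 'a' + (nib - 10) : '0' + nib);
+;   }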
+
+declare noalias i8* @calloc(i64, i64) nounwind

Added: llvm/trunk/test/Transforms/LoopVectorize/cast-induction.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/cast-induction.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/cast-induction.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/cast-induction.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,29 @@
+; RUN: opt < %s  -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
+
+; rdar://problem/12848162
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+ at a = common global [2048 x i32] zeroinitializer, align 16
+
+;CHECK-LABEL: @example12(
+;CHECK: %vec.ind1 = phi <4 x i32>
+;CHECK: store <4 x i32>
+;CHECK: ret void
+define void @example12() nounwind uwtable ssp {
+  br label %1
+
+; <label>:1                                       ; preds = %1, %0
+  %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ]
+  %2 = getelementptr inbounds [2048 x i32], [2048 x i32]* @a, i64 0, i64 %indvars.iv
+  %3 = trunc i64 %indvars.iv to i32
+  store i32 %3, i32* %2, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 1024
+  br i1 %exitcond, label %4, label %1
+
+; <label>:4                                       ; preds = %1
+  ret void
+}
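+
+; A hypothetical C source for example12; the truncating cast of the induction
+; variable is what forces the vector induction (%vec.ind1) checked above:
+;
+;   for (long i = 0; i < 1024; i++)
+;     a[i] = (int)i;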
+

Added: llvm/trunk/test/Transforms/LoopVectorize/conditional-assignment.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/conditional-assignment.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/conditional-assignment.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/conditional-assignment.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,57 @@
+; RUN: opt < %s -enable-cond-stores-vec=false -loop-vectorize -S -pass-remarks-missed='loop-vectorize' -pass-remarks-analysis='loop-vectorize' 2>&1 | FileCheck %s
+; RUN: opt < %s -enable-cond-stores-vec=false -passes=loop-vectorize -S -pass-remarks-missed='loop-vectorize' -pass-remarks-analysis='loop-vectorize' 2>&1 | FileCheck %s
+
+; CHECK: remark: source.c:2:8: the cost-model indicates that vectorization is not beneficial
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+; Function Attrs: nounwind ssp uwtable
+define void @conditional_store(i32* noalias nocapture %indices) #0 !dbg !4 {
+entry:
+  br label %for.body, !dbg !10
+
+for.body:                                         ; preds = %for.inc, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ]
+  %arrayidx = getelementptr inbounds i32, i32* %indices, i64 %indvars.iv, !dbg !12
+  %0 = load i32, i32* %arrayidx, align 4, !dbg !12, !tbaa !14
+  %cmp1 = icmp eq i32 %0, 1024, !dbg !12
+  br i1 %cmp1, label %if.then, label %for.inc, !dbg !12
+
+if.then:                                          ; preds = %for.body
+  store i32 0, i32* %arrayidx, align 4, !dbg !18, !tbaa !14
+  br label %for.inc, !dbg !18
+
+for.inc:                                          ; preds = %for.body, %if.then
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1, !dbg !10
+  %exitcond = icmp eq i64 %indvars.iv.next, 4096, !dbg !10
+  br i1 %exitcond, label %for.end, label %for.body, !dbg !10
+
+for.end:                                          ; preds = %for.inc
+  ret void, !dbg !19
+}
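+
+; A hypothetical reconstruction of the source loop at source.c:2: with
+; conditional-store vectorization disabled, the predicated store must be
+; scalarized, and the cost model then rejects vectorization, producing the
+; remark checked above.
+;
+;   for (long i = 0; i < 4096; i++)
+;     if (indices[i] == 1024)
+;       indices[i] = 0;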
+
+attributes #0 = { nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!7, !8}
+!llvm.ident = !{!9}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "clang version 3.6.0", isOptimized: true, emissionKind: LineTablesOnly, file: !1, enums: !2, retainedTypes: !2, globals: !2, imports: !2)
+!1 = !DIFile(filename: "source.c", directory: ".")
+!2 = !{}
+!4 = distinct !DISubprogram(name: "conditional_store", line: 1, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 1, file: !1, scope: !5, type: !6, retainedNodes: !2)
+!5 = !DIFile(filename: "source.c", directory: ".")
+!6 = !DISubroutineType(types: !2)
+!7 = !{i32 2, !"Dwarf Version", i32 2}
+!8 = !{i32 2, !"Debug Info Version", i32 3}
+!9 = !{!"clang version 3.6.0"}
+!10 = !DILocation(line: 2, column: 8, scope: !11)
+!11 = distinct !DILexicalBlock(line: 2, column: 3, file: !1, scope: !4)
+!12 = !DILocation(line: 3, column: 9, scope: !13)
+!13 = distinct !DILexicalBlock(line: 3, column: 9, file: !1, scope: !11)
+!14 = !{!15, !15, i64 0}
+!15 = !{!"int", !16, i64 0}
+!16 = !{!"omnipotent char", !17, i64 0}
+!17 = !{!"Simple C/C++ TBAA"}
+!18 = !DILocation(line: 3, column: 29, scope: !13)
+!19 = !DILocation(line: 4, column: 1, scope: !4)

Added: llvm/trunk/test/Transforms/LoopVectorize/consec_no_gep.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/consec_no_gep.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/consec_no_gep.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/consec_no_gep.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,42 @@
+;RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -instcombine -S | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+;; Check consecutive memory access without a preceding GEP instruction
+
+;  for (int i=0; i<len; i++) {
+;    *to++ = *from++;
+;  }
+
+; CHECK-LABEL: @consecutive_no_gep(
+; CHECK: vector.body
+; CHECK: %[[index:.*]] = phi i64 [ 0, %vector.ph ]
+; CHECK: getelementptr float, float* %{{.*}}, i64 %[[index]]
+; CHECK: load <4 x float>
+
+define void @consecutive_no_gep(float* noalias nocapture readonly %from, float* noalias nocapture %to, i32 %len) #0 {
+entry:
+  %cmp2 = icmp sgt i32 %len, 0
+  br i1 %cmp2, label %for.body.preheader, label %for.end
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %i.05 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+  %from.addr.04 = phi float* [ %incdec.ptr, %for.body ], [ %from, %for.body.preheader ]
+  %to.addr.03 = phi float* [ %incdec.ptr1, %for.body ], [ %to, %for.body.preheader ]
+  %incdec.ptr = getelementptr inbounds float, float* %from.addr.04, i64 1
+  %val = load float, float* %from.addr.04, align 4
+  %incdec.ptr1 = getelementptr inbounds float, float* %to.addr.03, i64 1
+  store float %val, float* %to.addr.03, align 4
+  %inc = add nsw i32 %i.05, 1
+  %cmp = icmp slt i32 %inc, %len
+  br i1 %cmp, label %for.body, label %for.end.loopexit
+
+for.end.loopexit:                                 ; preds = %for.body
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %entry
+  ret void
+}

Added: llvm/trunk/test/Transforms/LoopVectorize/consecutive-ptr-uniforms.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/consecutive-ptr-uniforms.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/consecutive-ptr-uniforms.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/consecutive-ptr-uniforms.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,490 @@
+; REQUIRES: asserts
+; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -instcombine -debug-only=loop-vectorize -disable-output -print-after=instcombine 2>&1 | FileCheck %s
+; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -enable-interleaved-mem-accesses -instcombine -debug-only=loop-vectorize -disable-output -print-after=instcombine 2>&1 | FileCheck %s --check-prefix=INTER
+
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+
+%pair = type { i32, i32 }
+
+; CHECK-LABEL: consecutive_ptr_forward
+;
+; Check that a forward consecutive pointer is recognized as uniform and remains
+; uniform after vectorization.
+;
+; CHECK:     LV: Found uniform instruction: %tmp1 = getelementptr inbounds i32, i32* %a, i64 %i
+; CHECK:     vector.body
+; CHECK:       %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; CHECK-NOT:   getelementptr
+; CHECK:       getelementptr inbounds i32, i32* %a, i64 %index
+; CHECK-NOT:   getelementptr
+; CHECK:       br i1 {{.*}}, label %middle.block, label %vector.body
+;
+define i32 @consecutive_ptr_forward(i32* %a, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+  %tmp0 = phi i32 [ %tmp3, %for.body ], [ 0, %entry ]
+  %tmp1 = getelementptr inbounds i32, i32* %a, i64 %i
+  %tmp2 = load i32, i32* %tmp1, align 8
+  %tmp3 = add i32 %tmp0, %tmp2
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  %tmp4 = phi i32 [ %tmp3, %for.body ]
+  ret i32 %tmp4
+}
+
+; CHECK-LABEL: consecutive_ptr_reverse
+;
+; Check that a reverse consecutive pointer is recognized as uniform and remains
+; uniform after vectorization.
+;
+; CHECK:     LV: Found uniform instruction: %tmp1 = getelementptr inbounds i32, i32* %a, i64 %i
+; CHECK:     vector.body
+; CHECK:       %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; CHECK:       %offset.idx = sub i64 %n, %index
+; CHECK-NOT:   getelementptr
+; CHECK:       %[[G0:.+]] = getelementptr inbounds i32, i32* %a, i64 -3
+; CHECK:       getelementptr inbounds i32, i32* %[[G0]], i64 %offset.idx
+; CHECK-NOT:   getelementptr
+; CHECK:       br i1 {{.*}}, label %middle.block, label %vector.body
+;
+define i32 @consecutive_ptr_reverse(i32* %a, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ %i.next, %for.body ], [ %n, %entry ]
+  %tmp0 = phi i32 [ %tmp3, %for.body ], [ 0, %entry ]
+  %tmp1 = getelementptr inbounds i32, i32* %a, i64 %i
+  %tmp2 = load i32, i32* %tmp1, align 8
+  %tmp3 = add i32 %tmp0, %tmp2
+  %i.next = add nuw nsw i64 %i, -1
+  %cond = icmp sgt i64 %i.next, 0
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  %tmp4 = phi i32 [ %tmp3, %for.body ]
+  ret i32 %tmp4
+}
+
+; CHECK-LABEL: interleaved_access_forward
+; INTER-LABEL: interleaved_access_forward
+;
+; Check that a consecutive-like pointer used by a forward interleaved group is
+; recognized as uniform and remains uniform after vectorization. When
+; interleaved memory accesses aren't enabled, the pointer should not be
+; recognized as uniform, and it should not be uniform after vectorization.
+;
+; CHECK-NOT: LV: Found uniform instruction: %tmp1 = getelementptr inbounds %pair, %pair* %p, i64 %i, i32 0
+; CHECK-NOT: LV: Found uniform instruction: %tmp2 = getelementptr inbounds %pair, %pair* %p, i64 %i, i32 1
+; CHECK:     vector.body
+; CHECK:       %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; CHECK:       %[[I1:.+]] = or i64 %index, 1
+; CHECK:       %[[I2:.+]] = or i64 %index, 2
+; CHECK:       %[[I3:.+]] = or i64 %index, 3
+; CHECK:       getelementptr inbounds %pair, %pair* %p, i64 %index, i32 0
+; CHECK:       getelementptr inbounds %pair, %pair* %p, i64 %[[I1]], i32 0
+; CHECK:       getelementptr inbounds %pair, %pair* %p, i64 %[[I2]], i32 0
+; CHECK:       getelementptr inbounds %pair, %pair* %p, i64 %[[I3]], i32 0
+; CHECK:       getelementptr inbounds %pair, %pair* %p, i64 %index, i32 1
+; CHECK:       getelementptr inbounds %pair, %pair* %p, i64 %[[I1]], i32 1
+; CHECK:       getelementptr inbounds %pair, %pair* %p, i64 %[[I2]], i32 1
+; CHECK:       getelementptr inbounds %pair, %pair* %p, i64 %[[I3]], i32 1
+; CHECK:       br i1 {{.*}}, label %middle.block, label %vector.body
+;
+; INTER:     LV: Found uniform instruction: %tmp1 = getelementptr inbounds %pair, %pair* %p, i64 %i, i32 0
+; INTER:     LV: Found uniform instruction: %tmp2 = getelementptr inbounds %pair, %pair* %p, i64 %i, i32 1
+; INTER:     vector.body
+; INTER:       %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; INTER-NOT:   getelementptr
+; INTER:       getelementptr inbounds %pair, %pair* %p, i64 %index, i32 0
+; INTER-NOT:   getelementptr
+; INTER:       br i1 {{.*}}, label %middle.block, label %vector.body
+;
+define i32 @interleaved_access_forward(%pair* %p, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+  %tmp0 = phi i32 [ %tmp6, %for.body ], [ 0, %entry ]
+  %tmp1 = getelementptr inbounds %pair, %pair* %p, i64 %i, i32 0
+  %tmp2 = getelementptr inbounds %pair, %pair* %p, i64 %i, i32 1
+  %tmp3 = load i32, i32* %tmp1, align 8
+  %tmp4 = load i32, i32* %tmp2, align 8
+  %tmp5 = add i32 %tmp3, %tmp4
+  %tmp6 = add i32 %tmp0, %tmp5
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  %tmp14 = phi i32 [ %tmp6, %for.body ]
+  ret i32 %tmp14
+}
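+
+; A hypothetical C source for the loop above, assuming struct pair { int x, y; }:
+;
+;   int sum = 0;
+;   for (long i = 0; i < n; i++)
+;     sum += p[i].x + p[i].y;
+;
+; With interleaving enabled, the two field loads become one wide load plus
+; shuffles, so a single uniform base pointer suffices.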
+
+; CHECK-LABEL: interleaved_access_reverse
+; INTER-LABEL: interleaved_access_reverse
+;
+; Check that a consecutive-like pointer used by a reverse interleaved group is
+; recognized as uniform and remains uniform after vectorization. When
+; interleaved memory accesses aren't enabled, the pointer should not be
+; recognized as uniform, and it should not be uniform after vectorization.
+;
+; CHECK-NOT: LV: Found uniform instruction: %tmp1 = getelementptr inbounds %pair, %pair* %p, i64 %i, i32 0
+; CHECK-NOT: LV: Found uniform instruction: %tmp2 = getelementptr inbounds %pair, %pair* %p, i64 %i, i32 1
+; CHECK:     vector.body
+; CHECK:       %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; CHECK:       %offset.idx = sub i64 %n, %index
+; CHECK:       %[[I1:.+]] = add i64 %offset.idx, -1
+; CHECK:       %[[I2:.+]] = add i64 %offset.idx, -2
+; CHECK:       %[[I3:.+]] = add i64 %offset.idx, -3
+; CHECK:       getelementptr inbounds %pair, %pair* %p, i64 %offset.idx, i32 0
+; CHECK:       getelementptr inbounds %pair, %pair* %p, i64 %[[I1]], i32 0
+; CHECK:       getelementptr inbounds %pair, %pair* %p, i64 %[[I2]], i32 0
+; CHECK:       getelementptr inbounds %pair, %pair* %p, i64 %[[I3]], i32 0
+; CHECK:       getelementptr inbounds %pair, %pair* %p, i64 %offset.idx, i32 1
+; CHECK:       getelementptr inbounds %pair, %pair* %p, i64 %[[I1]], i32 1
+; CHECK:       getelementptr inbounds %pair, %pair* %p, i64 %[[I2]], i32 1
+; CHECK:       getelementptr inbounds %pair, %pair* %p, i64 %[[I3]], i32 1
+; CHECK:       br i1 {{.*}}, label %middle.block, label %vector.body
+;
+; INTER:     LV: Found uniform instruction: %tmp1 = getelementptr inbounds %pair, %pair* %p, i64 %i, i32 0
+; INTER:     LV: Found uniform instruction: %tmp2 = getelementptr inbounds %pair, %pair* %p, i64 %i, i32 1
+; INTER:     vector.body
+; INTER:       %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; INTER:       %offset.idx = sub i64 %n, %index
+; INTER-NOT:   getelementptr
+; INTER:       %[[G0:.+]] = getelementptr inbounds %pair, %pair* %p, i64 %offset.idx, i32 0
+; INTER:       getelementptr inbounds i32, i32* %[[G0]], i64 -6
+; INTER-NOT:   getelementptr
+; INTER:       br i1 {{.*}}, label %middle.block, label %vector.body
+;
+define i32 @interleaved_access_reverse(%pair* %p, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ %i.next, %for.body ], [ %n, %entry ]
+  %tmp0 = phi i32 [ %tmp6, %for.body ], [ 0, %entry ]
+  %tmp1 = getelementptr inbounds %pair, %pair* %p, i64 %i, i32 0
+  %tmp2 = getelementptr inbounds %pair, %pair* %p, i64 %i, i32 1
+  %tmp3 = load i32, i32* %tmp1, align 8
+  %tmp4 = load i32, i32* %tmp2, align 8
+  %tmp5 = add i32 %tmp3, %tmp4
+  %tmp6 = add i32 %tmp0, %tmp5
+  %i.next = add nuw nsw i64 %i, -1
+  %cond = icmp sgt i64 %i.next, 0
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  %tmp14 = phi i32 [ %tmp6, %for.body ]
+  ret i32 %tmp14
+}
+
+; INTER-LABEL: predicated_store
+;
+; Check that a consecutive-like pointer used by a forward interleaved group and
+; scalarized store is not recognized as uniform and is not uniform after
+; vectorization. The store is scalarized because it's in a predicated block.
+; Even though the load in this example is vectorized and only uses the pointer
+; as if it were uniform, the store is scalarized, making the pointer
+; non-uniform.
+;
+; INTER-NOT: LV: Found uniform instruction: %tmp0 = getelementptr inbounds %pair, %pair* %p, i64 %i, i32 0
+; INTER:     vector.body
+; INTER:       %index = phi i64 [ 0, %vector.ph ], [ %index.next, {{.*}} ]
+; INTER:       %[[G0:.+]] = getelementptr inbounds %pair, %pair* %p, i64 %index, i32 0
+; INTER:       %[[B0:.+]] = bitcast i32* %[[G0]] to <8 x i32>*
+; INTER:       %wide.vec = load <8 x i32>, <8 x i32>* %[[B0]], align 8
+; INTER:       %[[I1:.+]] = or i64 %index, 1
+; INTER:       getelementptr inbounds %pair, %pair* %p, i64 %[[I1]], i32 0
+; INTER:       %[[I2:.+]] = or i64 %index, 2
+; INTER:       getelementptr inbounds %pair, %pair* %p, i64 %[[I2]], i32 0
+; INTER:       %[[I3:.+]] = or i64 %index, 3
+; INTER:       getelementptr inbounds %pair, %pair* %p, i64 %[[I3]], i32 0
+; INTER:       br i1 {{.*}}, label %middle.block, label %vector.body
+;
+define void @predicated_store(%pair *%p, i32 %x, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i  = phi i64 [ %i.next, %if.merge ], [ 0, %entry ]
+  %tmp0 = getelementptr inbounds %pair, %pair* %p, i64 %i, i32 0
+  %tmp1 = load i32, i32* %tmp0, align 8
+  %tmp2 = icmp eq i32 %tmp1, %x
+  br i1 %tmp2, label %if.then, label %if.merge
+
+if.then:
+  store i32 %tmp1, i32* %tmp0, align 8
+  br label %if.merge
+
+if.merge:
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
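+
+; A hypothetical C equivalent; the store only happens on the predicated path,
+; which is what forces its scalarization:
+;
+;   for (long i = 0; i < n; i++)
+;     if (p[i].x == x)
+;       p[i].x = x;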
+
+; CHECK-LABEL: irregular_type
+;
+; Check that a consecutive pointer used by a scalarized store is not recognized
+; as uniform and is not uniform after vectorization. The store is scalarized
+; because the stored type may require padding.
+;
+; CHECK-NOT: LV: Found uniform instruction: %tmp1 = getelementptr inbounds x86_fp80, x86_fp80* %a, i64 %i
+; CHECK:     vector.body
+; CHECK:       %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; CHECK:       %[[I1:.+]] = or i64 %index, 1
+; CHECK:       %[[I2:.+]] = or i64 %index, 2
+; CHECK:       %[[I3:.+]] = or i64 %index, 3
+; CHECK:       getelementptr inbounds x86_fp80, x86_fp80* %a, i64 %index
+; CHECK:       getelementptr inbounds x86_fp80, x86_fp80* %a, i64 %[[I1]]
+; CHECK:       getelementptr inbounds x86_fp80, x86_fp80* %a, i64 %[[I2]]
+; CHECK:       getelementptr inbounds x86_fp80, x86_fp80* %a, i64 %[[I3]]
+; CHECK:       br i1 {{.*}}, label %middle.block, label %vector.body
+;
+define void @irregular_type(x86_fp80* %a, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %tmp0 = sitofp i32 1 to x86_fp80
+  %tmp1 = getelementptr inbounds x86_fp80, x86_fp80* %a, i64 %i
+  store x86_fp80 %tmp0, x86_fp80* %tmp1, align 16
+  %i.next = add i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
+
+; CHECK-LABEL: pointer_iv_uniform
+;
+; Check that a pointer induction variable is recognized as uniform and remains
+; uniform after vectorization.
+;
+; CHECK:     LV: Found uniform instruction: %p = phi i32* [ %tmp03, %for.body ], [ %a, %entry ]
+; CHECK:     vector.body
+; CHECK:       %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; CHECK-NOT:   getelementptr
+; CHECK:       %next.gep = getelementptr i32, i32* %a, i64 %index
+; CHECK-NOT:   getelementptr
+; CHECK:       br i1 {{.*}}, label %middle.block, label %vector.body
+;
+define void @pointer_iv_uniform(i32* %a, i32 %x, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+  %p = phi i32* [ %tmp03, %for.body ], [ %a, %entry ]
+  store i32 %x, i32* %p, align 8
+  %tmp03 = getelementptr inbounds i32, i32* %p, i32 1
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
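+
+; A hypothetical C source; the pointer induction variable is only used as a
+; uniform store address:
+;
+;   for (long i = 0; i < n; i++)
+;     *p++ = x;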
+
+; INTER-LABEL: pointer_iv_non_uniform_0
+;
+; Check that a pointer induction variable with a non-uniform user is not
+; recognized as uniform and is not uniform after vectorization. The pointer
+; induction variable is used by getelementptr instructions that are non-uniform
+; due to scalarization of the stores.
+;
+; INTER-NOT: LV: Found uniform instruction: %p = phi i32* [ %tmp03, %for.body ], [ %a, %entry ]
+; INTER:     vector.body
+; INTER:       %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; INTER:       %[[I0:.+]] = shl i64 %index, 2
+; INTER:       %next.gep = getelementptr i32, i32* %a, i64 %[[I0]]
+; INTER:       %[[S1:.+]] = shl i64 %index, 2
+; INTER:       %[[I1:.+]] = or i64 %[[S1]], 4
+; INTER:       %next.gep2 = getelementptr i32, i32* %a, i64 %[[I1]]
+; INTER:       %[[S2:.+]] = shl i64 %index, 2
+; INTER:       %[[I2:.+]] = or i64 %[[S2]], 8
+; INTER:       %next.gep3 = getelementptr i32, i32* %a, i64 %[[I2]]
+; INTER:       %[[S3:.+]] = shl i64 %index, 2
+; INTER:       %[[I3:.+]] = or i64 %[[S3]], 12
+; INTER:       %next.gep4 = getelementptr i32, i32* %a, i64 %[[I3]]
+; INTER:       br i1 {{.*}}, label %middle.block, label %vector.body
+;
+define void @pointer_iv_non_uniform_0(i32* %a, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+  %p = phi i32* [ %tmp03, %for.body ], [ %a, %entry ]
+  %tmp00 = load i32, i32* %p, align 8
+  %tmp01 = getelementptr inbounds i32, i32* %p, i32 1
+  %tmp02 = load i32, i32* %tmp01, align 8
+  %tmp03 = getelementptr inbounds i32, i32* %p, i32 4
+  %tmp04 = load i32, i32* %tmp03, align 8
+  %tmp05 = getelementptr inbounds i32, i32* %p, i32 5
+  %tmp06 = load i32, i32* %tmp05, align 8
+  %tmp07 = sub i32 %tmp04, %tmp00
+  %tmp08 = sub i32 %tmp02, %tmp02
+  %tmp09 = getelementptr inbounds i32, i32* %p, i32 2
+  store i32 %tmp07, i32* %tmp09, align 8
+  %tmp10 = getelementptr inbounds i32, i32* %p, i32 3
+  store i32 %tmp08, i32* %tmp10, align 8
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
+
+; CHECK-LABEL: pointer_iv_non_uniform_1
+;
+; Check that a pointer induction variable with a non-uniform user is not
+; recognized as uniform and is not uniform after vectorization. The pointer
+; induction variable is used by a store that will be scalarized.
+;
+; CHECK-NOT: LV: Found uniform instruction: %p = phi x86_fp80* [%tmp1, %for.body], [%a, %entry]
+; CHECK:     vector.body
+; CHECK:       %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; CHECK:       %next.gep = getelementptr x86_fp80, x86_fp80* %a, i64 %index
+; CHECK:       %[[I1:.+]] = or i64 %index, 1
+; CHECK:       %next.gep2 = getelementptr x86_fp80, x86_fp80* %a, i64 %[[I1]]
+; CHECK:       %[[I2:.+]] = or i64 %index, 2
+; CHECK:       %next.gep3 = getelementptr x86_fp80, x86_fp80* %a, i64 %[[I2]]
+; CHECK:       %[[I3:.+]] = or i64 %index, 3
+; CHECK:       %next.gep4 = getelementptr x86_fp80, x86_fp80* %a, i64 %[[I3]]
+; CHECK:       br i1 {{.*}}, label %middle.block, label %vector.body
+;
+define void @pointer_iv_non_uniform_1(x86_fp80* %a, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+  %p = phi x86_fp80* [%tmp1, %for.body], [%a, %entry]
+  %tmp0 = sitofp i32 1 to x86_fp80
+  store x86_fp80 %tmp0, x86_fp80* %p, align 16
+  %tmp1 = getelementptr inbounds x86_fp80, x86_fp80* %p, i32 1
+  %i.next = add i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
+
+; CHECK-LABEL: pointer_iv_mixed
+;
+; Check multiple pointer induction variables where only one is recognized as
+; uniform and remains uniform after vectorization. The other pointer induction
+; variable is not recognized as uniform and is not uniform after vectorization
+; because it is stored to memory.
+;
+; CHECK-NOT: LV: Found uniform instruction: %p = phi i32* [ %tmp3, %for.body ], [ %a, %entry ]
+; CHECK:     LV: Found uniform instruction: %q = phi i32** [ %tmp4, %for.body ], [ %b, %entry ]
+; CHECK:     vector.body
+; CHECK:       %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; CHECK:       %next.gep = getelementptr i32, i32* %a, i64 %index
+; CHECK:       %[[I1:.+]] = or i64 %index, 1
+; CHECK:       %next.gep10 = getelementptr i32, i32* %a, i64 %[[I1]]
+; CHECK:       %[[I2:.+]] = or i64 %index, 2
+; CHECK:       %next.gep11 = getelementptr i32, i32* %a, i64 %[[I2]]
+; CHECK:       %[[I3:.+]] = or i64 %index, 3
+; CHECK:       %next.gep12 = getelementptr i32, i32* %a, i64 %[[I3]]
+; CHECK:       %[[V0:.+]] = insertelement <4 x i32*> undef, i32* %next.gep, i32 0
+; CHECK:       %[[V1:.+]] = insertelement <4 x i32*> %[[V0]], i32* %next.gep10, i32 1
+; CHECK:       %[[V2:.+]] = insertelement <4 x i32*> %[[V1]], i32* %next.gep11, i32 2
+; CHECK:       %[[V3:.+]] = insertelement <4 x i32*> %[[V2]], i32* %next.gep12, i32 3
+; CHECK-NOT:   getelementptr
+; CHECK:       %next.gep13 = getelementptr i32*, i32** %b, i64 %index
+; CHECK-NOT:   getelementptr
+; CHECK:       %[[B0:.+]] = bitcast i32** %next.gep13 to <4 x i32*>*
+; CHECK:       store <4 x i32*> %[[V3]], <4 x i32*>* %[[B0]], align 8
+; CHECK:       br i1 {{.*}}, label %middle.block, label %vector.body
+;
+define i32 @pointer_iv_mixed(i32* %a, i32** %b, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+  %p = phi i32* [ %tmp3, %for.body ], [ %a, %entry ]
+  %q = phi i32** [ %tmp4, %for.body ], [ %b, %entry ]
+  %tmp0 = phi i32 [ %tmp2, %for.body ], [ 0, %entry ]
+  %tmp1 = load i32, i32* %p, align 8
+  %tmp2 = add i32 %tmp1, %tmp0
+  store i32* %p, i32** %q, align 8
+  %tmp3 = getelementptr inbounds i32, i32* %p, i32 1
+  %tmp4 = getelementptr inbounds i32*, i32** %q, i32 1
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  %tmp5 = phi i32 [ %tmp2, %for.body ]
+  ret i32 %tmp5
+}
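+
+; A hypothetical C equivalent; storing the pointer %p itself is what forces a
+; per-lane vector of pointers, while %q is only used as an address and stays
+; uniform:
+;
+;   int s = 0;
+;   for (long i = 0; i < n; i++) {
+;     s += *p;
+;     *q++ = p++;
+;   }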
+
+; INTER-LABEL: bitcast_pointer_operand
+;
+; Check that a pointer operand having a user other than a memory access is
+; recognized as uniform after vectorization. In this test case, %tmp1 is a
+; bitcast that is used by a load and a getelementptr instruction (%tmp2). Once
+; %tmp2 is marked uniform, %tmp1 should be marked uniform as well.
+;
+; INTER:       LV: Found uniform instruction: %cond = icmp slt i64 %i.next, %n
+; INTER-NEXT:  LV: Found uniform instruction: %tmp2 = getelementptr inbounds i8, i8* %tmp1, i64 3
+; INTER-NEXT:  LV: Found uniform instruction: %tmp6 = getelementptr inbounds i8, i8* %B, i64 %i
+; INTER-NEXT:  LV: Found uniform instruction: %tmp1 = bitcast i64* %tmp0 to i8*
+; INTER-NEXT:  LV: Found uniform instruction: %tmp0 = getelementptr inbounds i64, i64* %A, i64 %i
+; INTER-NEXT:  LV: Found uniform instruction: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+; INTER-NEXT:  LV: Found uniform instruction: %i.next = add nuw nsw i64 %i, 1
+; INTER:       vector.body:
+; INTER-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
+; INTER-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, i64* %A, i64 [[INDEX]]
+; INTER-NEXT:    [[TMP5:%.*]] = bitcast i64* [[TMP4]] to <32 x i8>*
+; INTER-NEXT:    [[WIDE_VEC:%.*]] = load <32 x i8>, <32 x i8>* [[TMP5]], align 1
+; INTER-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <32 x i8> [[WIDE_VEC]], <32 x i8> undef, <4 x i32> <i32 0, i32 8, i32 16, i32 24>
+; INTER-NEXT:    [[STRIDED_VEC5:%.*]] = shufflevector <32 x i8> [[WIDE_VEC]], <32 x i8> undef, <4 x i32> <i32 3, i32 11, i32 19, i32 27>
+; INTER-NEXT:    [[TMP6:%.*]] = xor <4 x i8> [[STRIDED_VEC5]], [[STRIDED_VEC]]
+; INTER-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, i8* %B, i64 [[INDEX]]
+; INTER-NEXT:    [[TMP8:%.*]] = bitcast i8* [[TMP7]] to <4 x i8>*
+; INTER-NEXT:    store <4 x i8> [[TMP6]], <4 x i8>* [[TMP8]], align 1
+; INTER-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
+; INTER:         br i1 {{.*}}, label %middle.block, label %vector.body
+;
+define void @bitcast_pointer_operand(i64* %A, i8* %B, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %tmp0 = getelementptr inbounds i64, i64* %A, i64 %i
+  %tmp1 = bitcast i64* %tmp0 to i8*
+  %tmp2 = getelementptr inbounds i8, i8* %tmp1, i64 3
+  %tmp3 = load i8, i8* %tmp2, align 1
+  %tmp4 = load i8, i8* %tmp1, align 1
+  %tmp5 = xor i8 %tmp3, %tmp4
+  %tmp6 = getelementptr inbounds i8, i8* %B, i64 %i
+  store i8 %tmp5, i8* %tmp6
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
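+
+; In C terms, each iteration xors byte 3 of A[i] against byte 0 (a
+; hypothetical reconstruction):
+;
+;   for (long i = 0; i < n; i++)
+;     B[i] = ((unsigned char *)&A[i])[3] ^ ((unsigned char *)&A[i])[0];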

Added: llvm/trunk/test/Transforms/LoopVectorize/control-flow.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/control-flow.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/control-flow.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/control-flow.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,77 @@
+; RUN: opt < %s -loop-vectorize -force-vector-width=4 -S -pass-remarks-missed='loop-vectorize' 2>&1 | FileCheck %s
+
+; C/C++ code for control flow test
+; int test(int *A, int Length) {
+;   for (int i = 0; i < Length; i++) {
+;     if (A[i] > 10.0) goto end;
+;     A[i] = 0;
+;   }
+; end:
+;   return 0;
+; }
+
+; CHECK: remark: source.cpp:5:9: loop not vectorized: loop control flow is not understood by vectorizer
+; CHECK: remark: source.cpp:5:9: loop not vectorized
+
+; CHECK: _Z4testPii
+; CHECK-NOT: x i32>
+; CHECK: ret
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+; Function Attrs: nounwind optsize ssp uwtable
+define i32 @_Z4testPii(i32* nocapture %A, i32 %Length) #0 !dbg !4 {
+entry:
+  %cmp8 = icmp sgt i32 %Length, 0, !dbg !10
+  br i1 %cmp8, label %for.body.preheader, label %end, !dbg !10
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body, !dbg !12
+
+for.body:                                         ; preds = %for.body.preheader, %if.else
+  %indvars.iv = phi i64 [ %indvars.iv.next, %if.else ], [ 0, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv, !dbg !12
+  %0 = load i32, i32* %arrayidx, align 4, !dbg !12, !tbaa !15
+  %cmp1 = icmp sgt i32 %0, 10, !dbg !12
+  br i1 %cmp1, label %end.loopexit, label %if.else, !dbg !12
+
+if.else:                                          ; preds = %for.body
+  store i32 0, i32* %arrayidx, align 4, !dbg !19, !tbaa !15
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1, !dbg !10
+  %1 = trunc i64 %indvars.iv.next to i32, !dbg !10
+  %cmp = icmp slt i32 %1, %Length, !dbg !10
+  br i1 %cmp, label %for.body, label %end.loopexit, !dbg !10
+
+end.loopexit:                                     ; preds = %if.else, %for.body
+  br label %end
+
+end:                                              ; preds = %end.loopexit, %entry
+  ret i32 0, !dbg !20
+}
+
+attributes #0 = { nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!7, !8}
+!llvm.ident = !{!9}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, producer: "clang version 3.5.0", isOptimized: true, runtimeVersion: 6, emissionKind: LineTablesOnly, file: !1, enums: !2, retainedTypes: !2, globals: !2, imports: !2)
+!1 = !DIFile(filename: "source.cpp", directory: ".")
+!2 = !{}
+!4 = distinct !DISubprogram(name: "test", line: 1, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 2, file: !1, scope: !5, type: !6, retainedNodes: !2)
+!5 = !DIFile(filename: "source.cpp", directory: ".")
+!6 = !DISubroutineType(types: !2)
+!7 = !{i32 2, !"Dwarf Version", i32 2}
+!8 = !{i32 2, !"Debug Info Version", i32 3}
+!9 = !{!"clang version 3.5.0"}
+!10 = !DILocation(line: 3, column: 8, scope: !11)
+!11 = distinct !DILexicalBlock(line: 3, column: 3, file: !1, scope: !4)
+!12 = !DILocation(line: 5, column: 9, scope: !13)
+!13 = distinct !DILexicalBlock(line: 5, column: 9, file: !1, scope: !14)
+!14 = distinct !DILexicalBlock(line: 4, column: 3, file: !1, scope: !11)
+!15 = !{!16, !16, i64 0}
+!16 = !{!"int", !17, i64 0}
+!17 = !{!"omnipotent char", !18, i64 0}
+!18 = !{!"Simple C/C++ TBAA"}
+!19 = !DILocation(line: 8, column: 7, scope: !13)
+!20 = !DILocation(line: 12, column: 3, scope: !4)

Added: llvm/trunk/test/Transforms/LoopVectorize/cpp-new-array.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/cpp-new-array.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/cpp-new-array.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/cpp-new-array.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,45 @@
+; RUN: opt < %s  -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+;CHECK-LABEL: @cpp_new_arrays(
+;CHECK: sext i32
+;CHECK: load <4 x float>
+;CHECK: fadd <4 x float>
+;CHECK: ret i32
+define i32 @cpp_new_arrays() uwtable ssp {
+entry:
+  %call = call noalias i8* @_Znwm(i64 4)
+  %0 = bitcast i8* %call to float*
+  store float 1.000000e+03, float* %0, align 4
+  %call1 = call noalias i8* @_Znwm(i64 4)
+  %1 = bitcast i8* %call1 to float*
+  store float 1.000000e+03, float* %1, align 4
+  %call3 = call noalias i8* @_Znwm(i64 4)
+  %2 = bitcast i8* %call3 to float*
+  store float 1.000000e+03, float* %2, align 4
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.01 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %idxprom = sext i32 %i.01 to i64
+  %arrayidx = getelementptr inbounds float, float* %0, i64 %idxprom
+  %3 = load float, float* %arrayidx, align 4
+  %idxprom5 = sext i32 %i.01 to i64
+  %arrayidx6 = getelementptr inbounds float, float* %1, i64 %idxprom5
+  %4 = load float, float* %arrayidx6, align 4
+  %add = fadd float %3, %4
+  %idxprom7 = sext i32 %i.01 to i64
+  %arrayidx8 = getelementptr inbounds float, float* %2, i64 %idxprom7
+  store float %add, float* %arrayidx8, align 4
+  %inc = add nsw i32 %i.01, 1
+  %cmp = icmp slt i32 %inc, 1000
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  %5 = load float, float* %2, align 4
+  %conv10 = fptosi float %5 to i32
+  ret i32 %conv10
+}
+
+declare noalias i8* @_Znwm(i64)

Added: llvm/trunk/test/Transforms/LoopVectorize/dbg.value.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/dbg.value.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/dbg.value.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/dbg.value.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,77 @@
+; RUN: opt < %s -S -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine | FileCheck %s
+; Make sure we vectorize with debugging turned on.
+
+source_filename = "test/Transforms/LoopVectorize/dbg.value.ll"
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+ at A = global [1024 x i32] zeroinitializer, align 16, !dbg !0
+ at B = global [1024 x i32] zeroinitializer, align 16, !dbg !7
+ at C = global [1024 x i32] zeroinitializer, align 16, !dbg !9
+; CHECK-LABEL: @test(
+
+; Function Attrs: nounwind ssp uwtable
+define i32 @test() #0 !dbg !15 {
+entry:
+  tail call void @llvm.dbg.value(metadata i32 0, metadata !19, metadata !21), !dbg !22
+  br label %for.body, !dbg !22
+
+for.body:                                         ; preds = %for.body, %entry
+  ;CHECK: load <4 x i32>
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds [1024 x i32], [1024 x i32]* @B, i64 0, i64 %indvars.iv, !dbg !23
+  %0 = load i32, i32* %arrayidx, align 4, !dbg !23
+  %arrayidx2 = getelementptr inbounds [1024 x i32], [1024 x i32]* @C, i64 0, i64 %indvars.iv, !dbg !23
+  %1 = load i32, i32* %arrayidx2, align 4, !dbg !23
+  %add = add nsw i32 %1, %0, !dbg !23
+  %arrayidx4 = getelementptr inbounds [1024 x i32], [1024 x i32]* @A, i64 0, i64 %indvars.iv, !dbg !23
+  store i32 %add, i32* %arrayidx4, align 4, !dbg !23
+  %indvars.iv.next = add i64 %indvars.iv, 1, !dbg !22
+  tail call void @llvm.dbg.value(metadata !12, metadata !19, metadata !21), !dbg !22
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32, !dbg !22
+  %exitcond = icmp ne i32 %lftr.wideiv, 1024, !dbg !22
+  br i1 %exitcond, label %for.body, label %for.end, !dbg !22
+
+for.end:                                          ; preds = %for.body
+  ret i32 0, !dbg !25
+}
+
+; Function Attrs: nounwind readnone
+
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.value(metadata, metadata, metadata) #1
+
+attributes #0 = { nounwind ssp uwtable "fp-contract-model"="standard" "no-frame-pointer-elim" "no-frame-pointer-elim-non-leaf" "relocation-model"="pic" "ssp-buffers-size"="8" }
+attributes #1 = { nounwind readnone }
+
+!llvm.dbg.cu = !{!11}
+!llvm.module.flags = !{!14}
+
+!0 = !DIGlobalVariableExpression(var: !1, expr: !DIExpression())
+!1 = !DIGlobalVariable(name: "A", scope: null, file: !2, line: 1, type: !3, isLocal: false, isDefinition: true)
+!2 = !DIFile(filename: "test", directory: "/path/to/somewhere")
+!3 = !DICompositeType(tag: DW_TAG_array_type, baseType: !4, size: 32768, align: 32, elements: !5)
+!4 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
+!5 = !{!6}
+!6 = !{i32 786465, i64 0, i64 1024}
+!7 = !DIGlobalVariableExpression(var: !8, expr: !DIExpression())
+!8 = !DIGlobalVariable(name: "B", scope: null, file: !2, line: 2, type: !3, isLocal: false, isDefinition: true)
+!9 = !DIGlobalVariableExpression(var: !10, expr: !DIExpression())
+!10 = !DIGlobalVariable(name: "C", scope: null, file: !2, line: 3, type: !3, isLocal: false, isDefinition: true)
+!11 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !2, producer: "clang", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !12, retainedTypes: !12, globals: !13)
+!12 = !{}
+!13 = !{!0, !7, !9}
+!14 = !{i32 1, !"Debug Info Version", i32 3}
+!15 = distinct !DISubprogram(name: "test", linkageName: "test", scope: !2, file: !2, line: 5, type: !16, isLocal: false, isDefinition: true, scopeLine: 5, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !11, retainedNodes: !18)
+!16 = !DISubroutineType(types: !17)
+!17 = !{!4}
+!18 = !{!19}
+!19 = !DILocalVariable(name: "i", scope: !20, file: !2, line: 6, type: !4)
+!20 = distinct !DILexicalBlock(scope: !15, file: !2, line: 6)
+!21 = !DIExpression()
+!22 = !DILocation(line: 6, scope: !20)
+!23 = !DILocation(line: 7, scope: !24)
+!24 = distinct !DILexicalBlock(scope: !20, file: !2, line: 6)
+!25 = !DILocation(line: 9, scope: !15)
+

Added: llvm/trunk/test/Transforms/LoopVectorize/dead_instructions.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/dead_instructions.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/dead_instructions.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/dead_instructions.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,42 @@
+; RUN: opt < %s -force-vector-width=2 -force-vector-interleave=2 -loop-vectorize -S | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+
+; CHECK-LABEL: @dead_instructions_01
+;
+; This test ensures that we don't generate trivially dead instructions prior to
+; instruction simplification. We don't need to generate instructions
+; corresponding to the original induction variable update or branch condition,
+; since we rewrite the loop structure.
+;
+; CHECK:     vector.body:
+; CHECK:       %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; CHECK:       %[[I0:.+]] = add i64 %index, 0
+; CHECK:       %[[I2:.+]] = add i64 %index, 2
+; CHECK:       getelementptr inbounds i64, i64* %a, i64 %[[I0]]
+; CHECK:       getelementptr inbounds i64, i64* %a, i64 %[[I2]]
+; CHECK-NOT:   add nuw nsw i64 %[[I0]], 1
+; CHECK-NOT:   add nuw nsw i64 %[[I2]], 1
+; CHECK-NOT:   icmp slt i64 {{.*}}, %n
+; CHECK:       %index.next = add i64 %index, 4
+; CHECK:       %[[CMP:.+]] = icmp eq i64 %index.next, %n.vec
+; CHECK:       br i1 %[[CMP]], label %middle.block, label %vector.body
+;
+define i64 @dead_instructions_01(i64 *%a, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+  %r = phi i64 [ %tmp2, %for.body ], [ 0, %entry ]
+  %tmp0 = getelementptr inbounds i64, i64* %a, i64 %i
+  %tmp1 = load i64, i64* %tmp0, align 8
+  %tmp2 = add i64 %tmp1, %r
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  %tmp3  = phi i64 [ %tmp2, %for.body ]
+  ret i64 %tmp3
+}
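+
+; A hypothetical C source for the kernel, a plain sum reduction; the CHECK-NOT
+; lines verify that the scalar IV increment and trip-count compare leave no
+; dead copies in the vector body:
+;
+;   long r = 0;
+;   for (long i = 0; i < n; i++)
+;     r += a[i];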

Added: llvm/trunk/test/Transforms/LoopVectorize/debugloc.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/debugloc.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/debugloc.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/debugloc.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,89 @@
+; RUN: opt -S < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=2 | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+; Make sure we are preserving debug info in the vectorized code.
+
+; CHECK: for.body.lr.ph
+; CHECK:   min.iters.check = icmp ult i64 {{.*}}, 2, !dbg !{{[0-9]+}}
+; CHECK: vector.body
+; CHECK:   index {{.*}}, !dbg ![[LOC:[0-9]+]]
+; CHECK:   getelementptr inbounds i32, i32* %a, {{.*}}, !dbg ![[LOC]]
+; CHECK:   load <2 x i32>, <2 x i32>* {{.*}}, !dbg ![[LOC]]
+; CHECK:   add <2 x i32> {{.*}}, !dbg ![[LOC]]
+; CHECK:   add i64 %index, 2, !dbg ![[LOC]]
+; CHECK:   icmp eq i64 %index.next, %n.vec, !dbg ![[LOC]]
+; CHECK: middle.block
+; CHECK:   add <2 x i32> %{{.*}}, %rdx.shuf, !dbg ![[LOC]]
+; CHECK:   extractelement <2 x i32> %bin.rdx, i32 0, !dbg ![[LOC]]
+
+define i32 @f(i32* nocapture %a, i32 %size) #0 !dbg !4 {
+entry:
+  call void @llvm.dbg.value(metadata i32* %a, metadata !13, metadata !DIExpression()), !dbg !19
+  call void @llvm.dbg.value(metadata i32 %size, metadata !14, metadata !DIExpression()), !dbg !19
+  call void @llvm.dbg.value(metadata i32 0, metadata !15, metadata !DIExpression()), !dbg !20
+  call void @llvm.dbg.value(metadata i32 0, metadata !16, metadata !DIExpression()), !dbg !21
+  %cmp4 = icmp eq i32 %size, 0, !dbg !21
+  br i1 %cmp4, label %for.end, label %for.body.lr.ph, !dbg !21
+
+for.body.lr.ph:                                   ; preds = %entry
+  br label %for.body, !dbg !21
+
+for.body:                                         ; preds = %for.body.lr.ph, %for.body
+  %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ]
+  %sum.05 = phi i32 [ 0, %for.body.lr.ph ], [ %add, %for.body ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv, !dbg !22
+  %0 = load i32, i32* %arrayidx, align 4, !dbg !22
+  %add = add i32 %0, %sum.05, !dbg !22
+  %indvars.iv.next = add i64 %indvars.iv, 1, !dbg !22
+  call void @llvm.dbg.value(metadata !{null}, metadata !16, metadata !DIExpression()), !dbg !22
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32, !dbg !22
+  %exitcond = icmp ne i32 %lftr.wideiv, %size, !dbg !22
+  br i1 %exitcond, label %for.body, label %for.cond.for.end_crit_edge, !dbg !21
+
+for.cond.for.end_crit_edge:                       ; preds = %for.body
+  %add.lcssa = phi i32 [ %add, %for.body ]
+  call void @llvm.dbg.value(metadata i32 %add.lcssa, metadata !15, metadata !DIExpression()), !dbg !22
+  br label %for.end, !dbg !21
+
+for.end:                                          ; preds = %entry, %for.cond.for.end_crit_edge
+  %sum.0.lcssa = phi i32 [ %add.lcssa, %for.cond.for.end_crit_edge ], [ 0, %entry ]
+  ret i32 %sum.0.lcssa, !dbg !26
+}
+
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.value(metadata, metadata, metadata) #1
+
+attributes #0 = { nounwind readonly ssp uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" "use-soft-float"="false" }
+attributes #1 = { nounwind readnone }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!18, !27}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "clang version 3.4 (trunk 185038) (llvm/trunk 185097)", isOptimized: true, emissionKind: FullDebug, file: !1, enums: !2, retainedTypes: !2, globals: !2, imports: !2)
+!1 = !DIFile(filename: "-", directory: "/Volumes/Data/backedup/dev/os/llvm/debug")
+!2 = !{}
+!4 = distinct !DISubprogram(name: "f", line: 3, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 3, file: !5, scope: !6, type: !7, retainedNodes: !12)
+!5 = !DIFile(filename: "<stdin>", directory: "/Volumes/Data/backedup/dev/os/llvm/debug")
+!6 = !DIFile(filename: "<stdin>", directory: "/Volumes/Data/backedup/dev/os/llvm/debug")
+!7 = !DISubroutineType(types: !8)
+!8 = !{!9, !10, !11}
+!9 = !DIBasicType(tag: DW_TAG_base_type, name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
+!10 = !DIDerivedType(tag: DW_TAG_pointer_type, size: 64, align: 64, baseType: !9)
+!11 = !DIBasicType(tag: DW_TAG_base_type, name: "unsigned int", size: 32, align: 32, encoding: DW_ATE_unsigned)
+!12 = !{!13, !14, !15, !16}
+!13 = !DILocalVariable(name: "a", line: 3, arg: 1, scope: !4, file: !6, type: !10)
+!14 = !DILocalVariable(name: "size", line: 3, arg: 2, scope: !4, file: !6, type: !11)
+!15 = !DILocalVariable(name: "sum", line: 4, scope: !4, file: !6, type: !11)
+!16 = !DILocalVariable(name: "i", line: 5, scope: !17, file: !6, type: !11)
+!17 = distinct !DILexicalBlock(line: 5, column: 0, file: !5, scope: !4)
+!18 = !{i32 2, !"Dwarf Version", i32 3}
+!19 = !DILocation(line: 3, scope: !4)
+!20 = !DILocation(line: 4, scope: !4)
+!21 = !DILocation(line: 5, scope: !17)
+!22 = !DILocation(line: 6, scope: !17)
+!26 = !DILocation(line: 7, scope: !4)
+!27 = !{i32 1, !"Debug Info Version", i32 3}

Added: llvm/trunk/test/Transforms/LoopVectorize/demanded-bits-of-pointer-instruction.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/demanded-bits-of-pointer-instruction.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/demanded-bits-of-pointer-instruction.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/demanded-bits-of-pointer-instruction.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,20 @@
+; RUN: opt < %s -loop-vectorize -S | FileCheck %s
+
+; getDemandedBits() is called on the pointer-typed GEP instruction here.
+; This test only makes sure we do not crash.
+
+; CHECK: @test
+define void @test(i8* %ptr, i8* %ptr_end) {
+start:
+  br label %loop
+
+loop:
+  %ptr2 = phi i8* [ %ptr3, %loop ], [ %ptr, %start ]
+  %x = sext i8 undef to i64
+  %ptr3 = getelementptr inbounds i8, i8* %ptr2, i64 1
+  %cmp = icmp ult i8* %ptr3, %ptr_end
+  br i1 %cmp, label %loop, label %end
+
+end:
+  ret void
+}

Added: llvm/trunk/test/Transforms/LoopVectorize/diag-missing-instr-debug-loc.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/diag-missing-instr-debug-loc.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/diag-missing-instr-debug-loc.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/diag-missing-instr-debug-loc.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,77 @@
+; RUN: opt -loop-vectorize -pass-remarks-analysis=loop-vectorize < %s 2>&1 | FileCheck %s
+
+;  1     extern int map[];
+;  2     extern int out[];
+;  3
+;  4     void f(int a, int n) {
+;  5       for (int i = 0; i < n; ++i) {
+;  6         out[i] = a;
+;  7         a = map[a];
+;  8       }
+;  9     }
+
+; CHECK: remark: /tmp/s.c:5:3: loop not vectorized: value that could not be identified as reduction is used outside the loop
+
+; %a.addr.08 is the phi corresponding to the remark.  It does not have a
+; debug location attached.  In this case we should use the debug location of
+; the loop rather than emitting <unknown>:0:0:
+
+; ModuleID = '/tmp/s.c'
+source_filename = "/tmp/s.c"
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+ at out = external local_unnamed_addr global [0 x i32], align 4
+ at map = external local_unnamed_addr global [0 x i32], align 4
+
+; Function Attrs: norecurse nounwind ssp uwtable
+define void @f(i32 %a, i32 %n) local_unnamed_addr #0 !dbg !6 {
+entry:
+  %cmp7 = icmp sgt i32 %n, 0, !dbg !8
+  br i1 %cmp7, label %for.body.preheader, label %for.cond.cleanup, !dbg !9
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %n to i64, !dbg !9
+  br label %for.body, !dbg !10
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void, !dbg !11
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
+  %a.addr.08 = phi i32 [ %0, %for.body ], [ %a, %for.body.preheader ]
+
+  %arrayidx = getelementptr inbounds [0 x i32], [0 x i32]* @out, i64 0, i64 %indvars.iv, !dbg !10
+  store i32 %a.addr.08, i32* %arrayidx, align 4, !dbg !12, !tbaa !13
+  %idxprom1 = sext i32 %a.addr.08 to i64, !dbg !17
+  %arrayidx2 = getelementptr inbounds [0 x i32], [0 x i32]* @map, i64 0, i64 %idxprom1, !dbg !17
+  %0 = load i32, i32* %arrayidx2, align 4, !dbg !17, !tbaa !13
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1, !dbg !9
+  %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count, !dbg !9
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body, !dbg !9, !llvm.loop !18
+}
+
+attributes #0 = { norecurse nounwind ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="core2" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4}
+!llvm.ident = !{!5}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 4.0.0 (trunk 281293) (llvm/trunk 281290)", isOptimized: true, runtimeVersion: 0, emissionKind: NoDebug, enums: !2)
+!1 = !DIFile(filename: "/tmp/s.c", directory: "/tmp")
+!2 = !{}
+!3 = !{i32 2, !"Debug Info Version", i32 3}
+!4 = !{i32 1, !"PIC Level", i32 2}
+!5 = !{!"clang version 4.0.0 (trunk 281293) (llvm/trunk 281290)"}
+!6 = distinct !DISubprogram(name: "f", scope: !1, file: !1, line: 4, type: !7, isLocal: false, isDefinition: true, scopeLine: 4, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !2)
+!7 = !DISubroutineType(types: !2)
+!8 = !DILocation(line: 5, column: 21, scope: !6)
+!9 = !DILocation(line: 5, column: 3, scope: !6)
+!10 = !DILocation(line: 6, column: 5, scope: !6)
+!11 = !DILocation(line: 9, column: 1, scope: !6)
+!12 = !DILocation(line: 6, column: 12, scope: !6)
+!13 = !{!14, !14, i64 0}
+!14 = !{!"int", !15, i64 0}
+!15 = !{!"omnipotent char", !16, i64 0}
+!16 = !{!"Simple C/C++ TBAA"}
+!17 = !DILocation(line: 7, column: 9, scope: !6)
+!18 = distinct !{!18, !9}

Added: llvm/trunk/test/Transforms/LoopVectorize/diag-with-hotness-info-2.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/diag-with-hotness-info-2.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/diag-with-hotness-info-2.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/diag-with-hotness-info-2.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,200 @@
+; RUN: opt -S -loop-vectorize -pass-remarks-analysis=loop-vectorize -pass-remarks-with-hotness < %s 2>&1 | FileCheck %s
+; RUN: opt -S -passes=loop-vectorize -pass-remarks-analysis=loop-vectorize -pass-remarks-with-hotness < %s 2>&1 | FileCheck %s
+
+;   1	void cold(char *A, char *B, char *C, char *D, char *E, int N) {
+;   2	  for(int i = 0; i < N; i++) {
+;   3	    A[i + 1] = A[i] + B[i];
+;   4	    C[i] = D[i] * E[i];
+;   5	  }
+;   6	}
+;   7
+;   8	void hot(char *A, char *B, char *C, char *D, char *E, int N) {
+;   9	  for(int i = 0; i < N; i++) {
+;  10	    A[i + 1] = A[i] + B[i];
+;  11	    C[i] = D[i] * E[i];
+;  12	  }
+;  13	}
+;  14
+;  15	void unknown(char *A, char *B, char *C, char *D, char *E, int N) {
+;  16	  for(int i = 0; i < N; i++) {
+;  17	    A[i + 1] = A[i] + B[i];
+;  18	    C[i] = D[i] * E[i];
+;  19	  }
+;  20	}
+
+; CHECK: remark: /tmp/s.c:2:3: loop not vectorized: unsafe dependent memory operations in loop. Use #pragma loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop (hotness: 300)
+; CHECK: remark: /tmp/s.c:9:3: loop not vectorized: unsafe dependent memory operations in loop. Use #pragma loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop (hotness: 5000)
+; CHECK: remark: /tmp/s.c:16:3: loop not vectorized: unsafe dependent memory operations in loop. Use #pragma loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop{{$}}
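+; The hotness values follow from the profile metadata at the end of the file:
+; function entry counts of 3 (!56) and 50 (!57), scaled by the roughly
+; 100-iteration trip count implied by the 1:99 backedge weights (!59).
+; @unknown has no entry count, so its remark carries no hotness.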
+
+; ModuleID = '/tmp/s.c'
+source_filename = "/tmp/s.c"
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+; Function Attrs: norecurse nounwind ssp uwtable
+define void @cold(i8* nocapture %A, i8* nocapture readonly %B, i8* nocapture %C, i8* nocapture readonly %D, i8* nocapture readonly %E, i32 %N) local_unnamed_addr #0 !dbg !7 !prof !56 {
+entry:
+  %cmp28 = icmp sgt i32 %N, 0, !dbg !9
+  br i1 %cmp28, label %ph, label %for.cond.cleanup, !dbg !10, !prof !58
+
+ph:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %ph ]
+  %arrayidx = getelementptr inbounds i8, i8* %A, i64 %indvars.iv, !dbg !12
+  %0 = load i8, i8* %arrayidx, align 1, !dbg !12, !tbaa !13
+  %arrayidx2 = getelementptr inbounds i8, i8* %B, i64 %indvars.iv, !dbg !16
+  %1 = load i8, i8* %arrayidx2, align 1, !dbg !16, !tbaa !13
+  %add = add i8 %1, %0, !dbg !17
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1, !dbg !10
+  %arrayidx7 = getelementptr inbounds i8, i8* %A, i64 %indvars.iv.next, !dbg !18
+  store i8 %add, i8* %arrayidx7, align 1, !dbg !19, !tbaa !13
+  %arrayidx9 = getelementptr inbounds i8, i8* %D, i64 %indvars.iv, !dbg !20
+  %2 = load i8, i8* %arrayidx9, align 1, !dbg !20, !tbaa !13
+  %arrayidx12 = getelementptr inbounds i8, i8* %E, i64 %indvars.iv, !dbg !21
+  %3 = load i8, i8* %arrayidx12, align 1, !dbg !21, !tbaa !13
+  %mul = mul i8 %3, %2, !dbg !22
+  %arrayidx16 = getelementptr inbounds i8, i8* %C, i64 %indvars.iv, !dbg !23
+  store i8 %mul, i8* %arrayidx16, align 1, !dbg !24, !tbaa !13
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32, !dbg !10
+  %exitcond = icmp eq i32 %lftr.wideiv, %N, !dbg !10
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body, !dbg !10, !llvm.loop !25, !prof !59
+
+for.cond.cleanup:
+  ret void, !dbg !11
+}
+
+; Function Attrs: norecurse nounwind ssp uwtable
+define void @hot(i8* nocapture %A, i8* nocapture readonly %B, i8* nocapture %C, i8* nocapture readonly %D, i8* nocapture readonly %E, i32 %N) local_unnamed_addr #0 !dbg !26 !prof !57 {
+entry:
+  %cmp28 = icmp sgt i32 %N, 0, !dbg !27
+  br i1 %cmp28, label %ph, label %for.cond.cleanup, !dbg !28, !prof !58
+
+ph:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %ph ]
+  %arrayidx = getelementptr inbounds i8, i8* %A, i64 %indvars.iv, !dbg !30
+  %0 = load i8, i8* %arrayidx, align 1, !dbg !30, !tbaa !13
+  %arrayidx2 = getelementptr inbounds i8, i8* %B, i64 %indvars.iv, !dbg !31
+  %1 = load i8, i8* %arrayidx2, align 1, !dbg !31, !tbaa !13
+  %add = add i8 %1, %0, !dbg !32
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1, !dbg !28
+  %arrayidx7 = getelementptr inbounds i8, i8* %A, i64 %indvars.iv.next, !dbg !33
+  store i8 %add, i8* %arrayidx7, align 1, !dbg !34, !tbaa !13
+  %arrayidx9 = getelementptr inbounds i8, i8* %D, i64 %indvars.iv, !dbg !35
+  %2 = load i8, i8* %arrayidx9, align 1, !dbg !35, !tbaa !13
+  %arrayidx12 = getelementptr inbounds i8, i8* %E, i64 %indvars.iv, !dbg !36
+  %3 = load i8, i8* %arrayidx12, align 1, !dbg !36, !tbaa !13
+  %mul = mul i8 %3, %2, !dbg !37
+  %arrayidx16 = getelementptr inbounds i8, i8* %C, i64 %indvars.iv, !dbg !38
+  store i8 %mul, i8* %arrayidx16, align 1, !dbg !39, !tbaa !13
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32, !dbg !28
+  %exitcond = icmp eq i32 %lftr.wideiv, %N, !dbg !28
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body, !dbg !28, !llvm.loop !40, !prof !59
+
+for.cond.cleanup:
+  ret void, !dbg !29
+}
+
+; Function Attrs: norecurse nounwind ssp uwtable
+define void @unknown(i8* nocapture %A, i8* nocapture readonly %B, i8* nocapture %C, i8* nocapture readonly %D, i8* nocapture readonly %E, i32 %N) local_unnamed_addr #0 !dbg !41 {
+entry:
+  %cmp28 = icmp sgt i32 %N, 0, !dbg !42
+  br i1 %cmp28, label %ph, label %for.cond.cleanup, !dbg !43
+
+ph:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %ph ]
+  %arrayidx = getelementptr inbounds i8, i8* %A, i64 %indvars.iv, !dbg !45
+  %0 = load i8, i8* %arrayidx, align 1, !dbg !45, !tbaa !13
+  %arrayidx2 = getelementptr inbounds i8, i8* %B, i64 %indvars.iv, !dbg !46
+  %1 = load i8, i8* %arrayidx2, align 1, !dbg !46, !tbaa !13
+  %add = add i8 %1, %0, !dbg !47
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1, !dbg !43
+  %arrayidx7 = getelementptr inbounds i8, i8* %A, i64 %indvars.iv.next, !dbg !48
+  store i8 %add, i8* %arrayidx7, align 1, !dbg !49, !tbaa !13
+  %arrayidx9 = getelementptr inbounds i8, i8* %D, i64 %indvars.iv, !dbg !50
+  %2 = load i8, i8* %arrayidx9, align 1, !dbg !50, !tbaa !13
+  %arrayidx12 = getelementptr inbounds i8, i8* %E, i64 %indvars.iv, !dbg !51
+  %3 = load i8, i8* %arrayidx12, align 1, !dbg !51, !tbaa !13
+  %mul = mul i8 %3, %2, !dbg !52
+  %arrayidx16 = getelementptr inbounds i8, i8* %C, i64 %indvars.iv, !dbg !53
+  store i8 %mul, i8* %arrayidx16, align 1, !dbg !54, !tbaa !13
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32, !dbg !43
+  %exitcond = icmp eq i32 %lftr.wideiv, %N, !dbg !43
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body, !dbg !43, !llvm.loop !55
+
+for.cond.cleanup:
+  ret void, !dbg !44
+}
+
+attributes #0 = { norecurse nounwind ssp uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="core2" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4, !5}
+!llvm.ident = !{!6}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 3.9.0 (trunk 273572) (llvm/trunk 273585)", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly, enums: !2)
+!1 = !DIFile(filename: "/tmp/s.c", directory: "/tmp")
+!2 = !{}
+!3 = !{i32 2, !"Dwarf Version", i32 2}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = !{i32 1, !"PIC Level", i32 2}
+!6 = !{!"clang version 3.9.0 (trunk 273572) (llvm/trunk 273585)"}
+!7 = distinct !DISubprogram(name: "cold", scope: !1, file: !1, line: 1, type: !8, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !2)
+!8 = !DISubroutineType(types: !2)
+!9 = !DILocation(line: 2, column: 20, scope: !7)
+!10 = !DILocation(line: 2, column: 3, scope: !7)
+!11 = !DILocation(line: 6, column: 1, scope: !7)
+!12 = !DILocation(line: 3, column: 16, scope: !7)
+!13 = !{!14, !14, i64 0}
+!14 = !{!"omnipotent char", !15, i64 0}
+!15 = !{!"Simple C/C++ TBAA"}
+!16 = !DILocation(line: 3, column: 23, scope: !7)
+!17 = !DILocation(line: 3, column: 21, scope: !7)
+!18 = !DILocation(line: 3, column: 5, scope: !7)
+!19 = !DILocation(line: 3, column: 14, scope: !7)
+!20 = !DILocation(line: 4, column: 12, scope: !7)
+!21 = !DILocation(line: 4, column: 19, scope: !7)
+!22 = !DILocation(line: 4, column: 17, scope: !7)
+!23 = !DILocation(line: 4, column: 5, scope: !7)
+!24 = !DILocation(line: 4, column: 10, scope: !7)
+!25 = distinct !{!25, !10}
+!26 = distinct !DISubprogram(name: "hot", scope: !1, file: !1, line: 8, type: !8, isLocal: false, isDefinition: true, scopeLine: 8, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !2)
+!27 = !DILocation(line: 9, column: 20, scope: !26)
+!28 = !DILocation(line: 9, column: 3, scope: !26)
+!29 = !DILocation(line: 13, column: 1, scope: !26)
+!30 = !DILocation(line: 10, column: 16, scope: !26)
+!31 = !DILocation(line: 10, column: 23, scope: !26)
+!32 = !DILocation(line: 10, column: 21, scope: !26)
+!33 = !DILocation(line: 10, column: 5, scope: !26)
+!34 = !DILocation(line: 10, column: 14, scope: !26)
+!35 = !DILocation(line: 11, column: 12, scope: !26)
+!36 = !DILocation(line: 11, column: 19, scope: !26)
+!37 = !DILocation(line: 11, column: 17, scope: !26)
+!38 = !DILocation(line: 11, column: 5, scope: !26)
+!39 = !DILocation(line: 11, column: 10, scope: !26)
+!40 = distinct !{!40, !28}
+!41 = distinct !DISubprogram(name: "unknown", scope: !1, file: !1, line: 15, type: !8, isLocal: false, isDefinition: true, scopeLine: 15, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !2)
+!42 = !DILocation(line: 16, column: 20, scope: !41)
+!43 = !DILocation(line: 16, column: 3, scope: !41)
+!44 = !DILocation(line: 20, column: 1, scope: !41)
+!45 = !DILocation(line: 17, column: 16, scope: !41)
+!46 = !DILocation(line: 17, column: 23, scope: !41)
+!47 = !DILocation(line: 17, column: 21, scope: !41)
+!48 = !DILocation(line: 17, column: 5, scope: !41)
+!49 = !DILocation(line: 17, column: 14, scope: !41)
+!50 = !DILocation(line: 18, column: 12, scope: !41)
+!51 = !DILocation(line: 18, column: 19, scope: !41)
+!52 = !DILocation(line: 18, column: 17, scope: !41)
+!53 = !DILocation(line: 18, column: 5, scope: !41)
+!54 = !DILocation(line: 18, column: 10, scope: !41)
+!55 = distinct !{!55, !43}
+!56 = !{!"function_entry_count", i64 3}
+!57 = !{!"function_entry_count", i64 50}
+!58 = !{!"branch_weights", i32 99, i32 1}
+!59 = !{!"branch_weights", i32 1, i32 99}

Added: llvm/trunk/test/Transforms/LoopVectorize/diag-with-hotness-info.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/diag-with-hotness-info.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/diag-with-hotness-info.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/diag-with-hotness-info.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,213 @@
+; RUN: opt -S -loop-vectorize -pass-remarks-missed=loop-vectorize \
+; RUN:     -pass-remarks-with-hotness < %s 2>&1 | \
+; RUN:     FileCheck -check-prefix=HOTNESS -check-prefix=BOTH %s
+
+; RUN: opt -S -loop-vectorize -pass-remarks-missed=loop-vectorize < %s 2>&1 | \
+; RUN:     FileCheck -check-prefix=NO_HOTNESS -check-prefix=BOTH %s
+
+
+; RUN: opt -S -passes=loop-vectorize -pass-remarks-missed=loop-vectorize \
+; RUN:     -pass-remarks-with-hotness < %s 2>&1 | \
+; RUN:     FileCheck -check-prefix=HOTNESS -check-prefix=BOTH %s
+
+; RUN: opt -S -passes=loop-vectorize \
+; RUN:     -pass-remarks-missed=loop-vectorize < %s 2>&1 | \
+; RUN:     FileCheck -check-prefix=NO_HOTNESS -check-prefix=BOTH %s
+
+
+;   1	void cold(char *A, char *B, char *C, char *D, char *E, int N) {
+;   2	  for(int i = 0; i < N; i++) {
+;   3	    A[i + 1] = A[i] + B[i];
+;   4	    C[i] = D[i] * E[i];
+;   5	  }
+;   6	}
+;   7
+;   8	void hot(char *A, char *B, char *C, char *D, char *E, int N) {
+;   9	  for(int i = 0; i < N; i++) {
+;  10	    A[i + 1] = A[i] + B[i];
+;  11	    C[i] = D[i] * E[i];
+;  12	  }
+;  13	}
+;  14
+;  15	void unknown(char *A, char *B, char *C, char *D, char *E, int N) {
+;  16	  for(int i = 0; i < N; i++) {
+;  17	    A[i + 1] = A[i] + B[i];
+;  18	    C[i] = D[i] * E[i];
+;  19	  }
+;  20	}
+
+; HOTNESS: remark: /tmp/s.c:2:3: loop not vectorized (hotness: 300)
+; NO_HOTNESS: remark: /tmp/s.c:2:3: loop not vectorized{{$}}
+; HOTNESS: remark: /tmp/s.c:9:3: loop not vectorized (hotness: 5000)
+; NO_HOTNESS: remark: /tmp/s.c:9:3: loop not vectorized{{$}}
+; BOTH: remark: /tmp/s.c:16:3: loop not vectorized{{$}}
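+; The {{$}} end-of-line anchors ensure that no "(hotness: ...)" suffix is
+; printed when hotness reporting is off or, for @unknown, when no entry
+; count is available.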
+
+; ModuleID = '/tmp/s.c'
+source_filename = "/tmp/s.c"
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+; Function Attrs: norecurse nounwind ssp uwtable
+define void @cold(i8* nocapture %A, i8* nocapture readonly %B, i8* nocapture %C, i8* nocapture readonly %D, i8* nocapture readonly %E, i32 %N) local_unnamed_addr #0 !dbg !7 !prof !56 {
+entry:
+  %cmp28 = icmp sgt i32 %N, 0, !dbg !9
+  br i1 %cmp28, label %ph, label %for.cond.cleanup, !dbg !10, !prof !58
+
+ph:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %ph ]
+  %arrayidx = getelementptr inbounds i8, i8* %A, i64 %indvars.iv, !dbg !12
+  %0 = load i8, i8* %arrayidx, align 1, !dbg !12, !tbaa !13
+  %arrayidx2 = getelementptr inbounds i8, i8* %B, i64 %indvars.iv, !dbg !16
+  %1 = load i8, i8* %arrayidx2, align 1, !dbg !16, !tbaa !13
+  %add = add i8 %1, %0, !dbg !17
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1, !dbg !10
+  %arrayidx7 = getelementptr inbounds i8, i8* %A, i64 %indvars.iv.next, !dbg !18
+  store i8 %add, i8* %arrayidx7, align 1, !dbg !19, !tbaa !13
+  %arrayidx9 = getelementptr inbounds i8, i8* %D, i64 %indvars.iv, !dbg !20
+  %2 = load i8, i8* %arrayidx9, align 1, !dbg !20, !tbaa !13
+  %arrayidx12 = getelementptr inbounds i8, i8* %E, i64 %indvars.iv, !dbg !21
+  %3 = load i8, i8* %arrayidx12, align 1, !dbg !21, !tbaa !13
+  %mul = mul i8 %3, %2, !dbg !22
+  %arrayidx16 = getelementptr inbounds i8, i8* %C, i64 %indvars.iv, !dbg !23
+  store i8 %mul, i8* %arrayidx16, align 1, !dbg !24, !tbaa !13
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32, !dbg !10
+  %exitcond = icmp eq i32 %lftr.wideiv, %N, !dbg !10
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body, !dbg !10, !llvm.loop !25, !prof !59
+
+for.cond.cleanup:
+  ret void, !dbg !11
+}
+
+; Function Attrs: norecurse nounwind ssp uwtable
+define void @hot(i8* nocapture %A, i8* nocapture readonly %B, i8* nocapture %C, i8* nocapture readonly %D, i8* nocapture readonly %E, i32 %N) local_unnamed_addr #0 !dbg !26 !prof !57 {
+entry:
+  %cmp28 = icmp sgt i32 %N, 0, !dbg !27
+  br i1 %cmp28, label %ph, label %for.cond.cleanup, !dbg !28, !prof !58
+
+ph:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %ph ]
+  %arrayidx = getelementptr inbounds i8, i8* %A, i64 %indvars.iv, !dbg !30
+  %0 = load i8, i8* %arrayidx, align 1, !dbg !30, !tbaa !13
+  %arrayidx2 = getelementptr inbounds i8, i8* %B, i64 %indvars.iv, !dbg !31
+  %1 = load i8, i8* %arrayidx2, align 1, !dbg !31, !tbaa !13
+  %add = add i8 %1, %0, !dbg !32
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1, !dbg !28
+  %arrayidx7 = getelementptr inbounds i8, i8* %A, i64 %indvars.iv.next, !dbg !33
+  store i8 %add, i8* %arrayidx7, align 1, !dbg !34, !tbaa !13
+  %arrayidx9 = getelementptr inbounds i8, i8* %D, i64 %indvars.iv, !dbg !35
+  %2 = load i8, i8* %arrayidx9, align 1, !dbg !35, !tbaa !13
+  %arrayidx12 = getelementptr inbounds i8, i8* %E, i64 %indvars.iv, !dbg !36
+  %3 = load i8, i8* %arrayidx12, align 1, !dbg !36, !tbaa !13
+  %mul = mul i8 %3, %2, !dbg !37
+  %arrayidx16 = getelementptr inbounds i8, i8* %C, i64 %indvars.iv, !dbg !38
+  store i8 %mul, i8* %arrayidx16, align 1, !dbg !39, !tbaa !13
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32, !dbg !28
+  %exitcond = icmp eq i32 %lftr.wideiv, %N, !dbg !28
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body, !dbg !28, !llvm.loop !40, !prof !59
+
+for.cond.cleanup:
+  ret void, !dbg !29
+}
+
+; Function Attrs: norecurse nounwind ssp uwtable
+define void @unknown(i8* nocapture %A, i8* nocapture readonly %B, i8* nocapture %C, i8* nocapture readonly %D, i8* nocapture readonly %E, i32 %N) local_unnamed_addr #0 !dbg !41 {
+entry:
+  %cmp28 = icmp sgt i32 %N, 0, !dbg !42
+  br i1 %cmp28, label %for.body, label %for.cond.cleanup, !dbg !43
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void, !dbg !44
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i8, i8* %A, i64 %indvars.iv, !dbg !45
+  %0 = load i8, i8* %arrayidx, align 1, !dbg !45, !tbaa !13
+  %arrayidx2 = getelementptr inbounds i8, i8* %B, i64 %indvars.iv, !dbg !46
+  %1 = load i8, i8* %arrayidx2, align 1, !dbg !46, !tbaa !13
+  %add = add i8 %1, %0, !dbg !47
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1, !dbg !43
+  %arrayidx7 = getelementptr inbounds i8, i8* %A, i64 %indvars.iv.next, !dbg !48
+  store i8 %add, i8* %arrayidx7, align 1, !dbg !49, !tbaa !13
+  %arrayidx9 = getelementptr inbounds i8, i8* %D, i64 %indvars.iv, !dbg !50
+  %2 = load i8, i8* %arrayidx9, align 1, !dbg !50, !tbaa !13
+  %arrayidx12 = getelementptr inbounds i8, i8* %E, i64 %indvars.iv, !dbg !51
+  %3 = load i8, i8* %arrayidx12, align 1, !dbg !51, !tbaa !13
+  %mul = mul i8 %3, %2, !dbg !52
+  %arrayidx16 = getelementptr inbounds i8, i8* %C, i64 %indvars.iv, !dbg !53
+  store i8 %mul, i8* %arrayidx16, align 1, !dbg !54, !tbaa !13
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32, !dbg !43
+  %exitcond = icmp eq i32 %lftr.wideiv, %N, !dbg !43
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body, !dbg !43, !llvm.loop !55
+}
+
+attributes #0 = { norecurse nounwind ssp uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="core2" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4, !5}
+!llvm.ident = !{!6}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 3.9.0 (trunk 273572) (llvm/trunk 273585)", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly, enums: !2)
+!1 = !DIFile(filename: "/tmp/s.c", directory: "/tmp")
+!2 = !{}
+!3 = !{i32 2, !"Dwarf Version", i32 2}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = !{i32 1, !"PIC Level", i32 2}
+!6 = !{!"clang version 3.9.0 (trunk 273572) (llvm/trunk 273585)"}
+!7 = distinct !DISubprogram(name: "cold", scope: !1, file: !1, line: 1, type: !8, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !2)
+!8 = !DISubroutineType(types: !2)
+!9 = !DILocation(line: 2, column: 20, scope: !7)
+!10 = !DILocation(line: 2, column: 3, scope: !7)
+!11 = !DILocation(line: 6, column: 1, scope: !7)
+!12 = !DILocation(line: 3, column: 16, scope: !7)
+!13 = !{!14, !14, i64 0}
+!14 = !{!"omnipotent char", !15, i64 0}
+!15 = !{!"Simple C/C++ TBAA"}
+!16 = !DILocation(line: 3, column: 23, scope: !7)
+!17 = !DILocation(line: 3, column: 21, scope: !7)
+!18 = !DILocation(line: 3, column: 5, scope: !7)
+!19 = !DILocation(line: 3, column: 14, scope: !7)
+!20 = !DILocation(line: 4, column: 12, scope: !7)
+!21 = !DILocation(line: 4, column: 19, scope: !7)
+!22 = !DILocation(line: 4, column: 17, scope: !7)
+!23 = !DILocation(line: 4, column: 5, scope: !7)
+!24 = !DILocation(line: 4, column: 10, scope: !7)
+!25 = distinct !{!25, !10}
+!26 = distinct !DISubprogram(name: "hot", scope: !1, file: !1, line: 8, type: !8, isLocal: false, isDefinition: true, scopeLine: 8, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !2)
+!27 = !DILocation(line: 9, column: 20, scope: !26)
+!28 = !DILocation(line: 9, column: 3, scope: !26)
+!29 = !DILocation(line: 13, column: 1, scope: !26)
+!30 = !DILocation(line: 10, column: 16, scope: !26)
+!31 = !DILocation(line: 10, column: 23, scope: !26)
+!32 = !DILocation(line: 10, column: 21, scope: !26)
+!33 = !DILocation(line: 10, column: 5, scope: !26)
+!34 = !DILocation(line: 10, column: 14, scope: !26)
+!35 = !DILocation(line: 11, column: 12, scope: !26)
+!36 = !DILocation(line: 11, column: 19, scope: !26)
+!37 = !DILocation(line: 11, column: 17, scope: !26)
+!38 = !DILocation(line: 11, column: 5, scope: !26)
+!39 = !DILocation(line: 11, column: 10, scope: !26)
+!40 = distinct !{!40, !28}
+!41 = distinct !DISubprogram(name: "unknown", scope: !1, file: !1, line: 15, type: !8, isLocal: false, isDefinition: true, scopeLine: 15, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !2)
+!42 = !DILocation(line: 16, column: 20, scope: !41)
+!43 = !DILocation(line: 16, column: 3, scope: !41)
+!44 = !DILocation(line: 20, column: 1, scope: !41)
+!45 = !DILocation(line: 17, column: 16, scope: !41)
+!46 = !DILocation(line: 17, column: 23, scope: !41)
+!47 = !DILocation(line: 17, column: 21, scope: !41)
+!48 = !DILocation(line: 17, column: 5, scope: !41)
+!49 = !DILocation(line: 17, column: 14, scope: !41)
+!50 = !DILocation(line: 18, column: 12, scope: !41)
+!51 = !DILocation(line: 18, column: 19, scope: !41)
+!52 = !DILocation(line: 18, column: 17, scope: !41)
+!53 = !DILocation(line: 18, column: 5, scope: !41)
+!54 = !DILocation(line: 18, column: 10, scope: !41)
+!55 = distinct !{!55, !43}
+!56 = !{!"function_entry_count", i64 3}
+!57 = !{!"function_entry_count", i64 50}
+!58 = !{!"branch_weights", i32 99, i32 1}
+!59 = !{!"branch_weights", i32 1, i32 99}

Added: llvm/trunk/test/Transforms/LoopVectorize/disable_nonforced.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/disable_nonforced.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/disable_nonforced.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/disable_nonforced.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,29 @@
+; RUN: opt -loop-vectorize -force-vector-interleave=1 -force-vector-width=2 -S < %s | FileCheck %s
+;
+; Check that the disable_nonforced loop property is honored by the
+; loop vectorizer.
+;
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+; CHECK-LABEL: @disable_nonforced(
+; CHECK-NOT: x i32>
+define void @disable_nonforced(i32* nocapture %a, i32 %n) {
+entry:
+  %cmp4 = icmp sgt i32 %n, 0
+  br i1 %cmp4, label %for.body, label %for.end
+
+for.body:
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+  %0 = trunc i64 %indvars.iv to i32
+  store i32 %0, i32* %arrayidx, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+  ret void
+}
+
+!0 = !{!0, !{!"llvm.loop.disable_nonforced"}}

Added: llvm/trunk/test/Transforms/LoopVectorize/disable_nonforced_enable.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/disable_nonforced_enable.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/disable_nonforced_enable.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/disable_nonforced_enable.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,29 @@
+; RUN: opt -loop-vectorize -force-vector-interleave=1 -force-vector-width=2 -S < %s | FileCheck %s
+;
+; Check whether the llvm.loop.vectorize.enable loop property overrides
+; llvm.loop.disable_nonforced.
+;
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+; CHECK-LABEL: @disable_nonforced_enable(
+; CHECK: store <2 x i32>
+define void @disable_nonforced_enable(i32* nocapture %a, i32 %n) {
+entry:
+  %cmp4 = icmp sgt i32 %n, 0
+  br i1 %cmp4, label %for.body, label %for.end
+
+for.body:
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+  %0 = trunc i64 %indvars.iv to i32
+  store i32 %0, i32* %arrayidx, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+  ret void
+}
+
+!0 = !{!0, !{!"llvm.loop.disable_nonforced"}, !{!"llvm.loop.vectorize.enable", i32 1}}

Added: llvm/trunk/test/Transforms/LoopVectorize/discriminator.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/discriminator.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/discriminator.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/discriminator.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,76 @@
+; RUN: opt -S -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 < %s | FileCheck --check-prefix=DBG_VALUE --check-prefix=LOOPVEC_4_1 %s
+; RUN: opt -S -loop-vectorize -force-vector-width=2 -force-vector-interleave=3 < %s | FileCheck --check-prefix=DBG_VALUE --check-prefix=LOOPVEC_2_3 %s
+; RUN: opt -S -loop-unroll  -unroll-count=5 < %s | FileCheck --check-prefix=DBG_VALUE --check-prefix=LOOPUNROLL_5 %s
+; RUN: opt -S -loop-vectorize -force-vector-width=4 -force-vector-interleave=4 -loop-unroll -unroll-count=2 < %s | FileCheck --check-prefix=DBG_VALUE --check-prefix=LOOPVEC_UNROLL %s
+
+; Test whether the vectorization/unroll factor is recorded in the discriminator.
+;
+; Original source code:
+;  1 int *a;
+;  2 int *b;
+;  3 
+;  4 void foo() {
+;  5   for (int i = 0; i < 4096; i++)
+;  6     a[i] += b[i];
+;  7 }
+
+ at a = local_unnamed_addr global i32* null, align 8
+ at b = local_unnamed_addr global i32* null, align 8
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+
+define void @_Z3foov() local_unnamed_addr #0 !dbg !6 {
+  %1 = load i32*, i32** @b, align 8, !dbg !8, !tbaa !9
+  %2 = load i32*, i32** @a, align 8, !dbg !13, !tbaa !9
+  br label %3, !dbg !14
+
+; <label>:3:                                      ; preds = %3, %0
+  %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %3 ]
+  %4 = getelementptr inbounds i32, i32* %1, i64 %indvars.iv, !dbg !8
+  %5 = load i32, i32* %4, align 4, !dbg !8, !tbaa !15
+  %6 = getelementptr inbounds i32, i32* %2, i64 %indvars.iv, !dbg !13
+  %7 = load i32, i32* %6, align 4, !dbg !17, !tbaa !15
+  %8 = add nsw i32 %7, %5, !dbg !17
+;DBG_VALUE: call void @llvm.dbg.declare{{.*}}!dbg ![[DBG:[0-9]*]]
+  call void @llvm.dbg.declare(metadata i32 %8, metadata !22, metadata !DIExpression()), !dbg !17
+  store i32 %8, i32* %6, align 4, !dbg !17, !tbaa !15
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1, !dbg !18
+  %exitcond = icmp eq i64 %indvars.iv.next, 4096, !dbg !19
+  br i1 %exitcond, label %9, label %3, !dbg !14, !llvm.loop !20
+
+; <label>:9:                                      ; preds = %3
+  ret void, !dbg !21
+}
+
+;DBG_VALUE: ![[TOP:[0-9]*]] = distinct !DISubprogram(name: "foo"
+;LOOPVEC_4_1: discriminator: 17
+;LOOPVEC_2_3: discriminator: 25
+;LOOPUNROLL_5: discriminator: 21
+; When unrolling after loop vectorization, both the vector body and the
+; remainder loop are unrolled.
+;LOOPVEC_UNROLL: discriminator: 385
+;LOOPVEC_UNROLL: discriminator: 9
+;DBG_VALUE: ![[DBG]] = {{.*}}, scope: ![[TOP]]
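+; (The discriminator constants record the duplication factor, i.e. the
+; vectorization width times the interleave count, or the unroll count; 17, 25
+; and 21 appear to correspond to factors of 4, 6 and 5 under DILocation's
+; discriminator encoding.)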
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, debugInfoForProfiling: true)
+!1 = !DIFile(filename: "a.cc", directory: "/")
+!3 = !{i32 2, !"Dwarf Version", i32 4}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!6 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 4, unit: !0)
+!8 = !DILocation(line: 6, column: 13, scope: !6)
+!9 = !{!10, !10, i64 0}
+!10 = !{!"any pointer", !11, i64 0}
+!11 = !{!"omnipotent char", !12, i64 0}
+!12 = !{!"Simple C++ TBAA"}
+!13 = !DILocation(line: 6, column: 5, scope: !6)
+!14 = !DILocation(line: 5, column: 3, scope: !6)
+!15 = !{!16, !16, i64 0}
+!16 = !{!"int", !11, i64 0}
+!17 = !DILocation(line: 6, column: 10, scope: !6)
+!18 = !DILocation(line: 5, column: 30, scope: !6)
+!19 = !DILocation(line: 5, column: 21, scope: !6)
+!20 = distinct !{!20, !14}
+!21 = !DILocation(line: 7, column: 1, scope: !6)
+!22 = !DILocalVariable(name: "a", arg: 1, scope: !6, file: !1, line: 10)

Added: llvm/trunk/test/Transforms/LoopVectorize/ee-crash.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/ee-crash.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/ee-crash.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/ee-crash.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,34 @@
+; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+; This test checks that we deal with an in-loop extractelement (for now, this
+; means not crashing by not vectorizing).
+; CHECK-LABEL: @_Z4foo1Pii(
+; CHECK-NOT: <4 x i32>
+; CHECK: ret
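+; (The extractelement below uses a variable lane index, %n, which cannot be
+; widened, so the whole loop is rejected.)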
+define i32 @_Z4foo1Pii(i32* %A, i32 %n, <2 x i32> %q) #0 {
+entry:
+  %idx.ext = sext i32 %n to i64
+  %add.ptr = getelementptr inbounds i32, i32* %A, i64 %idx.ext
+  %cmp3.i = icmp eq i32 %n, 0
+  br i1 %cmp3.i, label %_ZSt10accumulateIPiiET0_T_S2_S1_.exit, label %for.body.i
+
+for.body.i:                                       ; preds = %entry, %for.body.i
+  %__init.addr.05.i = phi i32 [ %add.i, %for.body.i ], [ 0, %entry ]
+  %__first.addr.04.i = phi i32* [ %incdec.ptr.i, %for.body.i ], [ %A, %entry ]
+  %0 = load i32, i32* %__first.addr.04.i, align 4
+  %q1 = extractelement <2 x i32> %q, i32 %n
+  %q2 = add nsw i32 %0, %q1
+  %add.i = add nsw i32 %q2, %__init.addr.05.i
+  %incdec.ptr.i = getelementptr inbounds i32, i32* %__first.addr.04.i, i64 1
+  %cmp.i = icmp eq i32* %incdec.ptr.i, %add.ptr
+  br i1 %cmp.i, label %_ZSt10accumulateIPiiET0_T_S2_S1_.exit, label %for.body.i
+
+_ZSt10accumulateIPiiET0_T_S2_S1_.exit:            ; preds = %for.body.i, %entry
+  %__init.addr.0.lcssa.i = phi i32 [ 0, %entry ], [ %add.i, %for.body.i ]
+  ret i32 %__init.addr.0.lcssa.i
+}
+
+attributes #0 = { nounwind readonly ssp uwtable }
+

Added: llvm/trunk/test/Transforms/LoopVectorize/exact.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/exact.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/exact.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/exact.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,23 @@
+; RUN: opt < %s  -loop-vectorize -force-vector-width=4 -S | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+; CHECK-LABEL: @lshr_exact(
+; CHECK: lshr exact <4 x i32>
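+; (Instruction flags such as 'exact' are expected to carry over when the
+; vectorizer widens the scalar lshr.)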
+define void @lshr_exact(i32* %x) {
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i32, i32* %x, i64 %iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %conv1 = lshr exact i32 %0, 1
+  store i32 %conv1, i32* %arrayidx, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 256
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}

Added: llvm/trunk/test/Transforms/LoopVectorize/explicit_outer_detection.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/explicit_outer_detection.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/explicit_outer_detection.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/explicit_outer_detection.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,236 @@
+; RUN: opt < %s -loop-vectorize -enable-vplan-native-path -debug-only=loop-vectorize -S 2>&1 | FileCheck %s
+; REQUIRES: asserts
+
+; Verify that outer loops annotated with only the expected explicit
+; vectorization hints are collected for vectorization instead of their inner
+; loops.
+
+; Root C/C++ source code for all the test cases
+; void foo(int *a, int *b, int N, int M)
+; {
+;   int i, j;
+; #pragma clang loop vectorize(enable)
+;   for (i = 0; i < N; i++) {
+;     for (j = 0; j < M; j++) {
+;       a[i*M+j] = b[i*M+j] * b[i*M+j];
+;     }
+;   }
+; }
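+; The '#pragma clang loop vectorize(enable)' hint lowers to the
+; llvm.loop.vectorize.enable metadata (!8 at the end of this file).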
+
+; Case 1: Annotated outer loop WITH vector width information must be collected.
+
+; CHECK-LABEL: vector_width
+; CHECK: LV: Loop hints: force=enabled width=4 unroll=0
+; CHECK: LV: We can vectorize this outer loop!
+; CHECK: LV: Using user VF 4 to build VPlans.
+; CHECK-NOT: LV: Loop hints: force=?
+; CHECK-NOT: LV: Found a loop: inner.body
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @vector_width(i32* nocapture %a, i32* nocapture readonly %b, i32 %N, i32 %M) local_unnamed_addr {
+entry:
+  %cmp32 = icmp sgt i32 %N, 0
+  br i1 %cmp32, label %outer.ph, label %for.end15
+
+outer.ph:                                   ; preds = %entry
+  %cmp230 = icmp sgt i32 %M, 0
+  %0 = sext i32 %M to i64
+  %wide.trip.count = zext i32 %M to i64
+  %wide.trip.count38 = zext i32 %N to i64
+  br label %outer.body
+
+outer.body:                                 ; preds = %outer.inc, %outer.ph
+  %indvars.iv35 = phi i64 [ 0, %outer.ph ], [ %indvars.iv.next36, %outer.inc ]
+  br i1 %cmp230, label %inner.ph, label %outer.inc
+
+inner.ph:                                   ; preds = %outer.body
+  %1 = mul nsw i64 %indvars.iv35, %0
+  br label %inner.body
+
+inner.body:                                 ; preds = %inner.body, %inner.ph
+  %indvars.iv = phi i64 [ 0, %inner.ph ], [ %indvars.iv.next, %inner.body ]
+  %2 = add nsw i64 %indvars.iv, %1
+  %arrayidx = getelementptr inbounds i32, i32* %b, i64 %2
+  %3 = load i32, i32* %arrayidx, align 4, !tbaa !2
+  %mul8 = mul nsw i32 %3, %3
+  %arrayidx12 = getelementptr inbounds i32, i32* %a, i64 %2
+  store i32 %mul8, i32* %arrayidx12, align 4, !tbaa !2
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond, label %outer.inc, label %inner.body
+
+outer.inc:                                        ; preds = %inner.body, %outer.body
+  %indvars.iv.next36 = add nuw nsw i64 %indvars.iv35, 1
+  %exitcond39 = icmp eq i64 %indvars.iv.next36, %wide.trip.count38
+  br i1 %exitcond39, label %for.end15, label %outer.body, !llvm.loop !6
+
+for.end15:                                        ; preds = %outer.inc, %entry
+  ret void
+}
+
+; Case 2: Annotated outer loop WITHOUT vector width information must be collected.
+
+; CHECK-LABEL: case2
+; CHECK: LV: Loop hints: force=enabled width=0 unroll=0
+; CHECK: LV: We can vectorize this outer loop!
+; CHECK: LV: Using VF 1 to build VPlans.
+
+define void @case2(i32* nocapture %a, i32* nocapture readonly %b, i32 %N, i32 %M) local_unnamed_addr {
+entry:
+  %cmp32 = icmp sgt i32 %N, 0
+  br i1 %cmp32, label %outer.ph, label %for.end15
+
+outer.ph:                                          ; preds = %entry
+  %cmp230 = icmp sgt i32 %M, 0
+  %0 = sext i32 %M to i64
+  %wide.trip.count = zext i32 %M to i64
+  %wide.trip.count38 = zext i32 %N to i64
+  br label %outer.body
+
+outer.body:                                        ; preds = %outer.inc, %outer.ph
+  %indvars.iv35 = phi i64 [ 0, %outer.ph ], [ %indvars.iv.next36, %outer.inc ]
+  br i1 %cmp230, label %inner.ph, label %outer.inc
+
+inner.ph:                                  ; preds = %outer.body
+  %1 = mul nsw i64 %indvars.iv35, %0
+  br label %inner.body
+
+inner.body:                                        ; preds = %inner.body, %inner.ph
+  %indvars.iv = phi i64 [ 0, %inner.ph ], [ %indvars.iv.next, %inner.body ]
+  %2 = add nsw i64 %indvars.iv, %1
+  %arrayidx = getelementptr inbounds i32, i32* %b, i64 %2
+  %3 = load i32, i32* %arrayidx, align 4, !tbaa !2
+  %mul8 = mul nsw i32 %3, %3
+  %arrayidx12 = getelementptr inbounds i32, i32* %a, i64 %2
+  store i32 %mul8, i32* %arrayidx12, align 4, !tbaa !2
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond, label %outer.inc, label %inner.body
+
+outer.inc:                                        ; preds = %inner.body, %outer.body
+  %indvars.iv.next36 = add nuw nsw i64 %indvars.iv35, 1
+  %exitcond39 = icmp eq i64 %indvars.iv.next36, %wide.trip.count38
+  br i1 %exitcond39, label %for.end15, label %outer.body, !llvm.loop !9
+
+for.end15:                                        ; preds = %outer.inc, %entry
+  ret void
+}
+
+; Case 3: An outer loop annotated WITH vector width and interleave
+; information must not be collected.
+
+; CHECK-LABEL: case3
+; CHECK-NOT: LV: Loop hints: force=enabled
+; CHECK-NOT: LV: We can vectorize this outer loop!
+; CHECK: LV: Loop hints: force=?
+; CHECK: LV: Found a loop: inner.body
+
+define void @case3(i32* nocapture %a, i32* nocapture readonly %b, i32 %N, i32 %M) local_unnamed_addr {
+entry:
+  %cmp32 = icmp sgt i32 %N, 0
+  br i1 %cmp32, label %outer.ph, label %for.end15
+
+outer.ph:                                         ; preds = %entry
+  %cmp230 = icmp sgt i32 %M, 0
+  %0 = sext i32 %M to i64
+  %wide.trip.count = zext i32 %M to i64
+  %wide.trip.count38 = zext i32 %N to i64
+  br label %outer.body
+
+outer.body:                                       ; preds = %outer.inc, %outer.ph
+  %indvars.iv35 = phi i64 [ 0, %outer.ph ], [ %indvars.iv.next36, %outer.inc ]
+  br i1 %cmp230, label %inner.ph, label %outer.inc
+
+inner.ph:                                         ; preds = %outer.body
+  %1 = mul nsw i64 %indvars.iv35, %0
+  br label %inner.body
+
+inner.body:                                       ; preds = %inner.body, %inner.ph
+  %indvars.iv = phi i64 [ 0, %inner.ph ], [ %indvars.iv.next, %inner.body ]
+  %2 = add nsw i64 %indvars.iv, %1
+  %arrayidx = getelementptr inbounds i32, i32* %b, i64 %2
+  %3 = load i32, i32* %arrayidx, align 4, !tbaa !2
+  %mul8 = mul nsw i32 %3, %3
+  %arrayidx12 = getelementptr inbounds i32, i32* %a, i64 %2
+  store i32 %mul8, i32* %arrayidx12, align 4, !tbaa !2
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond, label %outer.inc, label %inner.body
+
+outer.inc:                                        ; preds = %inner.body, %outer.body
+  %indvars.iv.next36 = add nuw nsw i64 %indvars.iv35, 1
+  %exitcond39 = icmp eq i64 %indvars.iv.next36, %wide.trip.count38
+  br i1 %exitcond39, label %for.end15, label %outer.body, !llvm.loop !11
+
+for.end15:                                        ; preds = %outer.inc, %entry
+  ret void
+}
+
+; Case 4: An outer loop without any explicit vectorization annotation must
+; not be collected.
+
+; CHECK-LABEL: case4
+; CHECK-NOT: LV: Loop hints: force=enabled
+; CHECK-NOT: LV: We can vectorize this outer loop!
+; CHECK: LV: Loop hints: force=?
+; CHECK: LV: Found a loop: inner.body
+
+define void @case4(i32* nocapture %a, i32* nocapture readonly %b, i32 %N, i32 %M) local_unnamed_addr {
+entry:
+  %cmp32 = icmp sgt i32 %N, 0
+  br i1 %cmp32, label %outer.ph, label %for.end15
+
+outer.ph:                                         ; preds = %entry
+  %cmp230 = icmp sgt i32 %M, 0
+  %0 = sext i32 %M to i64
+  %wide.trip.count = zext i32 %M to i64
+  %wide.trip.count38 = zext i32 %N to i64
+  br label %outer.body
+
+outer.body:                                       ; preds = %outer.inc, %outer.ph
+  %indvars.iv35 = phi i64 [ 0, %outer.ph ], [ %indvars.iv.next36, %outer.inc ]
+  br i1 %cmp230, label %inner.ph, label %outer.inc
+
+inner.ph:                                  ; preds = %outer.body
+  %1 = mul nsw i64 %indvars.iv35, %0
+  br label %inner.body
+
+inner.body:                                        ; preds = %inner.body, %inner.ph
+  %indvars.iv = phi i64 [ 0, %inner.ph ], [ %indvars.iv.next, %inner.body ]
+  %2 = add nsw i64 %indvars.iv, %1
+  %arrayidx = getelementptr inbounds i32, i32* %b, i64 %2
+  %3 = load i32, i32* %arrayidx, align 4, !tbaa !2
+  %mul8 = mul nsw i32 %3, %3
+  %arrayidx12 = getelementptr inbounds i32, i32* %a, i64 %2
+  store i32 %mul8, i32* %arrayidx12, align 4, !tbaa !2
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond, label %outer.inc, label %inner.body
+
+outer.inc:                                        ; preds = %inner.body, %outer.body
+  %indvars.iv.next36 = add nuw nsw i64 %indvars.iv35, 1
+  %exitcond39 = icmp eq i64 %indvars.iv.next36, %wide.trip.count38
+  br i1 %exitcond39, label %for.end15, label %outer.body
+
+for.end15:                                        ; preds = %outer.inc, %entry
+  ret void
+}
+
+!llvm.module.flags = !{!0}
+!llvm.ident = !{!1}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{!"clang version 6.0.0"}
+!2 = !{!3, !3, i64 0}
+!3 = !{!"int", !4, i64 0}
+!4 = !{!"omnipotent char", !5, i64 0}
+!5 = !{!"Simple C/C++ TBAA"}
+; Case 1
+!6 = distinct !{!6, !7, !8}
+!7 = !{!"llvm.loop.vectorize.width", i32 4}
+!8 = !{!"llvm.loop.vectorize.enable", i1 true}
+; Case 2
+!9 = distinct !{!9, !8}
+; Case 3
+!10 = !{!"llvm.loop.interleave.count", i32 2}
+!11 = distinct !{!11, !7, !10, !8}

Added: llvm/trunk/test/Transforms/LoopVectorize/explicit_outer_nonuniform_inner.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/explicit_outer_nonuniform_inner.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/explicit_outer_nonuniform_inner.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/explicit_outer_nonuniform_inner.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,177 @@
+; RUN: opt < %s -loop-vectorize -enable-vplan-native-path -pass-remarks-analysis=loop-vectorize -debug-only=loop-vectorize -S 2>&1 | FileCheck %s
+; REQUIRES: asserts
+
+; Verify that LV bails out on explicit vectorization outer loops that contain
+; divergent inner loops.
+
+; Root C/C++ source code for all the test cases
+; void foo(int *a, int *b, int N, int M)
+; {
+;   int i, j;
+; #pragma clang loop vectorize(enable) vectorize_width(8)
+;   for (i = 0; i < N; i++) {
+;     // Tested inner loop. It will be replaced per test.
+;     for (j = 0; j < M; j++) {
+;       a[i*M+j] = b[i*M+j] * b[i*M+j];
+;     }
+;   }
+; }
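+; An inner loop is divergent when its bounds, start or step depend on the
+; outer induction variable, so different outer-loop vector lanes would run
+; different numbers of inner iterations.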
+
+; Case 1 (for (j = i; j < M; j++)): Inner loop with divergent IV start.
+
+; CHECK-LABEL: iv_start
+; CHECK: LV: Not vectorizing: Outer loop contains divergent loops.
+; CHECK: LV: Not vectorizing: Unsupported outer loop.
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @iv_start(i32* nocapture %a, i32* nocapture readonly %b, i32 %N, i32 %M) local_unnamed_addr {
+entry:
+  %cmp33 = icmp sgt i32 %N, 0
+  br i1 %cmp33, label %outer.ph, label %for.end15
+
+outer.ph:                                   ; preds = %entry
+  %0 = sext i32 %M to i64
+  %wide.trip.count = zext i32 %M to i64
+  %wide.trip.count41 = zext i32 %N to i64
+  br label %outer.body
+
+outer.body:                                 ; preds = %outer.inc, %outer.ph
+  %indvars.iv38 = phi i64 [ 0, %outer.ph ], [ %indvars.iv.next39, %outer.inc ]
+  %cmp231 = icmp slt i64 %indvars.iv38, %0
+  br i1 %cmp231, label %inner.ph, label %outer.inc
+
+inner.ph:                                   ; preds = %outer.body
+  %1 = mul nsw i64 %indvars.iv38, %0
+  br label %inner.body
+
+inner.body:                                 ; preds = %inner.body, %inner.ph
+  %indvars.iv35 = phi i64 [ %indvars.iv38, %inner.ph ], [ %indvars.iv.next36, %inner.body ]
+  %2 = add nsw i64 %indvars.iv35, %1
+  %arrayidx = getelementptr inbounds i32, i32* %b, i64 %2
+  %3 = load i32, i32* %arrayidx, align 4, !tbaa !2
+  %mul8 = mul nsw i32 %3, %3
+  %arrayidx12 = getelementptr inbounds i32, i32* %a, i64 %2
+  store i32 %mul8, i32* %arrayidx12, align 4, !tbaa !2
+  %indvars.iv.next36 = add nuw nsw i64 %indvars.iv35, 1
+  %exitcond = icmp eq i64 %indvars.iv.next36, %wide.trip.count
+  br i1 %exitcond, label %outer.inc, label %inner.body
+
+outer.inc:                                  ; preds = %inner.body, %outer.body
+  %indvars.iv.next39 = add nuw nsw i64 %indvars.iv38, 1
+  %exitcond42 = icmp eq i64 %indvars.iv.next39, %wide.trip.count41
+  br i1 %exitcond42, label %for.end15, label %outer.body, !llvm.loop !6
+
+for.end15:                                  ; preds = %outer.inc, %entry
+  ret void
+}
+
+
+; Case 2 (for (j = 0; j < i; j++)): Inner loop with divergent upper-bound.
+
+; CHECK-LABEL: loop_ub
+; CHECK: LV: Not vectorizing: Outer loop contains divergent loops.
+; CHECK: LV: Not vectorizing: Unsupported outer loop.
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @loop_ub(i32* nocapture %a, i32* nocapture readonly %b, i32 %N, i32 %M) local_unnamed_addr {
+entry:
+  %cmp32 = icmp sgt i32 %N, 0
+  br i1 %cmp32, label %outer.ph, label %for.end15
+
+outer.ph:                                   ; preds = %entry
+  %0 = sext i32 %M to i64
+  %wide.trip.count41 = zext i32 %N to i64
+  br label %outer.body
+
+outer.body:                                 ; preds = %outer.inc, %outer.ph
+  %indvars.iv38 = phi i64 [ 0, %outer.ph ], [ %indvars.iv.next39, %outer.inc ]
+  %cmp230 = icmp eq i64 %indvars.iv38, 0
+  br i1 %cmp230, label %outer.inc, label %inner.ph
+
+inner.ph:                                   ; preds = %outer.body
+  %1 = mul nsw i64 %indvars.iv38, %0
+  br label %inner.body
+
+inner.body:                                 ; preds = %inner.body, %inner.ph
+  %indvars.iv = phi i64 [ 0, %inner.ph ], [ %indvars.iv.next, %inner.body ]
+  %2 = add nsw i64 %indvars.iv, %1
+  %arrayidx = getelementptr inbounds i32, i32* %b, i64 %2
+  %3 = load i32, i32* %arrayidx, align 4, !tbaa !2
+  %mul8 = mul nsw i32 %3, %3
+  %arrayidx12 = getelementptr inbounds i32, i32* %a, i64 %2
+  store i32 %mul8, i32* %arrayidx12, align 4, !tbaa !2
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %indvars.iv38
+  br i1 %exitcond, label %outer.inc, label %inner.body
+
+outer.inc:                                  ; preds = %inner.body, %outer.body
+  %indvars.iv.next39 = add nuw nsw i64 %indvars.iv38, 1
+  %exitcond42 = icmp eq i64 %indvars.iv.next39, %wide.trip.count41
+  br i1 %exitcond42, label %for.end15, label %outer.body, !llvm.loop !6
+
+for.end15:                                  ; preds = %outer.inc, %entry
+  ret void
+}
+
+; Case 3 (for (j = 0; j < M; j+=i)): Inner loop with divergent step.
+
+; CHECK-LABEL: iv_step
+; CHECK: LV: Not vectorizing: Outer loop contains divergent loops.
+; CHECK: LV: Not vectorizing: Unsupported outer loop.
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @iv_step(i32* nocapture %a, i32* nocapture readonly %b, i32 %N, i32 %M) local_unnamed_addr {
+entry:
+  %cmp33 = icmp sgt i32 %N, 0
+  br i1 %cmp33, label %outer.ph, label %for.end15
+
+outer.ph:                                   ; preds = %entry
+  %cmp231 = icmp sgt i32 %M, 0
+  %0 = sext i32 %M to i64
+  %wide.trip.count = zext i32 %N to i64
+  br label %outer.body
+
+outer.body:                                 ; preds = %for.inc14, %outer.ph
+  %indvars.iv39 = phi i64 [ 0, %outer.ph ], [ %indvars.iv.next40, %for.inc14 ]
+  br i1 %cmp231, label %inner.ph, label %for.inc14
+
+inner.ph:                                   ; preds = %outer.body
+  %1 = mul nsw i64 %indvars.iv39, %0
+  br label %inner.body
+
+inner.body:                                 ; preds = %inner.ph, %inner.body
+  %indvars.iv36 = phi i64 [ 0, %inner.ph ], [ %indvars.iv.next37, %inner.body ]
+  %2 = add nsw i64 %indvars.iv36, %1
+  %arrayidx = getelementptr inbounds i32, i32* %b, i64 %2
+  %3 = load i32, i32* %arrayidx, align 4, !tbaa !2
+  %mul8 = mul nsw i32 %3, %3
+  %arrayidx12 = getelementptr inbounds i32, i32* %a, i64 %2
+  store i32 %mul8, i32* %arrayidx12, align 4, !tbaa !2
+  %indvars.iv.next37 = add nuw nsw i64 %indvars.iv36, %indvars.iv39
+  %cmp2 = icmp slt i64 %indvars.iv.next37, %0
+  br i1 %cmp2, label %inner.body, label %for.inc14
+
+for.inc14:                                 ; preds = %inner.body, %outer.body
+  %indvars.iv.next40 = add nuw nsw i64 %indvars.iv39, 1
+  %exitcond = icmp eq i64 %indvars.iv.next40, %wide.trip.count
+  br i1 %exitcond, label %for.end15, label %outer.body, !llvm.loop !6
+
+for.end15:                                 ; preds = %for.inc14, %entry
+  ret void
+}
+
+!llvm.module.flags = !{!0}
+!llvm.ident = !{!1}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{!"clang version 6.0.0"}
+!2 = !{!3, !3, i64 0}
+!3 = !{!"int", !4, i64 0}
+!4 = !{!"omnipotent char", !5, i64 0}
+!5 = !{!"Simple C/C++ TBAA"}
+!6 = distinct !{!6, !7, !8}
+!7 = !{!"llvm.loop.vectorize.width", i32 8}
+!8 = !{!"llvm.loop.vectorize.enable", i1 true}

Added: llvm/trunk/test/Transforms/LoopVectorize/explicit_outer_uniform_diverg_branch.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/explicit_outer_uniform_diverg_branch.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/explicit_outer_uniform_diverg_branch.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/explicit_outer_uniform_diverg_branch.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,138 @@
+; RUN: opt < %s -loop-vectorize -enable-vplan-native-path -debug-only=loop-vectorize -S 2>&1 | FileCheck %s
+; REQUIRES: asserts
+
+; Verify that LV can handle explicit vectorization outer loops with uniform branches
+; but bails out on outer loops with divergent branches.
+
+; Root C/C++ source code for the test cases
+; void foo(int *a, int *b, int N, int M)
+; {
+;   int i, j;
+; #pragma clang loop vectorize(enable) vectorize_width(8)
+;   for (i = 0; i < N; i++) {
+;     // Tested conditional branch. COND will be replaced per test.
+;     if (COND)
+;       for (j = 0; j < M; j++) {
+;         a[i*M+j] = b[i*M+j] * b[i*M+j];
+;       }
+;   }
+; }
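+; A branch is uniform when its condition is invariant across outer-loop
+; iterations (here, M != N computed from loop-invariant values); it is
+; divergent when the condition depends on per-iteration data such as the
+; load of b[i*M].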
+
+; Case 1 (COND => M == N): Outer loop with uniform conditional branch.
+
+; CHECK-LABEL: uniform_branch
+; CHECK: LV: We can vectorize this outer loop!
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @uniform_branch(i32* nocapture %a, i32* nocapture readonly %b, i32 %N, i32 %M) local_unnamed_addr {
+entry:
+  %cmp39 = icmp sgt i32 %N, 0
+  br i1 %cmp39, label %outer.ph, label %for.end19
+
+outer.ph:                                   ; preds = %entry
+  %cmp337 = icmp slt i32 %M, 1
+  %0 = sext i32 %M to i64
+  %N64 = zext i32 %N to i64
+  %M64 = zext i32 %M to i64
+  %cmp1 = icmp ne i32 %M, %N ; Uniform condition
+  %brmerge = or i1 %cmp1, %cmp337 ; Uniform condition
+  br label %outer.body
+
+outer.body:                                 ; preds = %outer.inc, %outer.ph
+  %indvars.iv42 = phi i64 [ 0, %outer.ph ], [ %indvars.iv.next43, %outer.inc ]
+  %1 = mul nsw i64 %indvars.iv42, %0
+  %arrayidx = getelementptr inbounds i32, i32* %b, i64 %1
+  %2 = load i32, i32* %arrayidx, align 4, !tbaa !2
+  br i1 %brmerge, label %outer.inc, label %inner.ph ; Supported uniform branch
+
+inner.ph:                                   ; preds = %outer.body
+  br label %inner.body
+
+inner.body:                                 ; preds = %inner.ph, %inner.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %inner.body ], [ 0, %inner.ph ]
+  %3 = add nsw i64 %indvars.iv, %1
+  %arrayidx7 = getelementptr inbounds i32, i32* %b, i64 %3
+  %4 = load i32, i32* %arrayidx7, align 4, !tbaa !2
+  %mul12 = mul nsw i32 %4, %4
+  %arrayidx16 = getelementptr inbounds i32, i32* %a, i64 %3
+  store i32 %mul12, i32* %arrayidx16, align 4, !tbaa !2
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %M64
+  br i1 %exitcond, label %outer.inc, label %inner.body
+
+outer.inc:                                  ; preds = %inner.body, %outer.body
+  %indvars.iv.next43 = add nuw nsw i64 %indvars.iv42, 1
+  %exitcond46 = icmp eq i64 %indvars.iv.next43, %N64
+  br i1 %exitcond46, label %for.end19, label %outer.body, !llvm.loop !6
+
+for.end19:                                  ; preds = %outer.inc, %entry
+  ret void
+}
+
+
+; Case 2 (COND => B[i * M] == 0): Outer loop with divergent conditional branch.
+
+; CHECK-LABEL: divergent_branch
+; CHECK: Unsupported conditional branch.
+; CHECK: LV: Not vectorizing: Unsupported outer loop.
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @divergent_branch(i32* nocapture %a, i32* nocapture readonly %b, i32 %N, i32 %M) local_unnamed_addr {
+entry:
+  %cmp39 = icmp sgt i32 %N, 0
+  br i1 %cmp39, label %outer.ph, label %for.end19
+
+outer.ph:                                   ; preds = %entry
+  %cmp337 = icmp slt i32 %M, 1
+  %0 = sext i32 %M to i64
+  %N64 = zext i32 %N to i64
+  %M64 = zext i32 %M to i64
+  br label %outer.body
+
+outer.body:                                 ; preds = %outer.inc, %outer.ph
+  %indvars.iv42 = phi i64 [ 0, %outer.ph ], [ %indvars.iv.next43, %outer.inc ]
+  %1 = mul nsw i64 %indvars.iv42, %0
+  %arrayidx = getelementptr inbounds i32, i32* %b, i64 %1
+  %2 = load i32, i32* %arrayidx, align 4, !tbaa !2
+  %cmp1 = icmp ne i32 %2, 0 ; Divergent condition
+  %brmerge = or i1 %cmp1, %cmp337 ; Divergent condition
+  br i1 %brmerge, label %outer.inc, label %inner.ph ; Unsupported divergent branch.
+
+inner.ph:                                   ; preds = %outer.body
+  br label %inner.body
+
+inner.body:                                 ; preds = %inner.ph, %inner.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %inner.body ], [ 0, %inner.ph ]
+  %3 = add nsw i64 %indvars.iv, %1
+  %arrayidx7 = getelementptr inbounds i32, i32* %b, i64 %3
+  %4 = load i32, i32* %arrayidx7, align 4, !tbaa !2
+  %mul12 = mul nsw i32 %4, %4
+  %arrayidx16 = getelementptr inbounds i32, i32* %a, i64 %3
+  store i32 %mul12, i32* %arrayidx16, align 4, !tbaa !2
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %M64
+  br i1 %exitcond, label %outer.inc, label %inner.body
+
+outer.inc:                                  ; preds = %inner.body, %outer.body
+  %indvars.iv.next43 = add nuw nsw i64 %indvars.iv42, 1
+  %exitcond46 = icmp eq i64 %indvars.iv.next43, %N64
+  br i1 %exitcond46, label %for.end19, label %outer.body, !llvm.loop !6
+
+for.end19:                                  ; preds = %outer.inc, %entry
+  ret void
+}
+
+!llvm.module.flags = !{!0}
+!llvm.ident = !{!1}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{!"clang version 6.0.0"}
+!2 = !{!3, !3, i64 0}
+!3 = !{!"int", !4, i64 0}
+!4 = !{!"omnipotent char", !5, i64 0}
+!5 = !{!"Simple C/C++ TBAA"}
+!6 = distinct !{!6, !7, !8}
+!7 = !{!"llvm.loop.vectorize.width", i32 8}
+!8 = !{!"llvm.loop.vectorize.enable", i1 true}

Added: llvm/trunk/test/Transforms/LoopVectorize/fcmp-vectorize.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/fcmp-vectorize.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/fcmp-vectorize.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/fcmp-vectorize.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,25 @@
+; RUN: opt -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -S %s | FileCheck %s
+
+; Avoid crashing while trying to vectorize an fcmp that can be folded to a
+; constant vector of i1 values.
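+;
+; A rough C sketch (illustrative; the compare result is unused):
+;
+; void test1(void) {
+;   for (int i = 0; i <= 143; i++)
+;     (void)__builtin_isunordered(0.0f, 0.0f);
+; }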
+define void @test1() {
+; CHECK-LABEL: test1(
+; CHECK-LABEL: vector.body:
+; CHECK-NEXT:    %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; CHECK-NEXT:    %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
+; CHECK:         %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
+; CHECK:         %index.next = add i32 %index, 4
+
+entry:
+  br label %loop
+
+loop:                                              ; preds = %loop, %entry
+  %iv = phi i32 [ 0, %entry ], [ %ivnext, %loop ]
+  %fcmp = fcmp uno float 0.000000e+00, 0.000000e+00
+  %ivnext = add nsw i32 %iv, 1
+  %cnd = icmp sgt i32 %iv, 142
+  br i1 %cnd, label %exit, label %loop
+
+exit:                                              ; preds = %loop
+  ret void
+}

Added: llvm/trunk/test/Transforms/LoopVectorize/first-order-recurrence.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/first-order-recurrence.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/first-order-recurrence.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/first-order-recurrence.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,574 @@
+; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -dce -instcombine -S | FileCheck %s
+; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-interleave=2 -dce -instcombine -S | FileCheck %s --check-prefix=UNROLL
+; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-interleave=2 -S | FileCheck %s --check-prefix=UNROLL-NO-IC
+; RUN: opt < %s -loop-vectorize -force-vector-width=1 -force-vector-interleave=2 -S | FileCheck %s --check-prefix=UNROLL-NO-VF
+; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -S | FileCheck %s --check-prefix=SINK-AFTER
+; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -S | FileCheck %s --check-prefix=NO-SINK-AFTER
+
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+
+; void recurrence_1(int *a, int *b, int n) {
+;   for(int i = 0; i < n; i++)
+;     b[i] =  a[i] + a[i - 1]
+; }
+;
+; CHECK-LABEL: @recurrence_1(
+; CHECK:       vector.ph:
+; CHECK:         %vector.recur.init = insertelement <4 x i32> undef, i32 %pre_load, i32 3
+; CHECK:       vector.body:
+; CHECK:         %vector.recur = phi <4 x i32> [ %vector.recur.init, %vector.ph ], [ [[L1:%[a-zA-Z0-9.]+]], %vector.body ]
+; CHECK:         [[L1]] = load <4 x i32>
+; CHECK:         {{.*}} = shufflevector <4 x i32> %vector.recur, <4 x i32> [[L1]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; CHECK:       middle.block:
+; CHECK:         %vector.recur.extract = extractelement <4 x i32> [[L1]], i32 3
+; CHECK:       scalar.ph:
+; CHECK:         %scalar.recur.init = phi i32 [ %vector.recur.extract, %middle.block ], [ %pre_load, %vector.memcheck ], [ %pre_load, %for.preheader ]
+; CHECK:       scalar.body:
+; CHECK:         %scalar.recur = phi i32 [ %scalar.recur.init, %scalar.ph ], [ {{.*}}, %scalar.body ]
+;
+; UNROLL-LABEL: @recurrence_1(
+; UNROLL:       vector.body:
+; UNROLL:         %vector.recur = phi <4 x i32> [ %vector.recur.init, %vector.ph ], [ [[L2:%[a-zA-Z0-9.]+]], %vector.body ]
+; UNROLL:         [[L1:%[a-zA-Z0-9.]+]] = load <4 x i32>
+; UNROLL:         [[L2]] = load <4 x i32>
+; UNROLL:         {{.*}} = shufflevector <4 x i32> %vector.recur, <4 x i32> [[L1]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; UNROLL:         {{.*}} = shufflevector <4 x i32> [[L1]], <4 x i32> [[L2]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; UNROLL:       middle.block:
+; UNROLL:         %vector.recur.extract = extractelement <4 x i32> [[L2]], i32 3
+;
+define void @recurrence_1(i32* nocapture readonly %a, i32* nocapture %b, i32 %n) {
+entry:
+  br label %for.preheader
+
+for.preheader:
+  %arrayidx.phi.trans.insert = getelementptr inbounds i32, i32* %a, i64 0
+  %pre_load = load i32, i32* %arrayidx.phi.trans.insert
+  br label %scalar.body
+
+scalar.body:
+  %0 = phi i32 [ %pre_load, %for.preheader ], [ %1, %scalar.body ]
+  %indvars.iv = phi i64 [ 0, %for.preheader ], [ %indvars.iv.next, %scalar.body ]
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %arrayidx32 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv.next
+  %1 = load i32, i32* %arrayidx32
+  %arrayidx34 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv
+  %add35 = add i32 %1, %0
+  store i32 %add35, i32* %arrayidx34
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.exit, label %scalar.body
+
+for.exit:
+  ret void
+}
+
+; int recurrence_2(int *a, int n) {
+;   int minmax;
+;   for (int i = 0; i < n; ++i)
+;     minmax = min(minmax, max(a[i] - a[i-1], 0));
+;   return minmax;
+; }
+;
+; CHECK-LABEL: @recurrence_2(
+; CHECK:       vector.ph:
+; CHECK:         %vector.recur.init = insertelement <4 x i32> undef, i32 %.pre, i32 3
+; CHECK:       vector.body:
+; CHECK:         %vector.recur = phi <4 x i32> [ %vector.recur.init, %vector.ph ], [ [[L1:%[a-zA-Z0-9.]+]], %vector.body ]
+; CHECK:         [[L1]] = load <4 x i32>
+; CHECK:         {{.*}} = shufflevector <4 x i32> %vector.recur, <4 x i32> [[L1]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; CHECK:       middle.block:
+; CHECK:         %vector.recur.extract = extractelement <4 x i32> [[L1]], i32 3
+; CHECK:       scalar.ph:
+; CHECK:         %scalar.recur.init = phi i32 [ %vector.recur.extract, %middle.block ], [ %.pre, %for.preheader ]
+; CHECK:       scalar.body:
+; CHECK:         %scalar.recur = phi i32 [ %scalar.recur.init, %scalar.ph ], [ {{.*}}, %scalar.body ]
+;
+; UNROLL-LABEL: @recurrence_2(
+; UNROLL:       vector.body:
+; UNROLL:         %vector.recur = phi <4 x i32> [ %vector.recur.init, %vector.ph ], [ [[L2:%[a-zA-Z0-9.]+]], %vector.body ]
+; UNROLL:         [[L1:%[a-zA-Z0-9.]+]] = load <4 x i32>
+; UNROLL:         [[L2]] = load <4 x i32>
+; UNROLL:         {{.*}} = shufflevector <4 x i32> %vector.recur, <4 x i32> [[L1]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; UNROLL:         {{.*}} = shufflevector <4 x i32> [[L1]], <4 x i32> [[L2]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; UNROLL:       middle.block:
+; UNROLL:         %vector.recur.extract = extractelement <4 x i32> [[L2]], i32 3
+;
+define i32 @recurrence_2(i32* nocapture readonly %a, i32 %n) {
+entry:
+  %cmp27 = icmp sgt i32 %n, 0
+  br i1 %cmp27, label %for.preheader, label %for.cond.cleanup
+
+for.preheader:
+  %arrayidx2.phi.trans.insert = getelementptr inbounds i32, i32* %a, i64 -1
+  %.pre = load i32, i32* %arrayidx2.phi.trans.insert, align 4
+  br label %scalar.body
+
+for.cond.cleanup.loopexit:
+  %minmax.0.cond.lcssa = phi i32 [ %minmax.0.cond, %scalar.body ]
+  br label %for.cond.cleanup
+
+for.cond.cleanup:
+  %minmax.0.lcssa = phi i32 [ undef, %entry ], [ %minmax.0.cond.lcssa, %for.cond.cleanup.loopexit ]
+  ret i32 %minmax.0.lcssa
+
+scalar.body:
+  %0 = phi i32 [ %.pre, %for.preheader ], [ %1, %scalar.body ]
+  %indvars.iv = phi i64 [ 0, %for.preheader ], [ %indvars.iv.next, %scalar.body ]
+  %minmax.028 = phi i32 [ undef, %for.preheader ], [ %minmax.0.cond, %scalar.body ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+  %1 = load i32, i32* %arrayidx, align 4
+  %sub3 = sub nsw i32 %1, %0
+  %cmp4 = icmp sgt i32 %sub3, 0
+  %cond = select i1 %cmp4, i32 %sub3, i32 0
+  %cmp5 = icmp slt i32 %minmax.028, %cond
+  %minmax.0.cond = select i1 %cmp5, i32 %minmax.028, i32 %cond
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.cond.cleanup.loopexit, label %scalar.body
+}
+
+; void recurrence_3(short *a, double *b, int n, float f, short p) {
+;   b[0] = (double)a[0] - f * (double)p;
+;   for (int i = 1; i < n; i++)
+;     b[i] = (double)a[i] - f * (double)a[i - 1];
+; }
+;
+; CHECK-LABEL: @recurrence_3(
+; CHECK:       vector.ph:
+; CHECK:         %vector.recur.init = insertelement <4 x i16> undef, i16 %0, i32 3
+; CHECK:       vector.body:
+; CHECK:         %vector.recur = phi <4 x i16> [ %vector.recur.init, %vector.ph ], [ [[L1:%[a-zA-Z0-9.]+]], %vector.body ]
+; CHECK:         [[L1]] = load <4 x i16>
+; CHECK:         [[SHUF:%[a-zA-Z0-9.]+]] = shufflevector <4 x i16> %vector.recur, <4 x i16> [[L1]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; Check also that the casts were not moved needlessly.
+; CHECK:         sitofp <4 x i16> [[L1]] to <4 x double>
+; CHECK:         sitofp <4 x i16> [[SHUF]] to <4 x double> 
+; CHECK:       middle.block:
+; CHECK:         %vector.recur.extract = extractelement <4 x i16> [[L1]], i32 3
+; CHECK:       scalar.ph:
+; CHECK:         %scalar.recur.init = phi i16 [ %vector.recur.extract, %middle.block ], [ %0, %vector.memcheck ], [ %0, %for.preheader ]
+; CHECK:       scalar.body:
+; CHECK:         %scalar.recur = phi i16 [ %scalar.recur.init, %scalar.ph ], [ {{.*}}, %scalar.body ]
+;
+; UNROLL-LABEL: @recurrence_3(
+; UNROLL:       vector.body:
+; UNROLL:         %vector.recur = phi <4 x i16> [ %vector.recur.init, %vector.ph ], [ [[L2:%[a-zA-Z0-9.]+]], %vector.body ]
+; UNROLL:         [[L1:%[a-zA-Z0-9.]+]] = load <4 x i16>
+; UNROLL:         [[L2]] = load <4 x i16>
+; UNROLL:         {{.*}} = shufflevector <4 x i16> %vector.recur, <4 x i16> [[L1]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; UNROLL:         {{.*}} = shufflevector <4 x i16> [[L1]], <4 x i16> [[L2]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; UNROLL:       middle.block:
+; UNROLL:         %vector.recur.extract = extractelement <4 x i16> [[L2]], i32 3
+;
+define void @recurrence_3(i16* nocapture readonly %a, double* nocapture %b, i32 %n, float %f, i16 %p) {
+entry:
+  %0 = load i16, i16* %a, align 2
+  %conv = sitofp i16 %0 to double
+  %conv1 = fpext float %f to double
+  %conv2 = sitofp i16 %p to double
+  %mul = fmul fast double %conv2, %conv1
+  %sub = fsub fast double %conv, %mul
+  store double %sub, double* %b, align 8
+  %cmp25 = icmp sgt i32 %n, 1
+  br i1 %cmp25, label %for.preheader, label %for.end
+
+for.preheader:
+  br label %scalar.body
+
+scalar.body:
+  %1 = phi i16 [ %0, %for.preheader ], [ %2, %scalar.body ]
+  %advars.iv = phi i64 [ %advars.iv.next, %scalar.body ], [ 1, %for.preheader ]
+  %arrayidx5 = getelementptr inbounds i16, i16* %a, i64 %advars.iv
+  %2 = load i16, i16* %arrayidx5, align 2
+  %conv6 = sitofp i16 %2 to double
+  %conv11 = sitofp i16 %1 to double
+  %mul12 = fmul fast double %conv11, %conv1
+  %sub13 = fsub fast double %conv6, %mul12
+  %arrayidx15 = getelementptr inbounds double, double* %b, i64 %advars.iv
+  store double %sub13, double* %arrayidx15, align 8
+  %advars.iv.next = add nuw nsw i64 %advars.iv, 1
+  %lftr.wideiv = trunc i64 %advars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end.loopexit, label %scalar.body
+
+for.end.loopexit:
+  br label %for.end
+
+for.end:
+  ret void
+}
+
+; void PR26734(short *a, int *b, int *c, int d, short *e) {
+;   for (; d != 21; d++) {
+;     *b &= *c;
+;     *e = *a - 6;
+;     *c = *e;
+;   }
+; }
+;
+; CHECK-LABEL: @PR26734(
+; CHECK-NOT:   vector.ph:
+; CHECK:       }
+;
+define void @PR26734(i16* %a, i32* %b, i32* %c, i32 %d, i16* %e) {
+entry:
+  %cmp4 = icmp eq i32 %d, 21
+  br i1 %cmp4, label %entry.for.end_crit_edge, label %for.body.lr.ph
+
+entry.for.end_crit_edge:
+  %.pre = load i32, i32* %b, align 4
+  br label %for.end
+
+for.body.lr.ph:
+  %0 = load i16, i16* %a, align 2
+  %sub = add i16 %0, -6
+  %conv2 = sext i16 %sub to i32
+  %c.promoted = load i32, i32* %c, align 4
+  %b.promoted = load i32, i32* %b, align 4
+  br label %for.body
+
+for.body:
+  %inc7 = phi i32 [ %d, %for.body.lr.ph ], [ %inc, %for.body ]
+  %and6 = phi i32 [ %b.promoted, %for.body.lr.ph ], [ %and, %for.body ]
+  %conv25 = phi i32 [ %c.promoted, %for.body.lr.ph ], [ %conv2, %for.body ]
+  %and = and i32 %and6, %conv25
+  %inc = add nsw i32 %inc7, 1
+  %cmp = icmp eq i32 %inc, 21
+  br i1 %cmp, label %for.cond.for.end_crit_edge, label %for.body
+
+for.cond.for.end_crit_edge:
+  %and.lcssa = phi i32 [ %and, %for.body ]
+  store i32 %conv2, i32* %c, align 4
+  store i32 %and.lcssa, i32* %b, align 4
+  store i16 %sub, i16* %e, align 2
+  br label %for.end
+
+for.end:
+  ret void
+}
+
+; int PR27246() {
+;   unsigned int e, n;
+;   for (int i = 1; i < 49; ++i) {
+;     for (int k = i; k > 1; --k)
+;       e = k;
+;     n = e;
+;   }
+;   return n;
+; }
+;
+; CHECK-LABEL: @PR27246(
+; CHECK-NOT:   vector.ph:
+; CHECK:       }
+;
+define i32 @PR27246() {
+entry:
+  br label %for.cond1.preheader
+
+for.cond1.preheader:
+  %i.016 = phi i32 [ 1, %entry ], [ %inc, %for.cond.cleanup3 ]
+  %e.015 = phi i32 [ undef, %entry ], [ %e.1.lcssa, %for.cond.cleanup3 ]
+  br label %for.cond1
+
+for.cond.cleanup:
+  %e.1.lcssa.lcssa = phi i32 [ %e.1.lcssa, %for.cond.cleanup3 ]
+  ret i32 %e.1.lcssa.lcssa
+
+for.cond1:
+  %e.1 = phi i32 [ %k.0, %for.cond1 ], [ %e.015, %for.cond1.preheader ]
+  %k.0 = phi i32 [ %dec, %for.cond1 ], [ %i.016, %for.cond1.preheader ]
+  %cmp2 = icmp sgt i32 %k.0, 1
+  %dec = add nsw i32 %k.0, -1
+  br i1 %cmp2, label %for.cond1, label %for.cond.cleanup3
+
+for.cond.cleanup3:
+  %e.1.lcssa = phi i32 [ %e.1, %for.cond1 ]
+  %inc = add nuw nsw i32 %i.016, 1
+  %exitcond = icmp eq i32 %inc, 49
+  br i1 %exitcond, label %for.cond.cleanup, label %for.cond1.preheader
+}
+
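+; A rough C sketch (illustrative; the recurrence value itself is otherwise
+; unused, which is what PR30183 exercises):
+;
+; void PR30183(int pre_load, int *a, int *b, long n) {
+;   int t = pre_load;
+;   for (long i = 0; i + 2 != n; i += 2)
+;     t = a[i + 2];   // first-order recurrence fed by a strided load
+; }
+;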
+; UNROLL-NO-IC-LABEL: @PR30183(
+; UNROLL-NO-IC:       vector.ph:
+; UNROLL-NO-IC:         [[VECTOR_RECUR_INIT:%.*]] = insertelement <4 x i32> undef, i32 [[PRE_LOAD:%.*]], i32 3
+; UNROLL-NO-IC-NEXT:    br label %vector.body
+; UNROLL-NO-IC:       vector.body:
+; UNROLL-NO-IC-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
+; UNROLL-NO-IC-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ [[VECTOR_RECUR_INIT]], %vector.ph ], [ [[TMP42:%.*]], %vector.body ]
+; UNROLL-NO-IC:         [[TMP27:%.*]] = load i32, i32* {{.*}}
+; UNROLL-NO-IC-NEXT:    [[TMP28:%.*]] = load i32, i32* {{.*}}
+; UNROLL-NO-IC-NEXT:    [[TMP29:%.*]] = load i32, i32* {{.*}}
+; UNROLL-NO-IC-NEXT:    [[TMP30:%.*]] = load i32, i32* {{.*}}
+; UNROLL-NO-IC-NEXT:    [[TMP35:%.*]] = insertelement <4 x i32> undef, i32 [[TMP27]], i32 0
+; UNROLL-NO-IC-NEXT:    [[TMP36:%.*]] = insertelement <4 x i32> [[TMP35]], i32 [[TMP28]], i32 1
+; UNROLL-NO-IC-NEXT:    [[TMP37:%.*]] = insertelement <4 x i32> [[TMP36]], i32 [[TMP29]], i32 2
+; UNROLL-NO-IC-NEXT:    [[TMP38:%.*]] = insertelement <4 x i32> [[TMP37]], i32 [[TMP30]], i32 3
+; UNROLL-NO-IC-NEXT:    [[TMP31:%.*]] = load i32, i32* {{.*}}
+; UNROLL-NO-IC-NEXT:    [[TMP32:%.*]] = load i32, i32* {{.*}}
+; UNROLL-NO-IC-NEXT:    [[TMP33:%.*]] = load i32, i32* {{.*}}
+; UNROLL-NO-IC-NEXT:    [[TMP34:%.*]] = load i32, i32* {{.*}}
+; UNROLL-NO-IC-NEXT:    [[TMP39:%.*]] = insertelement <4 x i32> undef, i32 [[TMP31]], i32 0
+; UNROLL-NO-IC-NEXT:    [[TMP40:%.*]] = insertelement <4 x i32> [[TMP39]], i32 [[TMP32]], i32 1
+; UNROLL-NO-IC-NEXT:    [[TMP41:%.*]] = insertelement <4 x i32> [[TMP40]], i32 [[TMP33]], i32 2
+; UNROLL-NO-IC-NEXT:    [[TMP42]] = insertelement <4 x i32> [[TMP41]], i32 [[TMP34]], i32 3
+; UNROLL-NO-IC-NEXT:    [[TMP43:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[TMP38]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; UNROLL-NO-IC-NEXT:    [[TMP44:%.*]] = shufflevector <4 x i32> [[TMP38]], <4 x i32> [[TMP42]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; UNROLL-NO-IC-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 8
+; UNROLL-NO-IC:         br i1 {{.*}}, label %middle.block, label %vector.body
+;
+define void @PR30183(i32 %pre_load, i32* %a, i32* %b, i64 %n) {
+entry:
+  br label %scalar.body
+
+scalar.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %scalar.body ]
+  %tmp0 = phi i32 [ %pre_load, %entry ], [ %tmp2, %scalar.body ]
+  %i.next = add nuw nsw i64 %i, 2
+  %tmp1 = getelementptr inbounds i32, i32* %a, i64 %i.next
+  %tmp2 = load i32, i32* %tmp1
+  %cond = icmp eq i64 %i.next,%n
+  br i1 %cond, label %for.end, label %scalar.body
+
+for.end:
+  ret void
+}
+
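+; A rough C sketch (illustrative; the trip count is undef in the IR):
+;
+; void constant_folded_previous_value(long n) {
+;   long t = 0;
+;   for (long i = 0; i + 1 != n; i++)
+;     t = 0 + 1;   // the recurrence's previous value folds to a constant
+; }
+;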
+; UNROLL-NO-IC-LABEL: @constant_folded_previous_value(
+; UNROLL-NO-IC:       vector.body:
+; UNROLL-NO-IC:         [[VECTOR_RECUR:%.*]] = phi <4 x i64> [ <i64 undef, i64 undef, i64 undef, i64 0>, %vector.ph ], [ <i64 1, i64 1, i64 1, i64 1>, %vector.body ]
+; UNROLL-NO-IC-NEXT:    [[TMP0:%.*]] = shufflevector <4 x i64> [[VECTOR_RECUR]], <4 x i64> <i64 1, i64 1, i64 1, i64 1>, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; UNROLL-NO-IC:         br i1 {{.*}}, label %middle.block, label %vector.body
+;
+define void @constant_folded_previous_value() {
+entry:
+  br label %scalar.body
+
+scalar.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %scalar.body ]
+  %tmp2 = phi i64 [ 0, %entry ], [ %tmp3, %scalar.body ]
+  %tmp3 = add i64 0, 1
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp eq i64 %i.next, undef
+  br i1 %cond, label %for.end, label %scalar.body
+
+for.end:
+  ret void
+}
+
+; We vectorize this first-order recurrence by generating two extracts for the
+; phi `val.phi` - one at the last index and another at the second-last index.
+; We need these two extracts because the first-order recurrence phi is used
+; outside the loop, so we require the phi itself and not its update (addx).
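+;
+; A rough C sketch (illustrative):
+;
+; int extract_second_last_iteration(int *cval, int x) {
+;   int val = 0;
+;   for (int i = 0; ; i++) {
+;     int prev = val;     // the recurrence phi
+;     val = i + x;
+;     if (i == 95)
+;       return prev;      // needs the second-to-last update of val
+;   }
+; }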
+; UNROLL-NO-IC-LABEL: extract_second_last_iteration
+; UNROLL-NO-IC: vector.body
+; UNROLL-NO-IC:   %step.add = add <4 x i32> %vec.ind, <i32 4, i32 4, i32 4, i32 4>
+; UNROLL-NO-IC:   %[[L1:.+]] = add <4 x i32> %vec.ind, %broadcast.splat
+; UNROLL-NO-IC:   %[[L2:.+]] = add <4 x i32> %step.add, %broadcast.splat
+; UNROLL-NO-IC:   %index.next = add i32 %index, 8
+; UNROLL-NO-IC:   icmp eq i32 %index.next, 96
+; UNROLL-NO-IC: middle.block
+; UNROLL-NO-IC:   icmp eq i32 96, 96
+; UNROLL-NO-IC:   %vector.recur.extract = extractelement <4 x i32> %[[L2]], i32 3
+; UNROLL-NO-IC:   %vector.recur.extract.for.phi = extractelement <4 x i32> %[[L2]], i32 2
+; UNROLL-NO-IC: for.end
+; UNROLL-NO-IC:   %val.phi.lcssa = phi i32 [ %scalar.recur, %for.body ], [ %vector.recur.extract.for.phi, %middle.block ]
+; Check the case when unrolled but not vectorized.
+; UNROLL-NO-VF-LABEL: extract_second_last_iteration
+; UNROLL-NO-VF: vector.body:
+; UNROLL-NO-VF:   %induction = add i32 %index, 0
+; UNROLL-NO-VF:   %induction1 = add i32 %index, 1
+; UNROLL-NO-VF:   %[[L1:.+]] = add i32 %induction, %x
+; UNROLL-NO-VF:   %[[L2:.+]] = add i32 %induction1, %x
+; UNROLL-NO-VF:   %index.next = add i32 %index, 2
+; UNROLL-NO-VF:   icmp eq i32 %index.next, 96
+; UNROLL-NO-VF: for.end:
+; UNROLL-NO-VF:   %val.phi.lcssa = phi i32 [ %scalar.recur, %for.body ], [ %[[L1]], %middle.block ]
+define i32 @extract_second_last_iteration(i32* %cval, i32 %x)  {
+entry:
+  br label %for.body
+
+for.body:
+  %inc.phi = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %val.phi = phi i32 [ 0, %entry ], [ %addx, %for.body ]
+  %inc = add i32 %inc.phi, 1
+  %bc = zext i32 %inc.phi to i64
+  %addx = add i32 %inc.phi, %x
+  %cmp = icmp eq i32 %inc.phi, 95
+  br i1 %cmp, label %for.end, label %for.body
+
+for.end:
+  ret i32 %val.phi
+}
+
+; We vectorize this first-order recurrence with a set of insertelements for
+; each unrolled part. Make sure these insertelements are generated in order,
+; because the shuffle of the first-order recurrence will be added after the
+; insertelement of the last part (UF - 1), assuming the latter appears after
+; the insertelements of all other parts.
+;
+; int PR33613(double *b, double j, int d) {
+;   int a = 0;
+;   for(int i = 0; i < 10240; i++, b+=25) {
+;     double f = b[d]; // Scalarize to form insertelements
+;     if (j * f)
+;       a++;
+;     j = f;
+;   }
+;   return a;
+; }
+;
+; UNROLL-NO-IC-LABEL: @PR33613(
+; UNROLL-NO-IC:     vector.body:
+; UNROLL-NO-IC:       [[VECTOR_RECUR:%.*]] = phi <4 x double>
+; UNROLL-NO-IC:       shufflevector <4 x double> [[VECTOR_RECUR]], <4 x double> {{.*}}, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; UNROLL-NO-IC-NEXT:  shufflevector <4 x double> {{.*}}, <4 x double> {{.*}}, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; UNROLL-NO-IC-NOT:   insertelement <4 x double>
+; UNROLL-NO-IC:     middle.block:
+;
+define i32 @PR33613(double* %b, double %j, i32 %d) {
+entry:
+  %idxprom = sext i32 %d to i64
+  br label %for.body
+
+for.cond.cleanup:
+  %a.1.lcssa = phi i32 [ %a.1, %for.body ]
+  ret i32 %a.1.lcssa
+
+for.body:
+  %b.addr.012 = phi double* [ %b, %entry ], [ %add.ptr, %for.body ]
+  %i.011 = phi i32 [ 0, %entry ], [ %inc1, %for.body ]
+  %a.010 = phi i32 [ 0, %entry ], [ %a.1, %for.body ]
+  %j.addr.09 = phi double [ %j, %entry ], [ %0, %for.body ]
+  %arrayidx = getelementptr inbounds double, double* %b.addr.012, i64 %idxprom
+  %0 = load double, double* %arrayidx, align 8
+  %mul = fmul double %j.addr.09, %0
+  %tobool = fcmp une double %mul, 0.000000e+00
+  %inc = zext i1 %tobool to i32
+  %a.1 = add nsw i32 %a.010, %inc
+  %inc1 = add nuw nsw i32 %i.011, 1
+  %add.ptr = getelementptr inbounds double, double* %b.addr.012, i64 25
+  %exitcond = icmp eq i32 %inc1, 10240
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+; void sink_after(short *a, int n, int *b) {
+;   for(int i = 0; i < n; i++)
+;     b[i] = (a[i] * a[i + 1]);
+; }
+;
+; SINK-AFTER-LABEL: sink_after
+; Check that the sext sank after the load in the vector loop.
+; SINK-AFTER: vector.body
+; SINK-AFTER:   %vector.recur = phi <4 x i16> [ %vector.recur.init, %vector.ph ], [ %wide.load, %vector.body ]
+; SINK-AFTER:   %wide.load = load <4 x i16>
+; SINK-AFTER:   %[[VSHUF:.+]] = shufflevector <4 x i16> %vector.recur, <4 x i16> %wide.load, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; SINK-AFTER:   %[[VCONV:.+]] = sext <4 x i16> %[[VSHUF]] to <4 x i32>
+; SINK-AFTER:   %[[VCONV3:.+]] = sext <4 x i16> %wide.load to <4 x i32>
+; SINK-AFTER:   mul nsw <4 x i32> %[[VCONV3]], %[[VCONV]]
+;
+define void @sink_after(i16* %a, i32* %b, i64 %n) {
+entry:
+  %.pre = load i16, i16* %a
+  br label %for.body
+
+for.body:
+  %0 = phi i16 [ %.pre, %entry ], [ %1, %for.body ]
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %conv = sext i16 %0 to i32
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %arrayidx2 = getelementptr inbounds i16, i16* %a, i64 %indvars.iv.next
+  %1 = load i16, i16* %arrayidx2
+  %conv3 = sext i16 %1 to i32
+  %mul = mul nsw i32 %conv3, %conv
+  %arrayidx5 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv
+  store i32 %mul, i32* %arrayidx5
+  %exitcond = icmp eq i64 %indvars.iv.next, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+; PR34711: given three consecutive instructions such that the first will be
+; widened, the second is a cast that will be widened and needs to sink after the
+; third, and the third is a first-order-recurring load that will be replicated
+; instead of widened. Although the cast and the first instruction will both be
+; widened, and are originally adjacent to each other, make sure the replicated
+; load ends up appearing between them.
+;
+; void PR34711(short[2] *a, int *b, int *c, int n) {
+;   for(int i = 0; i < n; i++) {
+;     c[i] = 7;
+;     b[i] = (a[i][0] * a[i][1]);
+;   }
+; }
+;
+; SINK-AFTER-LABEL: @PR34711
+; Check that the sext sank after the load in the vector loop.
+; SINK-AFTER: vector.body
+; SINK-AFTER:   %vector.recur = phi <4 x i16> [ %vector.recur.init, %vector.ph ], [ {{.*}}, %vector.body ]
+; SINK-AFTER:   %[[VSHUF:.+]] = shufflevector <4 x i16> %vector.recur, <4 x i16> %{{.*}}, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; SINK-AFTER:   %[[VCONV:.+]] = sext <4 x i16> %[[VSHUF]] to <4 x i32>
+; SINK-AFTER:   %[[VCONV3:.+]] = sext <4 x i16> {{.*}} to <4 x i32>
+; SINK-AFTER:   mul nsw <4 x i32> %[[VCONV3]], %[[VCONV]]
+;
+define void @PR34711([2 x i16]* %a, i32* %b, i32* %c, i64 %n) {
+entry:
+  %pre.index = getelementptr inbounds [2 x i16], [2 x i16]* %a, i64 0, i64 0
+  %.pre = load i16, i16* %pre.index
+  br label %for.body
+
+for.body:
+  %0 = phi i16 [ %.pre, %entry ], [ %1, %for.body ]
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arraycidx = getelementptr inbounds i32, i32* %c, i64 %indvars.iv
+  %cur.index = getelementptr inbounds [2 x i16], [2 x i16]* %a, i64 %indvars.iv, i64 1
+  store i32 7, i32* %arraycidx   ; 1st instruction, to be widened.
+  %conv = sext i16 %0 to i32     ; 2nd, cast to sink after third.
+  %1 = load i16, i16* %cur.index ; 3rd, first-order-recurring load not widened.
+  %conv3 = sext i16 %1 to i32
+  %mul = mul nsw i32 %conv3, %conv
+  %arrayidx5 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv
+  store i32 %mul, i32* %arrayidx5
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+; void no_sink_after(short *a, int n, int *b) {
+;   for(int i = 0; i < n; i++)
+;     b[i] = ((a[i] + 2) * a[i + 1]);
+; }
+;
+; NO-SINK-AFTER-LABEL: no_sink_after
+; NO-SINK-AFTER-NOT:   vector.ph:
+; NO-SINK-AFTER:       }
+;
+define void @no_sink_after(i16* %a, i32* %b, i64 %n) {
+entry:
+  %.pre = load i16, i16* %a
+  br label %for.body
+
+for.body:
+  %0 = phi i16 [ %.pre, %entry ], [ %1, %for.body ]
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %conv = sext i16 %0 to i32
+  %add = add nsw i32 %conv, 2
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %arrayidx2 = getelementptr inbounds i16, i16* %a, i64 %indvars.iv.next
+  %1 = load i16, i16* %arrayidx2
+  %conv3 = sext i16 %1 to i32
+  %mul = mul nsw i32 %add, %conv3
+  %arrayidx5 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv
+  store i32 %mul, i32* %arrayidx5
+  %exitcond = icmp eq i64 %indvars.iv.next, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}

Added: llvm/trunk/test/Transforms/LoopVectorize/flags.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/flags.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/flags.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/flags.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,78 @@
+; RUN: opt < %s  -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
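+; flags1/flags2 check that the wrapping flags on the scalar multiply (nsw in
+; flags1, absent in flags2) are carried over to the widened vector multiply.
+; A rough C sketch of flags1 (illustrative; not the compiled source):
+;
+; int flags1(int n, int *A) {
+;   for (int i = 9; i < n; i++)
+;     A[i] = A[i] * 3;   // signed overflow is UB, hence 'mul nsw'
+;   return 0;            // the IR returns undef
+; }
+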
+;CHECK-LABEL: @flags1(
+;CHECK: load <4 x i32>
+;CHECK: mul nsw <4 x i32>
+;CHECK: store <4 x i32>
+;CHECK: ret i32
+define i32 @flags1(i32 %n, i32* nocapture %A) nounwind uwtable ssp {
+  %1 = icmp sgt i32 %n, 9
+  br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph:                                           ; preds = %0, %.lr.ph
+  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 9, %0 ]
+  %2 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+  %3 = load i32, i32* %2, align 4
+  %4 = mul nsw i32 %3, 3
+  store i32 %4, i32* %2, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge:                                      ; preds = %.lr.ph, %0
+  ret i32 undef
+}
+
+
+;CHECK-LABEL: @flags2(
+;CHECK: load <4 x i32>
+;CHECK: mul <4 x i32>
+;CHECK: store <4 x i32>
+;CHECK: ret i32
+define i32 @flags2(i32 %n, i32* nocapture %A) nounwind uwtable ssp {
+  %1 = icmp sgt i32 %n, 9
+  br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph:                                           ; preds = %0, %.lr.ph
+  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 9, %0 ]
+  %2 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+  %3 = load i32, i32* %2, align 4
+  %4 = mul i32 %3, 3
+  store i32 %4, i32* %2, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge:                                      ; preds = %.lr.ph, %0
+  ret i32 undef
+}
+
+; Make sure we copy fast math flags and use them for the final reduction.
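+; A rough C sketch (illustrative; compiled with fast-math):
+;
+; float fast_math(float *s) {
+;   float q = 0.0f;
+;   for (int i = 0; i < 256; i++)
+;     q += s[i];   // 'fadd fast' allows reassociating into vector partial
+;                  // sums plus a final horizontal reduction
+;   return q;
+; }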
+; CHECK-LABEL: fast_math
+; CHECK: load <4 x float>
+; CHECK: fadd fast <4 x float>
+; CHECK: br
+; CHECK: fadd fast <4 x float>
+; CHECK: fadd fast <4 x float>
+define float @fast_math(float* noalias %s) {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %q.04 = phi float [ 0.000000e+00, %entry ], [ %add, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %s, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %add = fadd fast float %q.04, %0
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 256
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  %add.lcssa = phi float [ %add, %for.body ]
+  ret float %add.lcssa
+}

Added: llvm/trunk/test/Transforms/LoopVectorize/float-induction.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/float-induction.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/float-induction.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/float-induction.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,340 @@
+; RUN: opt < %s  -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine -S | FileCheck --check-prefix VEC4_INTERL1 %s
+; RUN: opt < %s  -loop-vectorize -force-vector-interleave=2 -force-vector-width=4 -dce -instcombine -S | FileCheck --check-prefix VEC4_INTERL2 %s
+; RUN: opt < %s  -loop-vectorize -force-vector-interleave=2 -force-vector-width=1 -dce -instcombine -S | FileCheck --check-prefix VEC1_INTERL2 %s
+; RUN: opt < %s  -loop-vectorize -force-vector-interleave=1 -force-vector-width=2 -dce -simplifycfg -instcombine -simplifycfg -keep-loops=false -S | FileCheck --check-prefix VEC2_INTERL1_PRED_STORE %s
+
+ at fp_inc = common global float 0.000000e+00, align 4
+
+;void fp_iv_loop1(float init, float * __restrict__ A, int N) {
+;  float x = init;
+;  for (int i=0; i < N; ++i) {
+;    A[i] = x;
+;    x -= fp_inc;
+;  }
+;}
+
+; VEC4_INTERL1-LABEL: @fp_iv_loop1(
+; VEC4_INTERL1:       vector.ph:
+; VEC4_INTERL1:         [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> undef, float %init, i32 0
+; VEC4_INTERL1-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> undef, <4 x i32> zeroinitializer
+; VEC4_INTERL1-NEXT:    [[DOTSPLATINSERT2:%.*]] = insertelement <4 x float> undef, float %fpinc, i32 0
+; VEC4_INTERL1-NEXT:    [[DOTSPLAT3:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT2]], <4 x float> undef, <4 x i32> zeroinitializer
+; VEC4_INTERL1-NEXT:    [[TMP5:%.*]] = fmul fast <4 x float> [[DOTSPLAT3]], <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>
+; VEC4_INTERL1-NEXT:    [[INDUCTION4:%.*]] = fsub fast <4 x float> [[DOTSPLAT]], [[TMP5]]
+; VEC4_INTERL1-NEXT:    [[TMP6:%.*]] = fmul fast float %fpinc, 4.000000e+00
+; VEC4_INTERL1-NEXT:    [[DOTSPLATINSERT5:%.*]] = insertelement <4 x float> undef, float [[TMP6]], i32 0
+; VEC4_INTERL1-NEXT:    [[DOTSPLAT6:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT5]], <4 x float> undef, <4 x i32> zeroinitializer
+; VEC4_INTERL1-NEXT:    br label %vector.body
+; VEC4_INTERL1:       vector.body:
+; VEC4_INTERL1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
+; VEC4_INTERL1-NEXT:    [[VEC_IND:%.*]] = phi <4 x float> [ [[INDUCTION4]], %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %vector.body ]
+; VEC4_INTERL1-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, float* %A, i64 [[INDEX]]
+; VEC4_INTERL1-NEXT:    [[TMP9:%.*]] = bitcast float* [[TMP8]] to <4 x float>*
+; VEC4_INTERL1-NEXT:    store <4 x float> [[VEC_IND]], <4 x float>* [[TMP9]], align 4
+; VEC4_INTERL1-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
+; VEC4_INTERL1-NEXT:    [[VEC_IND_NEXT]] = fsub fast <4 x float> [[VEC_IND]], [[DOTSPLAT6]]
+; VEC4_INTERL1:         br i1 {{.*}}, label %middle.block, label %vector.body
+
+; VEC4_INTERL2-LABEL: @fp_iv_loop1(
+; VEC4_INTERL2:       vector.ph:
+; VEC4_INTERL2:         [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> undef, float %init, i32 0
+; VEC4_INTERL2-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> undef, <4 x i32> zeroinitializer
+; VEC4_INTERL2-NEXT:    [[DOTSPLATINSERT3:%.*]] = insertelement <4 x float> undef, float %fpinc, i32 0
+; VEC4_INTERL2-NEXT:    [[DOTSPLAT4:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT3]], <4 x float> undef, <4 x i32> zeroinitializer
+; VEC4_INTERL2-NEXT:    [[TMP5:%.*]] = fmul fast <4 x float> [[DOTSPLAT4]], <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>
+; VEC4_INTERL2-NEXT:    [[INDUCTION5:%.*]] = fsub fast <4 x float> [[DOTSPLAT]], [[TMP5]]
+; VEC4_INTERL2-NEXT:    [[TMP6:%.*]] = fmul fast float %fpinc, 4.000000e+00
+; VEC4_INTERL2-NEXT:    [[DOTSPLATINSERT6:%.*]] = insertelement <4 x float> undef, float [[TMP6]], i32 0
+; VEC4_INTERL2-NEXT:    [[DOTSPLAT7:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT6]], <4 x float> undef, <4 x i32> zeroinitializer
+; VEC4_INTERL2-NEXT:    br label %vector.body
+; VEC4_INTERL2:       vector.body:
+; VEC4_INTERL2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
+; VEC4_INTERL2-NEXT:    [[VEC_IND:%.*]] = phi <4 x float> [ [[INDUCTION5]], %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %vector.body ]
+; VEC4_INTERL2-NEXT:    [[STEP_ADD:%.*]] = fsub fast <4 x float> [[VEC_IND]], [[DOTSPLAT7]]
+; VEC4_INTERL2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds float, float* %A, i64 [[INDEX]]
+; VEC4_INTERL2-NEXT:    [[TMP10:%.*]] = bitcast float* [[TMP9]] to <4 x float>*
+; VEC4_INTERL2-NEXT:    store <4 x float> [[VEC_IND]], <4 x float>* [[TMP10]], align 4
+; VEC4_INTERL2-NEXT:    [[TMP11:%.*]] = getelementptr inbounds float, float* [[TMP9]], i64 4
+; VEC4_INTERL2-NEXT:    [[TMP12:%.*]] = bitcast float* [[TMP11]] to <4 x float>*
+; VEC4_INTERL2-NEXT:    store <4 x float> [[STEP_ADD]], <4 x float>* [[TMP12]], align 4
+; VEC4_INTERL2-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 8
+; VEC4_INTERL2-NEXT:    [[VEC_IND_NEXT]] = fsub fast <4 x float> [[STEP_ADD]], [[DOTSPLAT7]]
+; VEC4_INTERL2:         br i1 {{.*}}, label %middle.block, label %vector.body
+
+; VEC1_INTERL2-LABEL: @fp_iv_loop1(
+; VEC1_INTERL2:       vector.ph:
+; VEC1_INTERL2:         br label %vector.body
+; VEC1_INTERL2:       vector.body:
+; VEC1_INTERL2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
+; VEC1_INTERL2-NEXT:    [[INDUCTION2:%.*]] = or i64 [[INDEX]], 1
+; VEC1_INTERL2-NEXT:    [[TMP6:%.*]] = sitofp i64 [[INDEX]] to float
+; VEC1_INTERL2-NEXT:    [[TMP7:%.*]] = fmul fast float %fpinc, [[TMP6]]
+; VEC1_INTERL2-NEXT:    [[FP_OFFSET_IDX:%.*]] = fsub fast float %init, [[TMP7]]
+; VEC1_INTERL2-NEXT:    [[TMP8:%.*]] = fsub fast float [[FP_OFFSET_IDX]], %fpinc
+; VEC1_INTERL2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds float, float* %A, i64 [[INDEX]]
+; VEC1_INTERL2-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, float* %A, i64 [[INDUCTION2]]
+; VEC1_INTERL2-NEXT:    store float [[FP_OFFSET_IDX]], float* [[TMP9]], align 4
+; VEC1_INTERL2-NEXT:    store float [[TMP8]], float* [[TMP10]], align 4
+; VEC1_INTERL2-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 2
+; VEC1_INTERL2:         br i1 {{.*}}, label %middle.block, label %vector.body
+
+define void @fp_iv_loop1(float %init, float* noalias nocapture %A, i32 %N) #1 {
+entry:
+  %cmp4 = icmp sgt i32 %N, 0
+  br i1 %cmp4, label %for.body.lr.ph, label %for.end
+
+for.body.lr.ph:                                   ; preds = %entry
+  %fpinc = load float, float* @fp_inc, align 4
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %for.body.lr.ph
+  %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ]
+  %x.05 = phi float [ %init, %for.body.lr.ph ], [ %add, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %A, i64 %indvars.iv
+  store float %x.05, float* %arrayidx, align 4
+  %add = fsub fast float %x.05, %fpinc
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %N
+  br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:                                 ; preds = %for.body
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %entry
+  ret void
+}
+
+;void fp_iv_loop2(float init, float * __restrict__ A, int N) {
+;  float x = init;
+;  for (int i=0; i < N; ++i) {
+;    A[i] = x;
+;    x += 0.5;
+;  }
+;}
+
+; VEC4_INTERL1-LABEL: @fp_iv_loop2(
+; VEC4_INTERL1:       vector.ph:
+; VEC4_INTERL1:         [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> undef, float %init, i32 0
+; VEC4_INTERL1-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> undef, <4 x i32> zeroinitializer
+; VEC4_INTERL1-NEXT:    [[INDUCTION2:%.*]] = fadd fast <4 x float> [[DOTSPLAT]], <float 0.000000e+00, float 5.000000e-01, float 1.000000e+00, float 1.500000e+00>
+; VEC4_INTERL1-NEXT:    br label %vector.body
+; VEC4_INTERL1:       vector.body:
+; VEC4_INTERL1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
+; VEC4_INTERL1-NEXT:    [[VEC_IND:%.*]] = phi <4 x float> [ [[INDUCTION2]], %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %vector.body ]
+; VEC4_INTERL1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, float* %A, i64 [[INDEX]]
+; VEC4_INTERL1-NEXT:    [[TMP8:%.*]] = bitcast float* [[TMP7]] to <4 x float>*
+; VEC4_INTERL1-NEXT:    store <4 x float> [[VEC_IND]], <4 x float>* [[TMP8]], align 4
+; VEC4_INTERL1-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
+; VEC4_INTERL1-NEXT:    [[VEC_IND_NEXT]] = fadd fast <4 x float> [[VEC_IND]], <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
+; VEC4_INTERL1:         br i1 {{.*}}, label %middle.block, label %vector.body
+
+define void @fp_iv_loop2(float %init, float* noalias nocapture %A, i32 %N) #0 {
+entry:
+  %cmp4 = icmp sgt i32 %N, 0
+  br i1 %cmp4, label %for.body.preheader, label %for.end
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
+  %x.06 = phi float [ %conv1, %for.body ], [ %init, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds float, float* %A, i64 %indvars.iv
+  store float %x.06, float* %arrayidx, align 4
+  %conv1 = fadd fast float %x.06, 5.000000e-01
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %N
+  br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:                                 ; preds = %for.body
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %entry
+  ret void
+}
+
+;void fp_iv_loop3(float init, float * __restrict__ A, float * __restrict__ B, float * __restrict__ C, int N) {
+;  int i = 0;
+;  float x = init;
+;  float y = 0.1;
+;  for (; i < N; ++i) {
+;    A[i] = x;
+;    x += fp_inc;
+;    y -= 0.5;
+;    B[i] = x + y;
+;    C[i] = y;
+;  }
+;}
+
+; VEC4_INTERL1-LABEL: @fp_iv_loop3(
+; VEC4_INTERL1:       for.body.lr.ph:
+; VEC4_INTERL1:         [[TMP0:%.*]] = load float, float* @fp_inc, align 4
+; VEC4_INTERL1:       vector.ph:
+; VEC4_INTERL1:         [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> undef, float %init, i32 0
+; VEC4_INTERL1-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> undef, <4 x i32> zeroinitializer
+; VEC4_INTERL1-NEXT:    [[DOTSPLATINSERT5:%.*]] = insertelement <4 x float> undef, float [[TMP0]], i32 0
+; VEC4_INTERL1-NEXT:    [[DOTSPLAT6:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT5]], <4 x float> undef, <4 x i32> zeroinitializer
+; VEC4_INTERL1-NEXT:    [[TMP7:%.*]] = fmul fast <4 x float> [[DOTSPLAT6]], <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>
+; VEC4_INTERL1-NEXT:    [[INDUCTION7:%.*]] = fadd fast <4 x float> [[DOTSPLAT]], [[TMP7]]
+; VEC4_INTERL1-NEXT:    [[TMP8:%.*]] = fmul fast float [[TMP0]], 4.000000e+00
+; VEC4_INTERL1-NEXT:    [[DOTSPLATINSERT8:%.*]] = insertelement <4 x float> undef, float [[TMP8]], i32 0
+; VEC4_INTERL1-NEXT:    [[DOTSPLAT9:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT8]], <4 x float> undef, <4 x i32> zeroinitializer
+; VEC4_INTERL1-NEXT:    [[BROADCAST_SPLATINSERT12:%.*]] = insertelement <4 x float> undef, float [[TMP0]], i32 0
+; VEC4_INTERL1-NEXT:    [[BROADCAST_SPLAT13:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT12]], <4 x float> undef, <4 x i32> zeroinitializer
+; VEC4_INTERL1-NEXT:    br label [[VECTOR_BODY:%.*]]
+; VEC4_INTERL1:       vector.body:
+; VEC4_INTERL1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
+; VEC4_INTERL1-NEXT:    [[VEC_IND:%.*]] = phi <4 x float> [ <float 0x3FB99999A0000000, float 0xBFD99999A0000000, float 0xBFECCCCCC0000000, float 0xBFF6666660000000>, %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %vector.body ]
+; VEC4_INTERL1-NEXT:    [[VEC_IND10:%.*]] = phi <4 x float> [ [[INDUCTION7]], %vector.ph ], [ [[VEC_IND_NEXT11:%.*]], %vector.body ]
+; VEC4_INTERL1-NEXT:    [[TMP12:%.*]] = getelementptr inbounds float, float* %A, i64 [[INDEX]]
+; VEC4_INTERL1-NEXT:    [[TMP13:%.*]] = bitcast float* [[TMP12]] to <4 x float>*
+; VEC4_INTERL1-NEXT:    store <4 x float> [[VEC_IND10]], <4 x float>* [[TMP13]], align 4
+; VEC4_INTERL1-NEXT:    [[TMP14:%.*]] = fadd fast <4 x float> [[VEC_IND10]], [[BROADCAST_SPLAT13]]
+; VEC4_INTERL1-NEXT:    [[TMP15:%.*]] = fadd fast <4 x float> [[VEC_IND]], <float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01>
+; VEC4_INTERL1-NEXT:    [[TMP16:%.*]] = fadd fast <4 x float> [[TMP15]], [[TMP14]]
+; VEC4_INTERL1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds float, float* %B, i64 [[INDEX]]
+; VEC4_INTERL1-NEXT:    [[TMP18:%.*]] = bitcast float* [[TMP17]] to <4 x float>*
+; VEC4_INTERL1-NEXT:    store <4 x float> [[TMP16]], <4 x float>* [[TMP18]], align 4
+; VEC4_INTERL1-NEXT:    [[TMP19:%.*]] = getelementptr inbounds float, float* %C, i64 [[INDEX]]
+; VEC4_INTERL1-NEXT:    [[TMP20:%.*]] = bitcast float* [[TMP19]] to <4 x float>*
+; VEC4_INTERL1-NEXT:    store <4 x float> [[TMP15]], <4 x float>* [[TMP20]], align 4
+; VEC4_INTERL1-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
+; VEC4_INTERL1-NEXT:    [[VEC_IND_NEXT]] = fadd fast <4 x float> [[VEC_IND]], <float -2.000000e+00, float -2.000000e+00, float -2.000000e+00, float -2.000000e+00>
+; VEC4_INTERL1-NEXT:    [[VEC_IND_NEXT11]] = fadd fast <4 x float> [[VEC_IND10]], [[DOTSPLAT9]]
+; VEC4_INTERL1:         br i1 {{.*}}, label %middle.block, label %vector.body
+
+define void @fp_iv_loop3(float %init, float* noalias nocapture %A, float* noalias nocapture %B, float* noalias nocapture %C, i32 %N) #1 {
+entry:
+  %cmp9 = icmp sgt i32 %N, 0
+  br i1 %cmp9, label %for.body.lr.ph, label %for.end
+
+for.body.lr.ph:                                   ; preds = %entry
+  %0 = load float, float* @fp_inc, align 4
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %for.body.lr.ph
+  %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ]
+  %y.012 = phi float [ 0x3FB99999A0000000, %for.body.lr.ph ], [ %conv1, %for.body ]
+  %x.011 = phi float [ %init, %for.body.lr.ph ], [ %add, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %A, i64 %indvars.iv
+  store float %x.011, float* %arrayidx, align 4
+  %add = fadd fast float %x.011, %0
+  %conv1 = fadd fast float %y.012, -5.000000e-01
+  %add2 = fadd fast float %conv1, %add
+  %arrayidx4 = getelementptr inbounds float, float* %B, i64 %indvars.iv
+  store float %add2, float* %arrayidx4, align 4
+  %arrayidx6 = getelementptr inbounds float, float* %C, i64 %indvars.iv
+  store float %conv1, float* %arrayidx6, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %N
+  br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:                                 
+  br label %for.end
+
+for.end: 
+  ret void
+}
+
+; Start and step values are constants, so there is no 'fmul' operation in this case.
+;void fp_iv_loop4(float * __restrict__ A, int N) {
+;  float x = 1.0;
+;  for (int i=0; i < N; ++i) {
+;    A[i] = x;
+;    x += 0.5;
+;  }
+;}
+
+; VEC4_INTERL1-LABEL: @fp_iv_loop4(
+; VEC4_INTERL1:       vector.ph:
+; VEC4_INTERL1:         br label %vector.body
+; VEC4_INTERL1:       vector.body:
+; VEC4_INTERL1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
+; VEC4_INTERL1-NEXT:    [[VEC_IND:%.*]] = phi <4 x float> [ <float 1.000000e+00, float 1.500000e+00, float 2.000000e+00, float 2.500000e+00>, %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %vector.body ]
+; VEC4_INTERL1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, float* %A, i64 [[INDEX]]
+; VEC4_INTERL1-NEXT:    [[TMP8:%.*]] = bitcast float* [[TMP7]] to <4 x float>*
+; VEC4_INTERL1-NEXT:    store <4 x float> [[VEC_IND]], <4 x float>* [[TMP8]], align 4
+; VEC4_INTERL1-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
+; VEC4_INTERL1-NEXT:    [[VEC_IND_NEXT]] = fadd fast <4 x float> [[VEC_IND]], <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
+; VEC4_INTERL1:         br i1 {{.*}}, label %middle.block, label %vector.body
+
+define void @fp_iv_loop4(float* noalias nocapture %A, i32 %N) {
+entry:
+  %cmp4 = icmp sgt i32 %N, 0
+  br i1 %cmp4, label %for.body.preheader, label %for.end
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
+  %x.06 = phi float [ %conv1, %for.body ], [ 1.000000e+00, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds float, float* %A, i64 %indvars.iv
+  store float %x.06, float* %arrayidx, align 4
+  %conv1 = fadd fast float %x.06, 5.000000e-01
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %N
+  br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:                                 ; preds = %for.body
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %entry
+  ret void
+}
+
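+; The float value j is a secondary induction that mirrors the primary i and is
+; only used inside a predicated store. A rough C sketch (illustrative):
+;
+; void non_primary_iv_float_scalar(float *A, long N) {
+;   float j = 0.0f;
+;   for (long i = 0; i < N; i++, j += 1.0f)
+;     if (A[i] == 0.0f)
+;       A[i] = j;
+; }
+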
+; VEC2_INTERL1_PRED_STORE-LABEL: @non_primary_iv_float_scalar(
+; VEC2_INTERL1_PRED_STORE:       vector.body:
+; VEC2_INTERL1_PRED_STORE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE7:.*]] ]
+; VEC2_INTERL1_PRED_STORE-NEXT:    [[TMP1:%.*]] = sitofp i64 [[INDEX]] to float
+; VEC2_INTERL1_PRED_STORE-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, float* %A, i64 [[INDEX]]
+; VEC2_INTERL1_PRED_STORE-NEXT:    [[TMP3:%.*]] = bitcast float* [[TMP2]] to <2 x float>*
+; VEC2_INTERL1_PRED_STORE-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x float>, <2 x float>* [[TMP3]], align 4
+; VEC2_INTERL1_PRED_STORE-NEXT:    [[TMP4:%.*]] = fcmp fast oeq <2 x float> [[WIDE_LOAD]], zeroinitializer
+; VEC2_INTERL1_PRED_STORE-NEXT:    [[TMP5:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0
+; VEC2_INTERL1_PRED_STORE-NEXT:    br i1 [[TMP5]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
+; VEC2_INTERL1_PRED_STORE:       [[PRED_STORE_IF]]:
+; VEC2_INTERL1_PRED_STORE-NEXT:    store float [[TMP1]], float* [[TMP2]], align 4
+; VEC2_INTERL1_PRED_STORE-NEXT:    br label %[[PRED_STORE_CONTINUE]]
+; VEC2_INTERL1_PRED_STORE:       [[PRED_STORE_CONTINUE]]:
+; VEC2_INTERL1_PRED_STORE-NEXT:    [[TMP8:%.*]] = extractelement <2 x i1> [[TMP4]], i32 1
+; VEC2_INTERL1_PRED_STORE-NEXT:    br i1 [[TMP8]], label %[[PRED_STORE_IF6:.*]], label %[[PRED_STORE_CONTINUE7]]
+; VEC2_INTERL1_PRED_STORE:       [[PRED_STORE_IF6]]:
+; VEC2_INTERL1_PRED_STORE-NEXT:    [[TMP9:%.*]] = fadd fast float [[TMP1]], 1.000000e+00
+; VEC2_INTERL1_PRED_STORE-NEXT:    [[TMP10:%.*]] = or i64 [[INDEX]], 1
+; VEC2_INTERL1_PRED_STORE-NEXT:    [[TMP11:%.*]] = getelementptr inbounds float, float* %A, i64 [[TMP10]]
+; VEC2_INTERL1_PRED_STORE-NEXT:    store float [[TMP9]], float* [[TMP11]], align 4
+; VEC2_INTERL1_PRED_STORE-NEXT:    br label %[[PRED_STORE_CONTINUE7]]
+; VEC2_INTERL1_PRED_STORE:       [[PRED_STORE_CONTINUE7]]:
+; VEC2_INTERL1_PRED_STORE-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 2
+; VEC2_INTERL1_PRED_STORE:         br i1 {{.*}}, label %middle.block, label %vector.body
+
+define void @non_primary_iv_float_scalar(float* %A, i64 %N) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ %i.next, %for.inc ], [ 0, %entry ]
+  %j = phi float [ %j.next, %for.inc ], [ 0.0, %entry ]
+  %tmp0 = getelementptr inbounds float, float* %A, i64 %i
+  %tmp1 = load float, float* %tmp0, align 4
+  %tmp2 = fcmp fast oeq float %tmp1, 0.0
+  br i1 %tmp2, label %if.pred, label %for.inc
+
+if.pred:
+  store float %j, float* %tmp0, align 4
+  br label %for.inc
+
+for.inc:
+  %i.next = add nuw nsw i64 %i, 1
+  %j.next = fadd fast float %j, 1.0
+  %cond = icmp slt i64 %i.next, %N
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}

Added: llvm/trunk/test/Transforms/LoopVectorize/float-reduction.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/float-reduction.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/float-reduction.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/float-reduction.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,46 @@
+; RUN: opt < %s  -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+;CHECK-LABEL: @foo(
+;CHECK: fadd fast <4 x float>
+;CHECK: ret
+define float @foo(float* nocapture %A, i32* nocapture %n) nounwind uwtable readonly ssp {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %sum.04 = phi float [ 0.000000e+00, %entry ], [ %add, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %A, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %add = fadd fast float %sum.04, %0
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 200
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret float %add
+}
+
+;CHECK-LABEL: @foosub(
+;CHECK: fsub fast <4 x float>
+;CHECK: ret
+define float @foosub(float* nocapture %A, i32* nocapture %n) nounwind uwtable readonly ssp {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %sum.04 = phi float [ 0.000000e+00, %entry ], [ %sub, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %A, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %sub = fsub fast float %sum.04, %0
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 200
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret float %sub
+}

Added: llvm/trunk/test/Transforms/LoopVectorize/followup.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/followup.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/followup.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/followup.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,43 @@
+; RUN: opt -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -S < %s | FileCheck %s
+;
+; Check that the followup loop attributes are applied.
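+; followup_vectorized is expected to end up on the vectorized loop,
+; followup_epilogue on the scalar remainder loop, and followup_all on both.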
+;
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+define void @followup(i32* nocapture %a, i32 %n) {
+entry:
+  %cmp4 = icmp sgt i32 %n, 0
+  br i1 %cmp4, label %for.body, label %for.end
+
+for.body:
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+  %0 = trunc i64 %indvars.iv to i32
+  store i32 %0, i32* %arrayidx, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+  ret void
+}
+
+!0 = distinct !{!0, !3, !4, !5}
+!3 = !{!"llvm.loop.vectorize.followup_vectorized", !{!"FollowupVectorized"}}
+!4 = !{!"llvm.loop.vectorize.followup_epilogue", !{!"FollowupEpilogue"}}
+!5 = !{!"llvm.loop.vectorize.followup_all", !{!"FollowupAll"}}
+
+
+; CHECK-LABEL: @followup(
+
+; CHECK-LABEL: vector.body:
+; CHECK: br i1 %13, label %middle.block, label %vector.body, !llvm.loop ![[LOOP_VECTOR:[0-9]+]]
+; CHECK-LABEL: for.body:
+; CHECK: br i1 %exitcond, label %for.end.loopexit, label %for.body, !llvm.loop ![[LOOP_EPILOGUE:[0-9]+]]
+
+; CHECK: ![[LOOP_VECTOR]] = distinct !{![[LOOP_VECTOR]], ![[FOLLOWUP_ALL:[0-9]+]], ![[FOLLOWUP_VECTORIZED:[0-9]+]]}
+; CHECK: ![[FOLLOWUP_ALL]] = !{!"FollowupAll"}
+; CHECK: ![[FOLLOWUP_VECTORIZED:[0-9]+]] = !{!"FollowupVectorized"}
+; CHECK: ![[LOOP_EPILOGUE]] = distinct !{![[LOOP_EPILOGUE]], ![[FOLLOWUP_ALL]], ![[FOLLOWUP_EPILOGUE:[0-9]+]]}
+; CHECK: ![[FOLLOWUP_EPILOGUE]] = !{!"FollowupEpilogue"}

Added: llvm/trunk/test/Transforms/LoopVectorize/funcall.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/funcall.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/funcall.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/funcall.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,32 @@
+; RUN: opt -S -loop-vectorize -force-vector-width=2 -force-vector-interleave=1 < %s | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+; Make sure we can vectorize loops with calls to math library functions.
+; Such calls might read the rounding mode, but we only vectorize loops that
+; contain a limited set of function calls, none of which sets the rounding
+; mode, so vectorizing them is safe.
+
+; CHECK-LABEL: @test(
+; CHECK: <2 x double>
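+; Simplified source (reconstructed from the IR; names are illustrative):
+;   void test(double *d, double t) {
+;     for (int i = 0; i < 128; ++i)
+;       d[i] = pow(d[i], t);
+;   }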
+
+define void @test(double* %d, double %t) {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds double, double* %d, i64 %indvars.iv
+  %0 = load double, double* %arrayidx, align 8
+  %1 = tail call double @llvm.pow.f64(double %0, double %t)
+  store double %1, double* %arrayidx, align 8
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp ne i32 %lftr.wideiv, 128
+  br i1 %exitcond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
+
+declare double @llvm.pow.f64(double, double)

Added: llvm/trunk/test/Transforms/LoopVectorize/gcc-examples.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/gcc-examples.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/gcc-examples.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/gcc-examples.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,685 @@
+; RUN: opt < %s  -basicaa -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -dce -instcombine -S | FileCheck %s
+; RUN: opt < %s  -basicaa -loop-vectorize -force-vector-width=4 -force-vector-interleave=4 -dce -instcombine -S | FileCheck %s -check-prefix=UNROLL
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+ at b = common global [2048 x i32] zeroinitializer, align 16
+ at c = common global [2048 x i32] zeroinitializer, align 16
+ at a = common global [2048 x i32] zeroinitializer, align 16
+ at G = common global [32 x [1024 x i32]] zeroinitializer, align 16
+ at ub = common global [1024 x i32] zeroinitializer, align 16
+ at uc = common global [1024 x i32] zeroinitializer, align 16
+ at d = common global [2048 x i32] zeroinitializer, align 16
+ at fa = common global [1024 x float] zeroinitializer, align 16
+ at fb = common global [1024 x float] zeroinitializer, align 16
+ at ic = common global [1024 x i32] zeroinitializer, align 16
+ at da = common global [1024 x float] zeroinitializer, align 16
+ at db = common global [1024 x float] zeroinitializer, align 16
+ at dc = common global [1024 x float] zeroinitializer, align 16
+ at dd = common global [1024 x float] zeroinitializer, align 16
+ at dj = common global [1024 x i32] zeroinitializer, align 16
+
+;CHECK-LABEL: @example1(
+;CHECK: load <4 x i32>
+;CHECK: add nsw <4 x i32>
+;CHECK: store <4 x i32>
+;CHECK: ret void
+;UNROLL-LABEL: @example1(
+;UNROLL: load <4 x i32>
+;UNROLL: load <4 x i32>
+;UNROLL: load <4 x i32>
+;UNROLL: load <4 x i32>
+;UNROLL: add nsw <4 x i32>
+;UNROLL: add nsw <4 x i32>
+;UNROLL: add nsw <4 x i32>
+;UNROLL: add nsw <4 x i32>
+;UNROLL: store <4 x i32>
+;UNROLL: store <4 x i32>
+;UNROLL: store <4 x i32>
+;UNROLL: store <4 x i32>
+;UNROLL: ret void
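+; Simplified source (reconstructed from the IR):
+;   for (int i = 0; i < 256; ++i)
+;     a[i] = b[i] + c[i];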
+define void @example1() nounwind uwtable ssp {
+  br label %1
+
+; <label>:1                                       ; preds = %1, %0
+  %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ]
+  %2 = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 %indvars.iv
+  %3 = load i32, i32* %2, align 4
+  %4 = getelementptr inbounds [2048 x i32], [2048 x i32]* @c, i64 0, i64 %indvars.iv
+  %5 = load i32, i32* %4, align 4
+  %6 = add nsw i32 %5, %3
+  %7 = getelementptr inbounds [2048 x i32], [2048 x i32]* @a, i64 0, i64 %indvars.iv
+  store i32 %6, i32* %7, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 256
+  br i1 %exitcond, label %8, label %1
+
+; <label>:8                                       ; preds = %1
+  ret void
+}
+
+;CHECK-LABEL: @example2(
+;CHECK: store <4 x i32>
+;CHECK: ret void
+;UNROLL-LABEL: @example2(
+;UNROLL: store <4 x i32>
+;UNROLL: store <4 x i32>
+;UNROLL: store <4 x i32>
+;UNROLL: store <4 x i32>
+;UNROLL: ret void
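+; Simplified source (reconstructed from the IR; two separate loops):
+;   for (i = 0; i < n; i++)
+;     b[i] = x;
+;   while (n--) { a[i] = b[i] & c[i]; i++; }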
+define void @example2(i32 %n, i32 %x) nounwind uwtable ssp {
+  %1 = icmp sgt i32 %n, 0
+  br i1 %1, label %.lr.ph5, label %.preheader
+
+..preheader_crit_edge:                            ; preds = %.lr.ph5
+  %phitmp = sext i32 %n to i64
+  br label %.preheader
+
+.preheader:                                       ; preds = %..preheader_crit_edge, %0
+  %i.0.lcssa = phi i64 [ %phitmp, %..preheader_crit_edge ], [ 0, %0 ]
+  %2 = icmp eq i32 %n, 0
+  br i1 %2, label %._crit_edge, label %.lr.ph
+
+.lr.ph5:                                          ; preds = %0, %.lr.ph5
+  %indvars.iv6 = phi i64 [ %indvars.iv.next7, %.lr.ph5 ], [ 0, %0 ]
+  %3 = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 %indvars.iv6
+  store i32 %x, i32* %3, align 4
+  %indvars.iv.next7 = add i64 %indvars.iv6, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next7 to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %..preheader_crit_edge, label %.lr.ph5
+
+.lr.ph:                                           ; preds = %.preheader, %.lr.ph
+  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ %i.0.lcssa, %.preheader ]
+  %.02 = phi i32 [ %4, %.lr.ph ], [ %n, %.preheader ]
+  %4 = add nsw i32 %.02, -1
+  %5 = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 %indvars.iv
+  %6 = load i32, i32* %5, align 4
+  %7 = getelementptr inbounds [2048 x i32], [2048 x i32]* @c, i64 0, i64 %indvars.iv
+  %8 = load i32, i32* %7, align 4
+  %9 = and i32 %8, %6
+  %10 = getelementptr inbounds [2048 x i32], [2048 x i32]* @a, i64 0, i64 %indvars.iv
+  store i32 %9, i32* %10, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %11 = icmp eq i32 %4, 0
+  br i1 %11, label %._crit_edge, label %.lr.ph
+
+._crit_edge:                                      ; preds = %.lr.ph, %.preheader
+  ret void
+}
+
+;CHECK-LABEL: @example3(
+;CHECK: <4 x i32>
+;CHECK: ret void
+;UNROLL-LABEL: @example3(
+;UNROLL: <4 x i32>
+;UNROLL: <4 x i32>
+;UNROLL: <4 x i32>
+;UNROLL: <4 x i32>
+;UNROLL: ret void
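+; Simplified source (reconstructed from the IR; a memcpy-style loop):
+;   while (n--)
+;     *p++ = *q++;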
+define void @example3(i32 %n, i32* noalias nocapture %p, i32* noalias nocapture %q) nounwind uwtable ssp {
+  %1 = icmp eq i32 %n, 0
+  br i1 %1, label %._crit_edge, label %.lr.ph
+
+.lr.ph:                                           ; preds = %0, %.lr.ph
+  %.05 = phi i32 [ %2, %.lr.ph ], [ %n, %0 ]
+  %.014 = phi i32* [ %5, %.lr.ph ], [ %p, %0 ]
+  %.023 = phi i32* [ %3, %.lr.ph ], [ %q, %0 ]
+  %2 = add nsw i32 %.05, -1
+  %3 = getelementptr inbounds i32, i32* %.023, i64 1
+  %4 = load i32, i32* %.023, align 16
+  %5 = getelementptr inbounds i32, i32* %.014, i64 1
+  store i32 %4, i32* %.014, align 16
+  %6 = icmp eq i32 %2, 0
+  br i1 %6, label %._crit_edge, label %.lr.ph
+
+._crit_edge:                                      ; preds = %.lr.ph, %0
+  ret void
+}
+
+;CHECK-LABEL: @example4(
+;CHECK: load <4 x i32>
+;CHECK: ret void
+;UNROLL-LABEL: @example4(
+;UNROLL: load <4 x i32>
+;UNROLL: load <4 x i32>
+;UNROLL: load <4 x i32>
+;UNROLL: load <4 x i32>
+;UNROLL: ret void
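+; Simplified source (reconstructed from the IR; three separate loops,
+; bounds are approximate):
+;   while (n--)
+;     *p++ = *q++ + 5;
+;   for (i = 0; i < n - 1; i++)
+;     a[i] = b[i+1] + c[i+3];
+;   for (i = 0; i < n - 1; i++)
+;     b[i] = a[i] > 4 ? 4 : 0;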
+define void @example4(i32 %n, i32* noalias nocapture %p, i32* noalias nocapture %q) nounwind uwtable ssp {
+  %1 = add nsw i32 %n, -1
+  %2 = icmp eq i32 %n, 0
+  br i1 %2, label %.preheader4, label %.lr.ph10
+
+.preheader4:                                      ; preds = %0
+  %3 = icmp sgt i32 %1, 0
+  br i1 %3, label %.lr.ph6, label %._crit_edge
+
+.lr.ph10:                                         ; preds = %0, %.lr.ph10
+  %4 = phi i32 [ %9, %.lr.ph10 ], [ %1, %0 ]
+  %.018 = phi i32* [ %8, %.lr.ph10 ], [ %p, %0 ]
+  %.027 = phi i32* [ %5, %.lr.ph10 ], [ %q, %0 ]
+  %5 = getelementptr inbounds i32, i32* %.027, i64 1
+  %6 = load i32, i32* %.027, align 16
+  %7 = add nsw i32 %6, 5
+  %8 = getelementptr inbounds i32, i32* %.018, i64 1
+  store i32 %7, i32* %.018, align 16
+  %9 = add nsw i32 %4, -1
+  %10 = icmp eq i32 %4, 0
+  br i1 %10, label %._crit_edge, label %.lr.ph10
+
+.preheader:                                       ; preds = %.lr.ph6
+  br i1 %3, label %.lr.ph, label %._crit_edge
+
+.lr.ph6:                                          ; preds = %.preheader4, %.lr.ph6
+  %indvars.iv11 = phi i64 [ %indvars.iv.next12, %.lr.ph6 ], [ 0, %.preheader4 ]
+  %indvars.iv.next12 = add i64 %indvars.iv11, 1
+  %11 = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 %indvars.iv.next12
+  %12 = load i32, i32* %11, align 4
+  %13 = add nsw i64 %indvars.iv11, 3
+  %14 = getelementptr inbounds [2048 x i32], [2048 x i32]* @c, i64 0, i64 %13
+  %15 = load i32, i32* %14, align 4
+  %16 = add nsw i32 %15, %12
+  %17 = getelementptr inbounds [2048 x i32], [2048 x i32]* @a, i64 0, i64 %indvars.iv11
+  store i32 %16, i32* %17, align 4
+  %lftr.wideiv13 = trunc i64 %indvars.iv.next12 to i32
+  %exitcond14 = icmp eq i32 %lftr.wideiv13, %1
+  br i1 %exitcond14, label %.preheader, label %.lr.ph6
+
+.lr.ph:                                           ; preds = %.preheader, %.lr.ph
+  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %.preheader ]
+  %18 = getelementptr inbounds [2048 x i32], [2048 x i32]* @a, i64 0, i64 %indvars.iv
+  %19 = load i32, i32* %18, align 4
+  %20 = icmp sgt i32 %19, 4
+  %21 = select i1 %20, i32 4, i32 0
+  %22 = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 %indvars.iv
+  store i32 %21, i32* %22, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %1
+  br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge:                                      ; preds = %.lr.ph10, %.preheader4, %.lr.ph, %.preheader
+  ret void
+}
+
+;CHECK-LABEL: @example8(
+;CHECK: store <4 x i32>
+;CHECK: ret void
+;UNROLL-LABEL: @example8(
+;UNROLL: store <4 x i32>
+;UNROLL: store <4 x i32>
+;UNROLL: store <4 x i32>
+;UNROLL: store <4 x i32>
+;UNROLL: ret void
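+; Simplified source (reconstructed from the IR):
+;   for (int i = 0; i < 32; ++i)
+;     for (int j = 0; j < 1024; ++j)
+;       G[i][j] = x;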
+define void @example8(i32 %x) nounwind uwtable ssp {
+  br label %.preheader
+
+.preheader:                                       ; preds = %3, %0
+  %indvars.iv3 = phi i64 [ 0, %0 ], [ %indvars.iv.next4, %3 ]
+  br label %1
+
+; <label>:1                                       ; preds = %1, %.preheader
+  %indvars.iv = phi i64 [ 0, %.preheader ], [ %indvars.iv.next, %1 ]
+  %2 = getelementptr inbounds [32 x [1024 x i32]], [32 x [1024 x i32]]* @G, i64 0, i64 %indvars.iv3, i64 %indvars.iv
+  store i32 %x, i32* %2, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 1024
+  br i1 %exitcond, label %3, label %1
+
+; <label>:3                                       ; preds = %1
+  %indvars.iv.next4 = add i64 %indvars.iv3, 1
+  %lftr.wideiv5 = trunc i64 %indvars.iv.next4 to i32
+  %exitcond6 = icmp eq i32 %lftr.wideiv5, 32
+  br i1 %exitcond6, label %4, label %.preheader
+
+; <label>:4                                       ; preds = %3
+  ret void
+}
+
+;CHECK-LABEL: @example9(
+;CHECK: phi <4 x i32>
+;CHECK: ret i32
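+; Simplified source (reconstructed from the IR):
+;   int diff = 0;
+;   for (int i = 0; i < 1024; ++i)
+;     diff += ub[i] - uc[i];
+;   return diff;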
+define i32 @example9() nounwind uwtable readonly ssp {
+  br label %1
+
+; <label>:1                                       ; preds = %1, %0
+  %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ]
+  %diff.01 = phi i32 [ 0, %0 ], [ %7, %1 ]
+  %2 = getelementptr inbounds [1024 x i32], [1024 x i32]* @ub, i64 0, i64 %indvars.iv
+  %3 = load i32, i32* %2, align 4
+  %4 = getelementptr inbounds [1024 x i32], [1024 x i32]* @uc, i64 0, i64 %indvars.iv
+  %5 = load i32, i32* %4, align 4
+  %6 = add i32 %3, %diff.01
+  %7 = sub i32 %6, %5
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 1024
+  br i1 %exitcond, label %8, label %1
+
+; <label>:8                                       ; preds = %1
+  ret i32 %7
+}
+
+;CHECK-LABEL: @example10a(
+;CHECK: load <4 x i32>
+;CHECK: add nsw <4 x i32>
+;CHECK: load <4 x i16>
+;CHECK: add <4 x i16>
+;CHECK: store <4 x i16>
+;CHECK: ret void
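+; Simplified source (reconstructed from the IR; int and short loops fused):
+;   for (int i = 0; i < 1024; ++i) {
+;     ia[i] = ib[i] + ic[i];
+;     sa[i] = sb[i] + sc[i];
+;   }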
+define void @example10a(i16* noalias nocapture %sa, i16* noalias nocapture %sb, i16* noalias nocapture %sc, i32* noalias nocapture %ia, i32* noalias nocapture %ib, i32* noalias nocapture %ic) nounwind uwtable ssp {
+  br label %1
+
+; <label>:1                                       ; preds = %1, %0
+  %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ]
+  %2 = getelementptr inbounds i32, i32* %ib, i64 %indvars.iv
+  %3 = load i32, i32* %2, align 4
+  %4 = getelementptr inbounds i32, i32* %ic, i64 %indvars.iv
+  %5 = load i32, i32* %4, align 4
+  %6 = add nsw i32 %5, %3
+  %7 = getelementptr inbounds i32, i32* %ia, i64 %indvars.iv
+  store i32 %6, i32* %7, align 4
+  %8 = getelementptr inbounds i16, i16* %sb, i64 %indvars.iv
+  %9 = load i16, i16* %8, align 2
+  %10 = getelementptr inbounds i16, i16* %sc, i64 %indvars.iv
+  %11 = load i16, i16* %10, align 2
+  %12 = add i16 %11, %9
+  %13 = getelementptr inbounds i16, i16* %sa, i64 %indvars.iv
+  store i16 %12, i16* %13, align 2
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 1024
+  br i1 %exitcond, label %14, label %1
+
+; <label>:14                                      ; preds = %1
+  ret void
+}
+
+;CHECK-LABEL: @example10b(
+;CHECK: load <4 x i16>
+;CHECK: sext <4 x i16>
+;CHECK: store <4 x i32>
+;CHECK: ret void
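+; Simplified source (reconstructed from the IR):
+;   for (int i = 0; i < 1024; ++i)
+;     ia[i] = (int)sb[i];          // sign extension of each element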
+define void @example10b(i16* noalias nocapture %sa, i16* noalias nocapture %sb, i16* noalias nocapture %sc, i32* noalias nocapture %ia, i32* noalias nocapture %ib, i32* noalias nocapture %ic) nounwind uwtable ssp {
+  br label %1
+
+; <label>:1                                       ; preds = %1, %0
+  %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ]
+  %2 = getelementptr inbounds i16, i16* %sb, i64 %indvars.iv
+  %3 = load i16, i16* %2, align 2
+  %4 = sext i16 %3 to i32
+  %5 = getelementptr inbounds i32, i32* %ia, i64 %indvars.iv
+  store i32 %4, i32* %5, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 1024
+  br i1 %exitcond, label %6, label %1
+
+; <label>:6                                       ; preds = %1
+  ret void
+}
+
+;CHECK-LABEL: @example11(
+;CHECK: load i32
+;CHECK: load i32
+;CHECK: load i32
+;CHECK: load i32
+;CHECK: insertelement
+;CHECK: insertelement
+;CHECK: insertelement
+;CHECK: insertelement
+;CHECK: ret void
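+; Simplified source (reconstructed from the IR; a complex-multiply pattern
+; over interleaved even/odd elements):
+;   for (int i = 0; i < 512; ++i) {
+;     a[i] = b[2*i+1] * c[2*i+1] - b[2*i] * c[2*i];
+;     d[i] = b[2*i]   * c[2*i+1] + b[2*i+1] * c[2*i];
+;   }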
+define void @example11() nounwind uwtable ssp {
+  br label %1
+
+; <label>:1                                       ; preds = %1, %0
+  %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ]
+  %2 = shl nsw i64 %indvars.iv, 1
+  %3 = or i64 %2, 1
+  %4 = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 %3
+  %5 = load i32, i32* %4, align 4
+  %6 = getelementptr inbounds [2048 x i32], [2048 x i32]* @c, i64 0, i64 %3
+  %7 = load i32, i32* %6, align 4
+  %8 = mul nsw i32 %7, %5
+  %9 = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 %2
+  %10 = load i32, i32* %9, align 8
+  %11 = getelementptr inbounds [2048 x i32], [2048 x i32]* @c, i64 0, i64 %2
+  %12 = load i32, i32* %11, align 8
+  %13 = mul nsw i32 %12, %10
+  %14 = sub nsw i32 %8, %13
+  %15 = getelementptr inbounds [2048 x i32], [2048 x i32]* @a, i64 0, i64 %indvars.iv
+  store i32 %14, i32* %15, align 4
+  %16 = mul nsw i32 %7, %10
+  %17 = mul nsw i32 %12, %5
+  %18 = add nsw i32 %17, %16
+  %19 = getelementptr inbounds [2048 x i32], [2048 x i32]* @d, i64 0, i64 %indvars.iv
+  store i32 %18, i32* %19, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 512
+  br i1 %exitcond, label %20, label %1
+
+; <label>:20                                      ; preds = %1
+  ret void
+}
+
+;CHECK-LABEL: @example12(
+;CHECK: %vec.ind1 = phi <4 x i32>
+;CHECK: store <4 x i32>
+;CHECK: ret void
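+; Simplified source (reconstructed from the IR; the induction variable
+; itself is stored, hence the vector induction phi):
+;   for (int i = 0; i < 1024; ++i)
+;     a[i] = i;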
+define void @example12() nounwind uwtable ssp {
+  br label %1
+
+; <label>:1                                       ; preds = %1, %0
+  %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ]
+  %2 = getelementptr inbounds [2048 x i32], [2048 x i32]* @a, i64 0, i64 %indvars.iv
+  %3 = trunc i64 %indvars.iv to i32
+  store i32 %3, i32* %2, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 1024
+  br i1 %exitcond, label %4, label %1
+
+; <label>:4                                       ; preds = %1
+  ret void
+}
+
+;CHECK-LABEL: @example13(
+;CHECK: <4 x i32>
+;CHECK: ret void
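+; Simplified source (reconstructed from the IR; note the inner stride of 8):
+;   for (int i = 0; i < 32; ++i) {
+;     int diff = 0;
+;     for (int j = 0; j < 1024; j += 8)
+;       diff += A[i][j] - B[i][j];
+;     out[i] = diff;
+;   }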
+define void @example13(i32** nocapture %A, i32** nocapture %B, i32* nocapture %out) nounwind uwtable ssp {
+  br label %.preheader
+
+.preheader:                                       ; preds = %14, %0
+  %indvars.iv4 = phi i64 [ 0, %0 ], [ %indvars.iv.next5, %14 ]
+  %1 = getelementptr inbounds i32*, i32** %A, i64 %indvars.iv4
+  %2 = load i32*, i32** %1, align 8
+  %3 = getelementptr inbounds i32*, i32** %B, i64 %indvars.iv4
+  %4 = load i32*, i32** %3, align 8
+  br label %5
+
+; <label>:5                                       ; preds = %.preheader, %5
+  %indvars.iv = phi i64 [ 0, %.preheader ], [ %indvars.iv.next, %5 ]
+  %diff.02 = phi i32 [ 0, %.preheader ], [ %11, %5 ]
+  %6 = getelementptr inbounds i32, i32* %2, i64 %indvars.iv
+  %7 = load i32, i32* %6, align 4
+  %8 = getelementptr inbounds i32, i32* %4, i64 %indvars.iv
+  %9 = load i32, i32* %8, align 4
+  %10 = add i32 %7, %diff.02
+  %11 = sub i32 %10, %9
+  %indvars.iv.next = add i64 %indvars.iv, 8
+  %12 = trunc i64 %indvars.iv.next to i32
+  %13 = icmp slt i32 %12, 1024
+  br i1 %13, label %5, label %14
+
+; <label>:14                                      ; preds = %5
+  %15 = getelementptr inbounds i32, i32* %out, i64 %indvars.iv4
+  store i32 %11, i32* %15, align 4
+  %indvars.iv.next5 = add i64 %indvars.iv4, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next5 to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 32
+  br i1 %exitcond, label %16, label %.preheader
+
+; <label>:16                                      ; preds = %14
+  ret void
+}
+
+; Can vectorize.
+;CHECK-LABEL: @example14(
+;CHECK: <4 x i32>
+;CHECK: ret void
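+; Simplified source (reconstructed from the IR; the outer k-loop is fully
+; unrolled below, k = 0..3):
+;   for (int k = 0; k < 4; ++k) {
+;     int sum = 0;
+;     for (int j = 0; j < 32; ++j)
+;       for (int i = 0; i < 1024; ++i)
+;         sum += in[i+k][j] * coeff[i][j];
+;     out[k] = sum;
+;   }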
+define void @example14(i32** nocapture %in, i32** nocapture %coeff, i32* nocapture %out) nounwind uwtable ssp {
+.preheader3:
+  br label %.preheader
+
+.preheader:                                       ; preds = %11, %.preheader3
+  %indvars.iv7 = phi i64 [ 0, %.preheader3 ], [ %indvars.iv.next8, %11 ]
+  %sum.05 = phi i32 [ 0, %.preheader3 ], [ %10, %11 ]
+  br label %0
+
+; <label>:0                                       ; preds = %0, %.preheader
+  %indvars.iv = phi i64 [ 0, %.preheader ], [ %indvars.iv.next, %0 ]
+  %sum.12 = phi i32 [ %sum.05, %.preheader ], [ %10, %0 ]
+  %1 = getelementptr inbounds i32*, i32** %in, i64 %indvars.iv
+  %2 = load i32*, i32** %1, align 8
+  %3 = getelementptr inbounds i32, i32* %2, i64 %indvars.iv7
+  %4 = load i32, i32* %3, align 4
+  %5 = getelementptr inbounds i32*, i32** %coeff, i64 %indvars.iv
+  %6 = load i32*, i32** %5, align 8
+  %7 = getelementptr inbounds i32, i32* %6, i64 %indvars.iv7
+  %8 = load i32, i32* %7, align 4
+  %9 = mul nsw i32 %8, %4
+  %10 = add nsw i32 %9, %sum.12
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 1024
+  br i1 %exitcond, label %11, label %0
+
+; <label>:11                                      ; preds = %0
+  %indvars.iv.next8 = add i64 %indvars.iv7, 1
+  %lftr.wideiv9 = trunc i64 %indvars.iv.next8 to i32
+  %exitcond10 = icmp eq i32 %lftr.wideiv9, 32
+  br i1 %exitcond10, label %.preheader3.1, label %.preheader
+
+.preheader3.1:                                    ; preds = %11
+  store i32 %10, i32* %out, align 4
+  br label %.preheader.1
+
+.preheader.1:                                     ; preds = %24, %.preheader3.1
+  %indvars.iv7.1 = phi i64 [ 0, %.preheader3.1 ], [ %indvars.iv.next8.1, %24 ]
+  %sum.05.1 = phi i32 [ 0, %.preheader3.1 ], [ %23, %24 ]
+  br label %12
+
+; <label>:12                                      ; preds = %12, %.preheader.1
+  %indvars.iv.1 = phi i64 [ 0, %.preheader.1 ], [ %13, %12 ]
+  %sum.12.1 = phi i32 [ %sum.05.1, %.preheader.1 ], [ %23, %12 ]
+  %13 = add nsw i64 %indvars.iv.1, 1
+  %14 = getelementptr inbounds i32*, i32** %in, i64 %13
+  %15 = load i32*, i32** %14, align 8
+  %16 = getelementptr inbounds i32, i32* %15, i64 %indvars.iv7.1
+  %17 = load i32, i32* %16, align 4
+  %18 = getelementptr inbounds i32*, i32** %coeff, i64 %indvars.iv.1
+  %19 = load i32*, i32** %18, align 8
+  %20 = getelementptr inbounds i32, i32* %19, i64 %indvars.iv7.1
+  %21 = load i32, i32* %20, align 4
+  %22 = mul nsw i32 %21, %17
+  %23 = add nsw i32 %22, %sum.12.1
+  %lftr.wideiv.1 = trunc i64 %13 to i32
+  %exitcond.1 = icmp eq i32 %lftr.wideiv.1, 1024
+  br i1 %exitcond.1, label %24, label %12
+
+; <label>:24                                      ; preds = %12
+  %indvars.iv.next8.1 = add i64 %indvars.iv7.1, 1
+  %lftr.wideiv9.1 = trunc i64 %indvars.iv.next8.1 to i32
+  %exitcond10.1 = icmp eq i32 %lftr.wideiv9.1, 32
+  br i1 %exitcond10.1, label %.preheader3.2, label %.preheader.1
+
+.preheader3.2:                                    ; preds = %24
+  %25 = getelementptr inbounds i32, i32* %out, i64 1
+  store i32 %23, i32* %25, align 4
+  br label %.preheader.2
+
+.preheader.2:                                     ; preds = %38, %.preheader3.2
+  %indvars.iv7.2 = phi i64 [ 0, %.preheader3.2 ], [ %indvars.iv.next8.2, %38 ]
+  %sum.05.2 = phi i32 [ 0, %.preheader3.2 ], [ %37, %38 ]
+  br label %26
+
+; <label>:26                                      ; preds = %26, %.preheader.2
+  %indvars.iv.2 = phi i64 [ 0, %.preheader.2 ], [ %indvars.iv.next.2, %26 ]
+  %sum.12.2 = phi i32 [ %sum.05.2, %.preheader.2 ], [ %37, %26 ]
+  %27 = add nsw i64 %indvars.iv.2, 2
+  %28 = getelementptr inbounds i32*, i32** %in, i64 %27
+  %29 = load i32*, i32** %28, align 8
+  %30 = getelementptr inbounds i32, i32* %29, i64 %indvars.iv7.2
+  %31 = load i32, i32* %30, align 4
+  %32 = getelementptr inbounds i32*, i32** %coeff, i64 %indvars.iv.2
+  %33 = load i32*, i32** %32, align 8
+  %34 = getelementptr inbounds i32, i32* %33, i64 %indvars.iv7.2
+  %35 = load i32, i32* %34, align 4
+  %36 = mul nsw i32 %35, %31
+  %37 = add nsw i32 %36, %sum.12.2
+  %indvars.iv.next.2 = add i64 %indvars.iv.2, 1
+  %lftr.wideiv.2 = trunc i64 %indvars.iv.next.2 to i32
+  %exitcond.2 = icmp eq i32 %lftr.wideiv.2, 1024
+  br i1 %exitcond.2, label %38, label %26
+
+; <label>:38                                      ; preds = %26
+  %indvars.iv.next8.2 = add i64 %indvars.iv7.2, 1
+  %lftr.wideiv9.2 = trunc i64 %indvars.iv.next8.2 to i32
+  %exitcond10.2 = icmp eq i32 %lftr.wideiv9.2, 32
+  br i1 %exitcond10.2, label %.preheader3.3, label %.preheader.2
+
+.preheader3.3:                                    ; preds = %38
+  %39 = getelementptr inbounds i32, i32* %out, i64 2
+  store i32 %37, i32* %39, align 4
+  br label %.preheader.3
+
+.preheader.3:                                     ; preds = %52, %.preheader3.3
+  %indvars.iv7.3 = phi i64 [ 0, %.preheader3.3 ], [ %indvars.iv.next8.3, %52 ]
+  %sum.05.3 = phi i32 [ 0, %.preheader3.3 ], [ %51, %52 ]
+  br label %40
+
+; <label>:40                                      ; preds = %40, %.preheader.3
+  %indvars.iv.3 = phi i64 [ 0, %.preheader.3 ], [ %indvars.iv.next.3, %40 ]
+  %sum.12.3 = phi i32 [ %sum.05.3, %.preheader.3 ], [ %51, %40 ]
+  %41 = add nsw i64 %indvars.iv.3, 3
+  %42 = getelementptr inbounds i32*, i32** %in, i64 %41
+  %43 = load i32*, i32** %42, align 8
+  %44 = getelementptr inbounds i32, i32* %43, i64 %indvars.iv7.3
+  %45 = load i32, i32* %44, align 4
+  %46 = getelementptr inbounds i32*, i32** %coeff, i64 %indvars.iv.3
+  %47 = load i32*, i32** %46, align 8
+  %48 = getelementptr inbounds i32, i32* %47, i64 %indvars.iv7.3
+  %49 = load i32, i32* %48, align 4
+  %50 = mul nsw i32 %49, %45
+  %51 = add nsw i32 %50, %sum.12.3
+  %indvars.iv.next.3 = add i64 %indvars.iv.3, 1
+  %lftr.wideiv.3 = trunc i64 %indvars.iv.next.3 to i32
+  %exitcond.3 = icmp eq i32 %lftr.wideiv.3, 1024
+  br i1 %exitcond.3, label %52, label %40
+
+; <label>:52                                      ; preds = %40
+  %indvars.iv.next8.3 = add i64 %indvars.iv7.3, 1
+  %lftr.wideiv9.3 = trunc i64 %indvars.iv.next8.3 to i32
+  %exitcond10.3 = icmp eq i32 %lftr.wideiv9.3, 32
+  br i1 %exitcond10.3, label %53, label %.preheader.3
+
+; <label>:53                                      ; preds = %52
+  %54 = getelementptr inbounds i32, i32* %out, i64 3
+  store i32 %51, i32* %54, align 4
+  ret void
+}
+
+;CHECK-LABEL: @example21(
+;CHECK: load <4 x i32>
+;CHECK: shufflevector {{.*}} <i32 3, i32 2, i32 1, i32 0>
+;CHECK: ret i32
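+; Simplified source (reconstructed from the IR; a reverse loop, hence the
+; reversing shufflevector in the vector body):
+;   int a = 0;
+;   for (int i = n - 1; i >= 0; --i)
+;     a += b[i];
+;   return a;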
+define i32 @example21(i32* nocapture %b, i32 %n) nounwind uwtable readonly ssp {
+  %1 = icmp sgt i32 %n, 0
+  br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph:                                           ; preds = %0
+  %2 = sext i32 %n to i64
+  br label %3
+
+; <label>:3                                       ; preds = %.lr.ph, %3
+  %indvars.iv = phi i64 [ %2, %.lr.ph ], [ %indvars.iv.next, %3 ]
+  %a.02 = phi i32 [ 0, %.lr.ph ], [ %6, %3 ]
+  %indvars.iv.next = add i64 %indvars.iv, -1
+  %4 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv.next
+  %5 = load i32, i32* %4, align 4
+  %6 = add nsw i32 %5, %a.02
+  %7 = trunc i64 %indvars.iv.next to i32
+  %8 = icmp sgt i32 %7, 0
+  br i1 %8, label %3, label %._crit_edge
+
+._crit_edge:                                      ; preds = %3, %0
+  %a.0.lcssa = phi i32 [ 0, %0 ], [ %6, %3 ]
+  ret i32 %a.0.lcssa
+}
+
+;CHECK-LABEL: @example23(
+;CHECK: <4 x i32>
+;CHECK: ret void
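+; Simplified source (reconstructed from the IR; widening u16 -> i32):
+;   unsigned short *src; int *dst;
+;   for (int i = 0; i < 256; ++i)
+;     *dst++ = *src++ << 7;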
+define void @example23(i16* nocapture %src, i32* nocapture %dst) nounwind uwtable ssp {
+  br label %1
+
+; <label>:1                                       ; preds = %1, %0
+  %.04 = phi i16* [ %src, %0 ], [ %2, %1 ]
+  %.013 = phi i32* [ %dst, %0 ], [ %6, %1 ]
+  %i.02 = phi i32 [ 0, %0 ], [ %7, %1 ]
+  %2 = getelementptr inbounds i16, i16* %.04, i64 1
+  %3 = load i16, i16* %.04, align 2
+  %4 = zext i16 %3 to i32
+  %5 = shl nuw nsw i32 %4, 7
+  %6 = getelementptr inbounds i32, i32* %.013, i64 1
+  store i32 %5, i32* %.013, align 4
+  %7 = add nsw i32 %i.02, 1
+  %exitcond = icmp eq i32 %7, 256
+  br i1 %exitcond, label %8, label %1
+
+; <label>:8                                       ; preds = %1
+  ret void
+}
+
+;CHECK-LABEL: @example24(
+;CHECK: shufflevector <4 x i16>
+;CHECK: ret void
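+; Simplified source (reconstructed from the IR; a select on i16 values):
+;   for (int i = 0; i < 1024; ++i)
+;     ic[i] = fa[i] < fb[i] ? x : y;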
+define void @example24(i16 signext %x, i16 signext %y) nounwind uwtable ssp {
+  br label %1
+
+; <label>:1                                       ; preds = %1, %0
+  %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ]
+  %2 = getelementptr inbounds [1024 x float], [1024 x float]* @fa, i64 0, i64 %indvars.iv
+  %3 = load float, float* %2, align 4
+  %4 = getelementptr inbounds [1024 x float], [1024 x float]* @fb, i64 0, i64 %indvars.iv
+  %5 = load float, float* %4, align 4
+  %6 = fcmp olt float %3, %5
+  %x.y = select i1 %6, i16 %x, i16 %y
+  %7 = sext i16 %x.y to i32
+  %8 = getelementptr inbounds [1024 x i32], [1024 x i32]* @ic, i64 0, i64 %indvars.iv
+  store i32 %7, i32* %8, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 1024
+  br i1 %exitcond, label %9, label %1
+
+; <label>:9                                       ; preds = %1
+  ret void
+}
+
+;CHECK-LABEL: @example25(
+;CHECK: and <4 x i1>
+;CHECK: zext <4 x i1>
+;CHECK: ret void
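+; Simplified source (reconstructed from the IR; two compares combined):
+;   for (int i = 0; i < 1024; ++i)
+;     dj[i] = (da[i] < db[i]) & (dc[i] < dd[i]);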
+define void @example25() nounwind uwtable ssp {
+  br label %1
+
+; <label>:1                                       ; preds = %1, %0
+  %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ]
+  %2 = getelementptr inbounds [1024 x float], [1024 x float]* @da, i64 0, i64 %indvars.iv
+  %3 = load float, float* %2, align 4
+  %4 = getelementptr inbounds [1024 x float], [1024 x float]* @db, i64 0, i64 %indvars.iv
+  %5 = load float, float* %4, align 4
+  %6 = fcmp olt float %3, %5
+  %7 = getelementptr inbounds [1024 x float], [1024 x float]* @dc, i64 0, i64 %indvars.iv
+  %8 = load float, float* %7, align 4
+  %9 = getelementptr inbounds [1024 x float], [1024 x float]* @dd, i64 0, i64 %indvars.iv
+  %10 = load float, float* %9, align 4
+  %11 = fcmp olt float %8, %10
+  %12 = and i1 %6, %11
+  %13 = zext i1 %12 to i32
+  %14 = getelementptr inbounds [1024 x i32], [1024 x i32]* @dj, i64 0, i64 %indvars.iv
+  store i32 %13, i32* %14, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 1024
+  br i1 %exitcond, label %15, label %1
+
+; <label>:15                                      ; preds = %1
+  ret void
+}
+

Added: llvm/trunk/test/Transforms/LoopVectorize/gep_with_bitcast.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/gep_with_bitcast.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/gep_with_bitcast.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/gep_with_bitcast.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,41 @@
+; RUN: opt -S -loop-vectorize -instcombine -force-vector-width=4  < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+
+; Vectorization of a loop with a bitcast between the GEP and the load.
+; Simplified source code:
+; void foo(double** __restrict__ in, bool* __restrict__ res) {
+;
+;  for (int i = 0; i < 4096; ++i)
+;    res[i] = ((unsigned long long)in[i] == 0);
+;}
+
+; CHECK-LABEL: @foo
+; CHECK: vector.body
+; CHECK:  %[[IV:.+]] = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; CHECK:  %[[v0:.+]] = getelementptr inbounds double*, double** %in, i64 %[[IV]]
+; CHECK:  %[[v1:.+]] = bitcast double** %[[v0]] to <4 x i64>*
+; CHECK:  %wide.load = load <4 x i64>, <4 x i64>* %[[v1]], align 8
+; CHECK:  icmp eq <4 x i64> %wide.load, zeroinitializer
+; CHECK:  br i1
+
+define void @foo(double** noalias nocapture readonly %in, double** noalias nocapture readnone %out, i8* noalias nocapture %res) #0 {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds double*, double** %in, i64 %indvars.iv
+  %tmp53 = bitcast double** %arrayidx to i64*
+  %tmp54 = load i64, i64* %tmp53, align 8
+  %cmp1 = icmp eq i64 %tmp54, 0
+  %arrayidx3 = getelementptr inbounds i8, i8* %res, i64 %indvars.iv
+  %frombool = zext i1 %cmp1 to i8
+  store i8 %frombool, i8* %arrayidx3, align 1
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 4096
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}

Added: llvm/trunk/test/Transforms/LoopVectorize/global_alias.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/global_alias.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/global_alias.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/global_alias.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,1077 @@
+; RUN: opt < %s -O1 -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:64:128-a0:0:64-n32-S64"
+
+%struct.anon = type { [100 x i32], i32, [100 x i32] }
+%struct.anon.0 = type { [100 x [100 x i32]], i32, [100 x [100 x i32]] }
+
+ at Foo = common global %struct.anon zeroinitializer, align 4
+ at Bar = common global %struct.anon.0 zeroinitializer, align 4
+
+ at PB = external global i32*
+ at PA = external global i32*
+
+
+;; === First, the tests that should always vectorize, whether statically or by adding run-time checks ===
+
+
+; /// Different objects, positive induction, constant distance
+; int noAlias01 (int a) {
+;   int i;
+;   for (i=0; i<SIZE; i++)
+;     Foo.A[i] = Foo.B[i] + a;
+;   return Foo.A[a];
+; }
+; CHECK-LABEL: define i32 @noAlias01(
+; CHECK: add nsw <4 x i32>
+; CHECK: ret
+
+define i32 @noAlias01(i32 %a) nounwind {
+entry:
+  %a.addr = alloca i32, align 4
+  %i = alloca i32, align 4
+  store i32 %a, i32* %a.addr, align 4
+  store i32 0, i32* %i, align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %0 = load i32, i32* %i, align 4
+  %cmp = icmp slt i32 %0, 100
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %1 = load i32, i32* %i, align 4
+  %arrayidx = getelementptr inbounds [100 x i32], [100 x i32]* getelementptr inbounds (%struct.anon, %struct.anon* @Foo, i32 0, i32 2), i32 0, i32 %1
+  %2 = load i32, i32* %arrayidx, align 4
+  %3 = load i32, i32* %a.addr, align 4
+  %add = add nsw i32 %2, %3
+  %4 = load i32, i32* %i, align 4
+  %arrayidx1 = getelementptr inbounds [100 x i32], [100 x i32]* getelementptr inbounds (%struct.anon, %struct.anon* @Foo, i32 0, i32 0), i32 0, i32 %4
+  store i32 %add, i32* %arrayidx1, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body
+  %5 = load i32, i32* %i, align 4
+  %inc = add nsw i32 %5, 1
+  store i32 %inc, i32* %i, align 4
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  %6 = load i32, i32* %a.addr, align 4
+  %arrayidx2 = getelementptr inbounds [100 x i32], [100 x i32]* getelementptr inbounds (%struct.anon, %struct.anon* @Foo, i32 0, i32 0), i32 0, i32 %6
+  %7 = load i32, i32* %arrayidx2, align 4
+  ret i32 %7
+}
+
+; /// Different objects, positive induction with widening slide
+; int noAlias02 (int a) {
+;   int i;
+;   for (i=0; i<SIZE-10; i++)
+;     Foo.A[i] = Foo.B[i+10] + a;
+;   return Foo.A[a];
+; }
+; CHECK-LABEL: define i32 @noAlias02(
+; CHECK: add nsw <4 x i32>
+; CHECK: ret
+
+define i32 @noAlias02(i32 %a) {
+entry:
+  %a.addr = alloca i32, align 4
+  %i = alloca i32, align 4
+  store i32 %a, i32* %a.addr, align 4
+  store i32 0, i32* %i, align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %0 = load i32, i32* %i, align 4
+  %cmp = icmp slt i32 %0, 90
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %1 = load i32, i32* %i, align 4
+  %add = add nsw i32 %1, 10
+  %arrayidx = getelementptr inbounds [100 x i32], [100 x i32]* getelementptr inbounds (%struct.anon, %struct.anon* @Foo, i32 0, i32 2), i32 0, i32 %add
+  %2 = load i32, i32* %arrayidx, align 4
+  %3 = load i32, i32* %a.addr, align 4
+  %add1 = add nsw i32 %2, %3
+  %4 = load i32, i32* %i, align 4
+  %arrayidx2 = getelementptr inbounds [100 x i32], [100 x i32]* getelementptr inbounds (%struct.anon, %struct.anon* @Foo, i32 0, i32 0), i32 0, i32 %4
+  store i32 %add1, i32* %arrayidx2, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body
+  %5 = load i32, i32* %i, align 4
+  %inc = add nsw i32 %5, 1
+  store i32 %inc, i32* %i, align 4
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  %6 = load i32, i32* %a.addr, align 4
+  %arrayidx3 = getelementptr inbounds [100 x i32], [100 x i32]* getelementptr inbounds (%struct.anon, %struct.anon* @Foo, i32 0, i32 0), i32 0, i32 %6
+  %7 = load i32, i32* %arrayidx3, align 4
+  ret i32 %7
+}
+
+; /// Different objects, positive induction with shortening slide
+; int noAlias03 (int a) {
+;   int i;
+;   for (i=0; i<SIZE; i++)
+;     Foo.A[i+10] = Foo.B[i] + a;
+;   return Foo.A[a];
+; }
+; CHECK-LABEL: define i32 @noAlias03(
+; CHECK: add nsw <4 x i32>
+; CHECK: ret
+
+define i32 @noAlias03(i32 %a) {
+entry:
+  %a.addr = alloca i32, align 4
+  %i = alloca i32, align 4
+  store i32 %a, i32* %a.addr, align 4
+  store i32 0, i32* %i, align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %0 = load i32, i32* %i, align 4
+  %cmp = icmp slt i32 %0, 100
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %1 = load i32, i32* %i, align 4
+  %arrayidx = getelementptr inbounds [100 x i32], [100 x i32]* getelementptr inbounds (%struct.anon, %struct.anon* @Foo, i32 0, i32 2), i32 0, i32 %1
+  %2 = load i32, i32* %arrayidx, align 4
+  %3 = load i32, i32* %a.addr, align 4
+  %add = add nsw i32 %2, %3
+  %4 = load i32, i32* %i, align 4
+  %add1 = add nsw i32 %4, 10
+  %arrayidx2 = getelementptr inbounds [100 x i32], [100 x i32]* getelementptr inbounds (%struct.anon, %struct.anon* @Foo, i32 0, i32 0), i32 0, i32 %add1
+  store i32 %add, i32* %arrayidx2, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body
+  %5 = load i32, i32* %i, align 4
+  %inc = add nsw i32 %5, 1
+  store i32 %inc, i32* %i, align 4
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  %6 = load i32, i32* %a.addr, align 4
+  %arrayidx3 = getelementptr inbounds [100 x i32], [100 x i32]* getelementptr inbounds (%struct.anon, %struct.anon* @Foo, i32 0, i32 0), i32 0, i32 %6
+  %7 = load i32, i32* %arrayidx3, align 4
+  ret i32 %7
+}
+
+; /// Pointer access, positive stride, run-time check added
+; int noAlias04 (int a) {
+;   int i;
+;   for (i=0; i<SIZE; i++)
+;     *(PA+i) = *(PB+i) + a;
+;   return *(PA+a);
+; }
+; CHECK-LABEL: define i32 @noAlias04(
+; CHECK-NOT: add nsw <4 x i32>
+; CHECK: ret
+;
+; TODO: This test vectorizes (with a run-time check) on real targets with -O3.
+; Check why it is not vectorized here even when vectorization is forced.
+
+define i32 @noAlias04(i32 %a) #0 {
+entry:
+  %a.addr = alloca i32, align 4
+  %i = alloca i32, align 4
+  store i32 %a, i32* %a.addr, align 4
+  store i32 0, i32* %i, align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %0 = load i32, i32* %i, align 4
+  %cmp = icmp slt i32 %0, 100
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %1 = load i32*, i32** @PB, align 4
+  %2 = load i32, i32* %i, align 4
+  %add.ptr = getelementptr inbounds i32, i32* %1, i32 %2
+  %3 = load i32, i32* %add.ptr, align 4
+  %4 = load i32, i32* %a.addr, align 4
+  %add = add nsw i32 %3, %4
+  %5 = load i32*, i32** @PA, align 4
+  %6 = load i32, i32* %i, align 4
+  %add.ptr1 = getelementptr inbounds i32, i32* %5, i32 %6
+  store i32 %add, i32* %add.ptr1, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body
+  %7 = load i32, i32* %i, align 4
+  %inc = add nsw i32 %7, 1
+  store i32 %inc, i32* %i, align 4
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  %8 = load i32*, i32** @PA, align 4
+  %9 = load i32, i32* %a.addr, align 4
+  %add.ptr2 = getelementptr inbounds i32, i32* %8, i32 %9
+  %10 = load i32, i32* %add.ptr2, align 4
+  ret i32 %10
+}
+
+; /// Different objects, positive induction, multi-array
+; int noAlias05 (int a) {
+;   int i, N=10;
+;   for (i=0; i<SIZE; i++)
+;     Bar.A[N][i] = Bar.B[N][i] + a;
+;   return Bar.A[N][a];
+; }
+; CHECK-LABEL: define i32 @noAlias05(
+; CHECK: add nsw <4 x i32>
+; CHECK: ret
+
+define i32 @noAlias05(i32 %a) #0 {
+entry:
+  %a.addr = alloca i32, align 4
+  %i = alloca i32, align 4
+  %N = alloca i32, align 4
+  store i32 %a, i32* %a.addr, align 4
+  store i32 10, i32* %N, align 4
+  store i32 0, i32* %i, align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %0 = load i32, i32* %i, align 4
+  %cmp = icmp slt i32 %0, 100
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %1 = load i32, i32* %i, align 4
+  %2 = load i32, i32* %N, align 4
+  %arrayidx = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* getelementptr inbounds (%struct.anon.0, %struct.anon.0* @Bar, i32 0, i32 2), i32 0, i32 %2
+  %arrayidx1 = getelementptr inbounds [100 x i32], [100 x i32]* %arrayidx, i32 0, i32 %1
+  %3 = load i32, i32* %arrayidx1, align 4
+  %4 = load i32, i32* %a.addr, align 4
+  %add = add nsw i32 %3, %4
+  %5 = load i32, i32* %i, align 4
+  %6 = load i32, i32* %N, align 4
+  %arrayidx2 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* getelementptr inbounds (%struct.anon.0, %struct.anon.0* @Bar, i32 0, i32 0), i32 0, i32 %6
+  %arrayidx3 = getelementptr inbounds [100 x i32], [100 x i32]* %arrayidx2, i32 0, i32 %5
+  store i32 %add, i32* %arrayidx3, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body
+  %7 = load i32, i32* %i, align 4
+  %inc = add nsw i32 %7, 1
+  store i32 %inc, i32* %i, align 4
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  %8 = load i32, i32* %a.addr, align 4
+  %9 = load i32, i32* %N, align 4
+  %arrayidx4 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* getelementptr inbounds (%struct.anon.0, %struct.anon.0* @Bar, i32 0, i32 0), i32 0, i32 %9
+  %arrayidx5 = getelementptr inbounds [100 x i32], [100 x i32]* %arrayidx4, i32 0, i32 %8
+  %10 = load i32, i32* %arrayidx5, align 4
+  ret i32 %10
+}
+
+; /// Same objects, positive induction, multi-array, different sub-elements
+; int noAlias06 (int a) {
+;   int i, N=10;
+;   for (i=0; i<SIZE; i++)
+;     Bar.A[N][i] = Bar.A[N+1][i] + a;
+;   return Bar.A[N][a];
+; }
+; CHECK-LABEL: define i32 @noAlias06(
+; CHECK: add nsw <4 x i32>
+; CHECK: ret
+
+define i32 @noAlias06(i32 %a) #0 {
+entry:
+  %a.addr = alloca i32, align 4
+  %i = alloca i32, align 4
+  %N = alloca i32, align 4
+  store i32 %a, i32* %a.addr, align 4
+  store i32 10, i32* %N, align 4
+  store i32 0, i32* %i, align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %0 = load i32, i32* %i, align 4
+  %cmp = icmp slt i32 %0, 100
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %1 = load i32, i32* %i, align 4
+  %2 = load i32, i32* %N, align 4
+  %add = add nsw i32 %2, 1
+  %arrayidx = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* getelementptr inbounds (%struct.anon.0, %struct.anon.0* @Bar, i32 0, i32 0), i32 0, i32 %add
+  %arrayidx1 = getelementptr inbounds [100 x i32], [100 x i32]* %arrayidx, i32 0, i32 %1
+  %3 = load i32, i32* %arrayidx1, align 4
+  %4 = load i32, i32* %a.addr, align 4
+  %add2 = add nsw i32 %3, %4
+  %5 = load i32, i32* %i, align 4
+  %6 = load i32, i32* %N, align 4
+  %arrayidx3 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* getelementptr inbounds (%struct.anon.0, %struct.anon.0* @Bar, i32 0, i32 0), i32 0, i32 %6
+  %arrayidx4 = getelementptr inbounds [100 x i32], [100 x i32]* %arrayidx3, i32 0, i32 %5
+  store i32 %add2, i32* %arrayidx4, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body
+  %7 = load i32, i32* %i, align 4
+  %inc = add nsw i32 %7, 1
+  store i32 %inc, i32* %i, align 4
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  %8 = load i32, i32* %a.addr, align 4
+  %9 = load i32, i32* %N, align 4
+  %arrayidx5 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* getelementptr inbounds (%struct.anon.0, %struct.anon.0* @Bar, i32 0, i32 0), i32 0, i32 %9
+  %arrayidx6 = getelementptr inbounds [100 x i32], [100 x i32]* %arrayidx5, i32 0, i32 %8
+  %10 = load i32, i32* %arrayidx6, align 4
+  ret i32 %10
+}
+
+; /// Different objects, negative induction, constant distance
+; int noAlias07 (int a) {
+;   int i;
+;   for (i=0; i<SIZE; i++)
+;     Foo.A[SIZE-i-1] = Foo.B[SIZE-i-1] + a;
+;   return Foo.A[a];
+; }
+; CHECK-LABEL: define i32 @noAlias07(
+; CHECK: store <4 x i32>
+; CHECK: ret
+define i32 @noAlias07(i32 %a) #0 {
+entry:
+  %a.addr = alloca i32, align 4
+  %i = alloca i32, align 4
+  store i32 %a, i32* %a.addr, align 4
+  store i32 0, i32* %i, align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %0 = load i32, i32* %i, align 4
+  %cmp = icmp slt i32 %0, 100
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %1 = load i32, i32* %i, align 4
+  %sub = sub nsw i32 100, %1
+  %sub1 = sub nsw i32 %sub, 1
+  %arrayidx = getelementptr inbounds [100 x i32], [100 x i32]* getelementptr inbounds (%struct.anon, %struct.anon* @Foo, i32 0, i32 2), i32 0, i32 %sub1
+  %2 = load i32, i32* %arrayidx, align 4
+  %3 = load i32, i32* %a.addr, align 4
+  %add = add nsw i32 %2, %3
+  %4 = load i32, i32* %i, align 4
+  %sub2 = sub nsw i32 100, %4
+  %sub3 = sub nsw i32 %sub2, 1
+  %arrayidx4 = getelementptr inbounds [100 x i32], [100 x i32]* getelementptr inbounds (%struct.anon, %struct.anon* @Foo, i32 0, i32 0), i32 0, i32 %sub3
+  store i32 %add, i32* %arrayidx4, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body
+  %5 = load i32, i32* %i, align 4
+  %inc = add nsw i32 %5, 1
+  store i32 %inc, i32* %i, align 4
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  %6 = load i32, i32* %a.addr, align 4
+  %arrayidx5 = getelementptr inbounds [100 x i32], [100 x i32]* getelementptr inbounds (%struct.anon, %struct.anon* @Foo, i32 0, i32 0), i32 0, i32 %6
+  %7 = load i32, i32* %arrayidx5, align 4
+  ret i32 %7
+}
+
+; /// Different objects, negative induction, shortening slide
+; int noAlias08 (int a) {
+;   int i;
+;   for (i=0; i<SIZE-10; i++)
+;     Foo.A[SIZE-i-1] = Foo.B[SIZE-i-10] + a;
+;   return Foo.A[a];
+; }
+; CHECK-LABEL: define i32 @noAlias08(
+; CHECK: load <4 x i32>
+; CHECK: ret
+
+define i32 @noAlias08(i32 %a) #0 {
+entry:
+  %a.addr = alloca i32, align 4
+  %i = alloca i32, align 4
+  store i32 %a, i32* %a.addr, align 4
+  store i32 0, i32* %i, align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %0 = load i32, i32* %i, align 4
+  %cmp = icmp slt i32 %0, 90
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %1 = load i32, i32* %i, align 4
+  %sub = sub nsw i32 100, %1
+  %sub1 = sub nsw i32 %sub, 10
+  %arrayidx = getelementptr inbounds [100 x i32], [100 x i32]* getelementptr inbounds (%struct.anon, %struct.anon* @Foo, i32 0, i32 2), i32 0, i32 %sub1
+  %2 = load i32, i32* %arrayidx, align 4
+  %3 = load i32, i32* %a.addr, align 4
+  %add = add nsw i32 %2, %3
+  %4 = load i32, i32* %i, align 4
+  %sub2 = sub nsw i32 100, %4
+  %sub3 = sub nsw i32 %sub2, 1
+  %arrayidx4 = getelementptr inbounds [100 x i32], [100 x i32]* getelementptr inbounds (%struct.anon, %struct.anon* @Foo, i32 0, i32 0), i32 0, i32 %sub3
+  store i32 %add, i32* %arrayidx4, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body
+  %5 = load i32, i32* %i, align 4
+  %inc = add nsw i32 %5, 1
+  store i32 %inc, i32* %i, align 4
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  %6 = load i32, i32* %a.addr, align 4
+  %arrayidx5 = getelementptr inbounds [100 x i32], [100 x i32]* getelementptr inbounds (%struct.anon, %struct.anon* @Foo, i32 0, i32 0), i32 0, i32 %6
+  %7 = load i32, i32* %arrayidx5, align 4
+  ret i32 %7
+}
+
+; /// Different objects, negative induction, widening slide
+; int noAlias09 (int a) {
+;   int i;
+;   for (i=0; i<SIZE; i++)
+;     Foo.A[SIZE-i-10] = Foo.B[SIZE-i-1] + a;
+;   return Foo.A[a];
+; }
+; CHECK-LABEL: define i32 @noAlias09(
+; CHECK: load <4 x i32>
+; CHECK: ret
+
+define i32 @noAlias09(i32 %a) #0 {
+entry:
+  %a.addr = alloca i32, align 4
+  %i = alloca i32, align 4
+  store i32 %a, i32* %a.addr, align 4
+  store i32 0, i32* %i, align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %0 = load i32, i32* %i, align 4
+  %cmp = icmp slt i32 %0, 100
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %1 = load i32, i32* %i, align 4
+  %sub = sub nsw i32 100, %1
+  %sub1 = sub nsw i32 %sub, 1
+  %arrayidx = getelementptr inbounds [100 x i32], [100 x i32]* getelementptr inbounds (%struct.anon, %struct.anon* @Foo, i32 0, i32 2), i32 0, i32 %sub1
+  %2 = load i32, i32* %arrayidx, align 4
+  %3 = load i32, i32* %a.addr, align 4
+  %add = add nsw i32 %2, %3
+  %4 = load i32, i32* %i, align 4
+  %sub2 = sub nsw i32 100, %4
+  %sub3 = sub nsw i32 %sub2, 10
+  %arrayidx4 = getelementptr inbounds [100 x i32], [100 x i32]* getelementptr inbounds (%struct.anon, %struct.anon* @Foo, i32 0, i32 0), i32 0, i32 %sub3
+  store i32 %add, i32* %arrayidx4, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body
+  %5 = load i32, i32* %i, align 4
+  %inc = add nsw i32 %5, 1
+  store i32 %inc, i32* %i, align 4
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  %6 = load i32, i32* %a.addr, align 4
+  %arrayidx5 = getelementptr inbounds [100 x i32], [100 x i32]* getelementptr inbounds (%struct.anon, %struct.anon* @Foo, i32 0, i32 0), i32 0, i32 %6
+  %7 = load i32, i32* %arrayidx5, align 4
+  ret i32 %7
+}
+
+; /// Pointer access, negative stride, run-time check added
+; int noAlias10 (int a) {
+;   int i;
+;   for (i=0; i<SIZE; i++)
+;     *(PA+SIZE-i-1) = *(PB+SIZE-i-1) + a;
+;   return *(PA+a);
+; }
+; CHECK-LABEL: define i32 @noAlias10(
+; CHECK-NOT: sub {{.*}} <4 x i32>
+; CHECK: ret
+;
+; TODO: This test vectorizes (with a run-time check) on real targets with -O3.
+; Check why it is not vectorized here even when vectorization is forced.
+
+define i32 @noAlias10(i32 %a) #0 {
+entry:
+  %a.addr = alloca i32, align 4
+  %i = alloca i32, align 4
+  store i32 %a, i32* %a.addr, align 4
+  store i32 0, i32* %i, align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %0 = load i32, i32* %i, align 4
+  %cmp = icmp slt i32 %0, 100
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %1 = load i32*, i32** @PB, align 4
+  %add.ptr = getelementptr inbounds i32, i32* %1, i32 100
+  %2 = load i32, i32* %i, align 4
+  %idx.neg = sub i32 0, %2
+  %add.ptr1 = getelementptr inbounds i32, i32* %add.ptr, i32 %idx.neg
+  %add.ptr2 = getelementptr inbounds i32, i32* %add.ptr1, i32 -1
+  %3 = load i32, i32* %add.ptr2, align 4
+  %4 = load i32, i32* %a.addr, align 4
+  %add = add nsw i32 %3, %4
+  %5 = load i32*, i32** @PA, align 4
+  %add.ptr3 = getelementptr inbounds i32, i32* %5, i32 100
+  %6 = load i32, i32* %i, align 4
+  %idx.neg4 = sub i32 0, %6
+  %add.ptr5 = getelementptr inbounds i32, i32* %add.ptr3, i32 %idx.neg4
+  %add.ptr6 = getelementptr inbounds i32, i32* %add.ptr5, i32 -1
+  store i32 %add, i32* %add.ptr6, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body
+  %7 = load i32, i32* %i, align 4
+  %inc = add nsw i32 %7, 1
+  store i32 %inc, i32* %i, align 4
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  %8 = load i32*, i32** @PA, align 4
+  %9 = load i32, i32* %a.addr, align 4
+  %add.ptr7 = getelementptr inbounds i32, i32* %8, i32 %9
+  %10 = load i32, i32* %add.ptr7, align 4
+  ret i32 %10
+}
+
+; /// Different objects, negative induction, multi-array
+; int noAlias11 (int a) {
+;   int i, N=10;
+;   for (i=0; i<SIZE; i++)
+;     Bar.A[N][SIZE-i-1] = Bar.B[N][SIZE-i-1] + a;
+;   return Bar.A[N][a];
+; }
+; CHECK-LABEL: define i32 @noAlias11(
+; CHECK: store <4 x i32>
+; CHECK: ret
+
+define i32 @noAlias11(i32 %a) #0 {
+entry:
+  %a.addr = alloca i32, align 4
+  %i = alloca i32, align 4
+  %N = alloca i32, align 4
+  store i32 %a, i32* %a.addr, align 4
+  store i32 10, i32* %N, align 4
+  store i32 0, i32* %i, align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %0 = load i32, i32* %i, align 4
+  %cmp = icmp slt i32 %0, 100
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %1 = load i32, i32* %i, align 4
+  %sub = sub nsw i32 100, %1
+  %sub1 = sub nsw i32 %sub, 1
+  %2 = load i32, i32* %N, align 4
+  %arrayidx = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* getelementptr inbounds (%struct.anon.0, %struct.anon.0* @Bar, i32 0, i32 2), i32 0, i32 %2
+  %arrayidx2 = getelementptr inbounds [100 x i32], [100 x i32]* %arrayidx, i32 0, i32 %sub1
+  %3 = load i32, i32* %arrayidx2, align 4
+  %4 = load i32, i32* %a.addr, align 4
+  %add = add nsw i32 %3, %4
+  %5 = load i32, i32* %i, align 4
+  %sub3 = sub nsw i32 100, %5
+  %sub4 = sub nsw i32 %sub3, 1
+  %6 = load i32, i32* %N, align 4
+  %arrayidx5 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* getelementptr inbounds (%struct.anon.0, %struct.anon.0* @Bar, i32 0, i32 0), i32 0, i32 %6
+  %arrayidx6 = getelementptr inbounds [100 x i32], [100 x i32]* %arrayidx5, i32 0, i32 %sub4
+  store i32 %add, i32* %arrayidx6, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body
+  %7 = load i32, i32* %i, align 4
+  %inc = add nsw i32 %7, 1
+  store i32 %inc, i32* %i, align 4
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  %8 = load i32, i32* %a.addr, align 4
+  %9 = load i32, i32* %N, align 4
+  %arrayidx7 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* getelementptr inbounds (%struct.anon.0, %struct.anon.0* @Bar, i32 0, i32 0), i32 0, i32 %9
+  %arrayidx8 = getelementptr inbounds [100 x i32], [100 x i32]* %arrayidx7, i32 0, i32 %8
+  %10 = load i32, i32* %arrayidx8, align 4
+  ret i32 %10
+}
+
+; /// Same objects, negative induction, multi-array, different sub-elements
+; int noAlias12 (int a) {
+;   int i, N=10;
+;   for (i=0; i<SIZE; i++)
+;     Bar.A[N][SIZE-i-1] = Bar.A[N+1][SIZE-i-1] + a;
+;   return Bar.A[N][a];
+; }
+; CHECK-LABEL: define i32 @noAlias12(
+; CHECK: store <4 x i32>
+; CHECK: ret
+
+define i32 @noAlias12(i32 %a) #0 {
+entry:
+  %a.addr = alloca i32, align 4
+  %i = alloca i32, align 4
+  %N = alloca i32, align 4
+  store i32 %a, i32* %a.addr, align 4
+  store i32 10, i32* %N, align 4
+  store i32 0, i32* %i, align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %0 = load i32, i32* %i, align 4
+  %cmp = icmp slt i32 %0, 100
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %1 = load i32, i32* %i, align 4
+  %sub = sub nsw i32 100, %1
+  %sub1 = sub nsw i32 %sub, 1
+  %2 = load i32, i32* %N, align 4
+  %add = add nsw i32 %2, 1
+  %arrayidx = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* getelementptr inbounds (%struct.anon.0, %struct.anon.0* @Bar, i32 0, i32 0), i32 0, i32 %add
+  %arrayidx2 = getelementptr inbounds [100 x i32], [100 x i32]* %arrayidx, i32 0, i32 %sub1
+  %3 = load i32, i32* %arrayidx2, align 4
+  %4 = load i32, i32* %a.addr, align 4
+  %add3 = add nsw i32 %3, %4
+  %5 = load i32, i32* %i, align 4
+  %sub4 = sub nsw i32 100, %5
+  %sub5 = sub nsw i32 %sub4, 1
+  %6 = load i32, i32* %N, align 4
+  %arrayidx6 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* getelementptr inbounds (%struct.anon.0, %struct.anon.0* @Bar, i32 0, i32 0), i32 0, i32 %6
+  %arrayidx7 = getelementptr inbounds [100 x i32], [100 x i32]* %arrayidx6, i32 0, i32 %sub5
+  store i32 %add3, i32* %arrayidx7, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body
+  %7 = load i32, i32* %i, align 4
+  %inc = add nsw i32 %7, 1
+  store i32 %inc, i32* %i, align 4
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  %8 = load i32, i32* %a.addr, align 4
+  %9 = load i32, i32* %N, align 4
+  %arrayidx8 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* getelementptr inbounds (%struct.anon.0, %struct.anon.0* @Bar, i32 0, i32 0), i32 0, i32 %9
+  %arrayidx9 = getelementptr inbounds [100 x i32], [100 x i32]* %arrayidx8, i32 0, i32 %8
+  %10 = load i32, i32* %arrayidx9, align 4
+  ret i32 %10
+}
+
+; /// Same objects, positive induction, constant distance, just enough for vector size
+; int noAlias13 (int a) {
+;   int i;
+;   for (i=0; i<SIZE; i++)
+;     Foo.A[i] = Foo.A[i+4] + a;
+;   return Foo.A[a];
+; }
+; CHECK-LABEL: define i32 @noAlias13(
+; CHECK: add nsw <4 x i32>
+; CHECK: ret
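+;
+; The dependence distance here is exactly 4 elements: iteration i reads A[i+4],
+; which is not written until iteration i+4, so a <4 x i32> access window never
+; overlaps itself. The same arithmetic holds for the negative-induction
+; variant noAlias14 below.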
+
+define i32 @noAlias13(i32 %a) #0 {
+entry:
+  %a.addr = alloca i32, align 4
+  %i = alloca i32, align 4
+  store i32 %a, i32* %a.addr, align 4
+  store i32 0, i32* %i, align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %0 = load i32, i32* %i, align 4
+  %cmp = icmp slt i32 %0, 100
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %1 = load i32, i32* %i, align 4
+  %add = add nsw i32 %1, 4
+  %arrayidx = getelementptr inbounds [100 x i32], [100 x i32]* getelementptr inbounds (%struct.anon, %struct.anon* @Foo, i32 0, i32 0), i32 0, i32 %add
+  %2 = load i32, i32* %arrayidx, align 4
+  %3 = load i32, i32* %a.addr, align 4
+  %add1 = add nsw i32 %2, %3
+  %4 = load i32, i32* %i, align 4
+  %arrayidx2 = getelementptr inbounds [100 x i32], [100 x i32]* getelementptr inbounds (%struct.anon, %struct.anon* @Foo, i32 0, i32 0), i32 0, i32 %4
+  store i32 %add1, i32* %arrayidx2, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body
+  %5 = load i32, i32* %i, align 4
+  %inc = add nsw i32 %5, 1
+  store i32 %inc, i32* %i, align 4
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  %6 = load i32, i32* %a.addr, align 4
+  %arrayidx3 = getelementptr inbounds [100 x i32], [100 x i32]* getelementptr inbounds (%struct.anon, %struct.anon* @Foo, i32 0, i32 0), i32 0, i32 %6
+  %7 = load i32, i32* %arrayidx3, align 4
+  ret i32 %7
+}
+
+; /// Same objects, negative induction, constant distance, just enough for vector size
+; int noAlias14 (int a) {
+;   int i;
+;   for (i=0; i<SIZE; i++)
+;     Foo.A[SIZE-i-1] = Foo.A[SIZE-i-5] + a;
+;   return Foo.A[a];
+; }
+; CHECK-LABEL: define i32 @noAlias14(
+; CHECK: load <4 x i32>
+; CHECK: ret
+
+define i32 @noAlias14(i32 %a) #0 {
+entry:
+  %a.addr = alloca i32, align 4
+  %i = alloca i32, align 4
+  store i32 %a, i32* %a.addr, align 4
+  store i32 0, i32* %i, align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %0 = load i32, i32* %i, align 4
+  %cmp = icmp slt i32 %0, 100
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %1 = load i32, i32* %i, align 4
+  %sub = sub nsw i32 100, %1
+  %sub1 = sub nsw i32 %sub, 5
+  %arrayidx = getelementptr inbounds [100 x i32], [100 x i32]* getelementptr inbounds (%struct.anon, %struct.anon* @Foo, i32 0, i32 0), i32 0, i32 %sub1
+  %2 = load i32, i32* %arrayidx, align 4
+  %3 = load i32, i32* %a.addr, align 4
+  %add = add nsw i32 %2, %3
+  %4 = load i32, i32* %i, align 4
+  %sub2 = sub nsw i32 100, %4
+  %sub3 = sub nsw i32 %sub2, 1
+  %arrayidx4 = getelementptr inbounds [100 x i32], [100 x i32]* getelementptr inbounds (%struct.anon, %struct.anon* @Foo, i32 0, i32 0), i32 0, i32 %sub3
+  store i32 %add, i32* %arrayidx4, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body
+  %5 = load i32, i32* %i, align 4
+  %inc = add nsw i32 %5, 1
+  store i32 %inc, i32* %i, align 4
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  %6 = load i32, i32* %a.addr, align 4
+  %arrayidx5 = getelementptr inbounds [100 x i32], [100 x i32]* getelementptr inbounds (%struct.anon, %struct.anon* @Foo, i32 0, i32 0), i32 0, i32 %6
+  %7 = load i32, i32* %arrayidx5, align 4
+  ret i32 %7
+}
+
+
+;; === Now, the tests that we could vectorize with induction changes or run-time checks ===
+
+
+; /// Different objects, swapped induction, alias at the end
+; int mayAlias01 (int a) {
+;   int i;
+;   for (i=0; i<SIZE; i++)
+;     Foo.A[i] = Foo.B[SIZE-i-1] + a;
+;   return Foo.A[a];
+; }
+; CHECK-LABEL: define i32 @mayAlias01(
+; CHECK-NOT: add nsw <4 x i32>
+; CHECK: ret
+
+define i32 @mayAlias01(i32 %a) nounwind {
+entry:
+  %a.addr = alloca i32, align 4
+  %i = alloca i32, align 4
+  store i32 %a, i32* %a.addr, align 4
+  store i32 0, i32* %i, align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %0 = load i32, i32* %i, align 4
+  %cmp = icmp slt i32 %0, 100
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %1 = load i32, i32* %i, align 4
+  %sub = sub nsw i32 100, %1
+  %sub1 = sub nsw i32 %sub, 1
+  %arrayidx = getelementptr inbounds [100 x i32], [100 x i32]* getelementptr inbounds (%struct.anon, %struct.anon* @Foo, i32 0, i32 2), i32 0, i32 %sub1
+  %2 = load i32, i32* %arrayidx, align 4
+  %3 = load i32, i32* %a.addr, align 4
+  %add = add nsw i32 %2, %3
+  %4 = load i32, i32* %i, align 4
+  %arrayidx2 = getelementptr inbounds [100 x i32], [100 x i32]* getelementptr inbounds (%struct.anon, %struct.anon* @Foo, i32 0, i32 0), i32 0, i32 %4
+  store i32 %add, i32* %arrayidx2, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body
+  %5 = load i32, i32* %i, align 4
+  %inc = add nsw i32 %5, 1
+  store i32 %inc, i32* %i, align 4
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  %6 = load i32, i32* %a.addr, align 4
+  %arrayidx3 = getelementptr inbounds [100 x i32], [100 x i32]* getelementptr inbounds (%struct.anon, %struct.anon* @Foo, i32 0, i32 0), i32 0, i32 %6
+  %7 = load i32, i32* %arrayidx3, align 4
+  ret i32 %7
+}
+
+; /// Different objects, swapped induction, alias at the beginning
+; int mayAlias02 (int a) {
+;   int i;
+;   for (i=0; i<SIZE; i++)
+;     Foo.A[SIZE-i-1] = Foo.B[i] + a;
+;   return Foo.A[a];
+; }
+; CHECK-LABEL: define i32 @mayAlias02(
+; CHECK-NOT: add nsw <4 x i32>
+; CHECK: ret
+
+define i32 @mayAlias02(i32 %a) nounwind {
+entry:
+  %a.addr = alloca i32, align 4
+  %i = alloca i32, align 4
+  store i32 %a, i32* %a.addr, align 4
+  store i32 0, i32* %i, align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %0 = load i32, i32* %i, align 4
+  %cmp = icmp slt i32 %0, 100
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %1 = load i32, i32* %i, align 4
+  %arrayidx = getelementptr inbounds [100 x i32], [100 x i32]* getelementptr inbounds (%struct.anon, %struct.anon* @Foo, i32 0, i32 2), i32 0, i32 %1
+  %2 = load i32, i32* %arrayidx, align 4
+  %3 = load i32, i32* %a.addr, align 4
+  %add = add nsw i32 %2, %3
+  %4 = load i32, i32* %i, align 4
+  %sub = sub nsw i32 100, %4
+  %sub1 = sub nsw i32 %sub, 1
+  %arrayidx2 = getelementptr inbounds [100 x i32], [100 x i32]* getelementptr inbounds (%struct.anon, %struct.anon* @Foo, i32 0, i32 0), i32 0, i32 %sub1
+  store i32 %add, i32* %arrayidx2, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body
+  %5 = load i32, i32* %i, align 4
+  %inc = add nsw i32 %5, 1
+  store i32 %inc, i32* %i, align 4
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  %6 = load i32, i32* %a.addr, align 4
+  %arrayidx3 = getelementptr inbounds [100 x i32], [100 x i32]* getelementptr inbounds (%struct.anon, %struct.anon* @Foo, i32 0, i32 0), i32 0, i32 %6
+  %7 = load i32, i32* %arrayidx3, align 4
+  ret i32 %7
+}
+
+; /// Pointer access, run-time check added
+; int mayAlias03 (int a) {
+;   int i;
+;   for (i=0; i<SIZE; i++)
+;     *(PA+i) = *(PB+SIZE-i-1) + a;
+;   return *(PA+a);
+; }
+; CHECK-LABEL: define i32 @mayAlias03(
+; CHECK-NOT: add nsw <4 x i32>
+; CHECK: ret
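+;
+; Vectorizing this loop safely would require a run-time overlap test on the
+; loaded pointer values. A hedged C sketch of the idea (illustrative only, not
+; the exact vector.memcheck IR the pass would emit):
+;
+;   int overlap = PA < PB + SIZE && PB < PA + SIZE;
+;   if (overlap)
+;     /* fall back to the scalar loop */;
+;   else
+;     /* the vector loop is safe */;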
+
+define i32 @mayAlias03(i32 %a) nounwind {
+entry:
+  %a.addr = alloca i32, align 4
+  %i = alloca i32, align 4
+  store i32 %a, i32* %a.addr, align 4
+  store i32 0, i32* %i, align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %0 = load i32, i32* %i, align 4
+  %cmp = icmp slt i32 %0, 100
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %1 = load i32*, i32** @PB, align 4
+  %add.ptr = getelementptr inbounds i32, i32* %1, i32 100
+  %2 = load i32, i32* %i, align 4
+  %idx.neg = sub i32 0, %2
+  %add.ptr1 = getelementptr inbounds i32, i32* %add.ptr, i32 %idx.neg
+  %add.ptr2 = getelementptr inbounds i32, i32* %add.ptr1, i32 -1
+  %3 = load i32, i32* %add.ptr2, align 4
+  %4 = load i32, i32* %a.addr, align 4
+  %add = add nsw i32 %3, %4
+  %5 = load i32*, i32** @PA, align 4
+  %6 = load i32, i32* %i, align 4
+  %add.ptr3 = getelementptr inbounds i32, i32* %5, i32 %6
+  store i32 %add, i32* %add.ptr3, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body
+  %7 = load i32, i32* %i, align 4
+  %inc = add nsw i32 %7, 1
+  store i32 %inc, i32* %i, align 4
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  %8 = load i32*, i32** @PA, align 4
+  %9 = load i32, i32* %a.addr, align 4
+  %add.ptr4 = getelementptr inbounds i32, i32* %8, i32 %9
+  %10 = load i32, i32* %add.ptr4, align 4
+  ret i32 %10
+}
+
+
+;; === Finally, the tests that should only vectorize with care (or if we ignore undefined behaviour altogether) ===
+
+
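+; In the tests below the indices provably leave their array fields: with
+; SIZE == 100, Foo.A[i+10] reaches index 109 (when i == 99), past the end of
+; the 100-element A field, and Foo.B[SIZE-i-10] reaches index -9, before the
+; start of B. Such out-of-bounds accesses are undefined behaviour in the C
+; sources shown, which is why these loops may only be vectorized with care.
+;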
+; int mustAlias01 (int a) {
+;   int i;
+;   for (i=0; i<SIZE; i++)
+;     Foo.A[i+10] = Foo.B[SIZE-i-1] + a;
+;   return Foo.A[a];
+; }
+; CHECK-LABEL: define i32 @mustAlias01(
+; CHECK-NOT: add nsw <4 x i32>
+; CHECK: ret
+
+define i32 @mustAlias01(i32 %a) nounwind {
+entry:
+  %a.addr = alloca i32, align 4
+  %i = alloca i32, align 4
+  store i32 %a, i32* %a.addr, align 4
+  store i32 0, i32* %i, align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %0 = load i32, i32* %i, align 4
+  %cmp = icmp slt i32 %0, 100
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %1 = load i32, i32* %i, align 4
+  %sub = sub nsw i32 100, %1
+  %sub1 = sub nsw i32 %sub, 1
+  %arrayidx = getelementptr inbounds [100 x i32], [100 x i32]* getelementptr inbounds (%struct.anon, %struct.anon* @Foo, i32 0, i32 2), i32 0, i32 %sub1
+  %2 = load i32, i32* %arrayidx, align 4
+  %3 = load i32, i32* %a.addr, align 4
+  %add = add nsw i32 %2, %3
+  %4 = load i32, i32* %i, align 4
+  %add2 = add nsw i32 %4, 10
+  %arrayidx3 = getelementptr inbounds [100 x i32], [100 x i32]* getelementptr inbounds (%struct.anon, %struct.anon* @Foo, i32 0, i32 0), i32 0, i32 %add2
+  store i32 %add, i32* %arrayidx3, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body
+  %5 = load i32, i32* %i, align 4
+  %inc = add nsw i32 %5, 1
+  store i32 %inc, i32* %i, align 4
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  %6 = load i32, i32* %a.addr, align 4
+  %arrayidx4 = getelementptr inbounds [100 x i32], [100 x i32]* getelementptr inbounds (%struct.anon, %struct.anon* @Foo, i32 0, i32 0), i32 0, i32 %6
+  %7 = load i32, i32* %arrayidx4, align 4
+  ret i32 %7
+}
+
+; int mustAlias02 (int a) {
+;   int i;
+;   for (i=0; i<SIZE; i++)
+;     Foo.A[i] = Foo.B[SIZE-i-10] + a;
+;   return Foo.A[a];
+; }
+; CHECK-LABEL: define i32 @mustAlias02(
+; CHECK-NOT: add nsw <4 x i32>
+; CHECK: ret
+
+define i32 @mustAlias02(i32 %a) nounwind {
+entry:
+  %a.addr = alloca i32, align 4
+  %i = alloca i32, align 4
+  store i32 %a, i32* %a.addr, align 4
+  store i32 0, i32* %i, align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %0 = load i32, i32* %i, align 4
+  %cmp = icmp slt i32 %0, 100
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %1 = load i32, i32* %i, align 4
+  %sub = sub nsw i32 100, %1
+  %sub1 = sub nsw i32 %sub, 10
+  %arrayidx = getelementptr inbounds [100 x i32], [100 x i32]* getelementptr inbounds (%struct.anon, %struct.anon* @Foo, i32 0, i32 2), i32 0, i32 %sub1
+  %2 = load i32, i32* %arrayidx, align 4
+  %3 = load i32, i32* %a.addr, align 4
+  %add = add nsw i32 %2, %3
+  %4 = load i32, i32* %i, align 4
+  %arrayidx2 = getelementptr inbounds [100 x i32], [100 x i32]* getelementptr inbounds (%struct.anon, %struct.anon* @Foo, i32 0, i32 0), i32 0, i32 %4
+  store i32 %add, i32* %arrayidx2, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body
+  %5 = load i32, i32* %i, align 4
+  %inc = add nsw i32 %5, 1
+  store i32 %inc, i32* %i, align 4
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  %6 = load i32, i32* %a.addr, align 4
+  %arrayidx3 = getelementptr inbounds [100 x i32], [100 x i32]* getelementptr inbounds (%struct.anon, %struct.anon* @Foo, i32 0, i32 0), i32 0, i32 %6
+  %7 = load i32, i32* %arrayidx3, align 4
+  ret i32 %7
+}
+
+; int mustAlias03 (int a) {
+;   int i;
+;   for (i=0; i<SIZE; i++)
+;     Foo.A[i+10] = Foo.B[SIZE-i-10] + a;
+;   return Foo.A[a];
+; }
+; CHECK-LABEL: define i32 @mustAlias03(
+; CHECK-NOT: add nsw <4 x i32>
+; CHECK: ret
+
+define i32 @mustAlias03(i32 %a) nounwind {
+entry:
+  %a.addr = alloca i32, align 4
+  %i = alloca i32, align 4
+  store i32 %a, i32* %a.addr, align 4
+  store i32 0, i32* %i, align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %0 = load i32, i32* %i, align 4
+  %cmp = icmp slt i32 %0, 100
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %1 = load i32, i32* %i, align 4
+  %sub = sub nsw i32 100, %1
+  %sub1 = sub nsw i32 %sub, 10
+  %arrayidx = getelementptr inbounds [100 x i32], [100 x i32]* getelementptr inbounds (%struct.anon, %struct.anon* @Foo, i32 0, i32 2), i32 0, i32 %sub1
+  %2 = load i32, i32* %arrayidx, align 4
+  %3 = load i32, i32* %a.addr, align 4
+  %add = add nsw i32 %2, %3
+  %4 = load i32, i32* %i, align 4
+  %add2 = add nsw i32 %4, 10
+  %arrayidx3 = getelementptr inbounds [100 x i32], [100 x i32]* getelementptr inbounds (%struct.anon, %struct.anon* @Foo, i32 0, i32 0), i32 0, i32 %add2
+  store i32 %add, i32* %arrayidx3, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body
+  %5 = load i32, i32* %i, align 4
+  %inc = add nsw i32 %5, 1
+  store i32 %inc, i32* %i, align 4
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  %6 = load i32, i32* %a.addr, align 4
+  %arrayidx4 = getelementptr inbounds [100 x i32], [100 x i32]* getelementptr inbounds (%struct.anon, %struct.anon* @Foo, i32 0, i32 0), i32 0, i32 %6
+  %7 = load i32, i32* %arrayidx4, align 4
+  ret i32 %7
+}

Added: llvm/trunk/test/Transforms/LoopVectorize/hints-trans.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/hints-trans.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/hints-trans.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/hints-trans.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,29 @@
+; RUN: opt -S -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -instsimplify -simplifycfg < %s | FileCheck %s
+; Note: -instsimplify -simplifycfg remove the (now dead) original loop, making
+; it easy to test that the llvm.loop.unroll.disable hint is still present.
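+; For reference, metadata like this commonly originates from a source-level
+; pragma; a hedged C sketch of a loop like @foo below (assuming Clang's loop
+; pragma syntax):
+;
+;   void foo(int *b) {
+;   #pragma clang loop unroll(disable)
+;     for (int i = 0; i < 16; ++i)
+;       b[i] = 1;
+;   }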
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+; Function Attrs: norecurse nounwind uwtable
+define void @foo(i32* nocapture %b) #0 {
+entry:
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body
+  ret void
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i32, i32* %b, i64 %indvars.iv
+  store i32 1, i32* %arrayidx, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 16
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body, !llvm.loop !0
+}
+
+; CHECK-LABEL: @foo
+; CHECK: = !{!"llvm.loop.unroll.disable"}
+
+attributes #0 = { norecurse nounwind uwtable }
+
+!0 = distinct !{!0, !1}
+!1 = !{!"llvm.loop.unroll.disable"}

Added: llvm/trunk/test/Transforms/LoopVectorize/hoist-loads.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/hoist-loads.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/hoist-loads.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/hoist-loads.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,70 @@
+; RUN: opt -loop-vectorize -force-vector-width=2 -force-vector-interleave=1 -S < %s | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+ at A = common global [1024 x float] zeroinitializer, align 16
+ at B = common global [1024 x float] zeroinitializer, align 16
+
+; Make sure we can vectorize in the presence of hoistable conditional loads.
+; CHECK-LABEL: @hoist_cond_load(
+; CHECK: load <2 x float>
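+;
+; Roughly the loop being tested, as a C reconstruction (not part of the
+; original source):
+;
+;   for (int i = 0; i < 1024; ++i)
+;     A[i] = (B[i] == 0.0f) ? 0.0f : A[i];
+;
+; A[i] is stored unconditionally, so speculating its conditional load is safe.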
+
+define void @hoist_cond_load() {
+entry:
+  br label %for.body
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %if.end9 ]
+  %arrayidx = getelementptr inbounds [1024 x float], [1024 x float]* @A, i64 0, i64 %indvars.iv
+  %arrayidx2 = getelementptr inbounds [1024 x float], [1024 x float]* @B, i64 0, i64 %indvars.iv
+  %0 = load float, float* %arrayidx2, align 4
+  %cmp3 = fcmp oeq float %0, 0.000000e+00
+  br i1 %cmp3, label %if.end9, label %if.else
+
+if.else:
+  %1 = load float, float* %arrayidx, align 4
+  br label %if.end9
+
+if.end9:
+  %tmp.0 = phi float [ %1, %if.else ], [ 0.000000e+00, %for.body ]
+  store float %tmp.0, float* %arrayidx, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp ne i32 %lftr.wideiv, 1024
+  br i1 %exitcond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
+
+; However, we can't hoist loads whose address we have not seen accessed
+; unconditionally. One wide load (of @B) is fine, but not a second (of @A).
+; CHECK-LABEL: @dont_hoist_cond_load(
+; CHECK: load <2 x float>
+; CHECK-NOT: load <2 x float>
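+;
+; Roughly, in the same hedged C reconstruction style:
+;
+;   for (int i = 0; i < 1024; ++i)
+;     B[i] = (B[i] == 0.0f) ? 0.0f : A[i];
+;
+; Here A[i] is touched only on the conditional path, so its load cannot be
+; hoisted and only the load of B is widened.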
+
+define void @dont_hoist_cond_load() {
+entry:
+  br label %for.body
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %if.end9 ]
+  %arrayidx = getelementptr inbounds [1024 x float], [1024 x float]* @A, i64 0, i64 %indvars.iv
+  %arrayidx2 = getelementptr inbounds [1024 x float], [1024 x float]* @B, i64 0, i64 %indvars.iv
+  %0 = load float, float* %arrayidx2, align 4
+  %cmp3 = fcmp oeq float %0, 0.000000e+00
+  br i1 %cmp3, label %if.end9, label %if.else
+
+if.else:
+  %1 = load float, float* %arrayidx, align 4
+  br label %if.end9
+
+if.end9:
+  %tmp.0 = phi float [ %1, %if.else ], [ 0.000000e+00, %for.body ]
+  store float %tmp.0, float* %arrayidx2, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp ne i32 %lftr.wideiv, 1024
+  br i1 %exitcond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}

Added: llvm/trunk/test/Transforms/LoopVectorize/i8-induction.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/i8-induction.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/i8-induction.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/i8-induction.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,40 @@
+; RUN: opt < %s  -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine -S
+; RUN: opt < %s -debugify -loop-vectorize -S | FileCheck %s --check-prefix=DEBUGLOC
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+ at a = common global i8 0, align 1
+ at b = common global i8 0, align 1
+
+define void @f() nounwind uwtable ssp {
+; Check that the induction phis and adds have debug location.
+;
+; DEBUGLOC-LABEL: vector.body:
+; DEBUGLOC:         %vec.ind = phi {{.*}}, !dbg ![[DbgLoc:[0-9]+]]
+; DEBUGLOC:         %vec.ind.next = add {{.*}}, !dbg ![[DbgLoc]]
+
+scalar.ph:
+  store i8 0, i8* inttoptr (i64 1 to i8*), align 1
+  %0 = load i8, i8* @a, align 1
+  br label %for.body
+
+for.body:
+  %mul16 = phi i8 [ 0, %scalar.ph ], [ %mul, %for.body ]              ; <------- i8 induction var.
+  %c.015 = phi i8 [ undef, %scalar.ph ], [ %conv8, %for.body ]
+  %conv2 = sext i8 %c.015 to i32
+  %tobool = icmp ne i8 %c.015, 0
+  %.sink = select i1 %tobool, i8 %c.015, i8 %0
+  %mul = mul i8 %mul16, %.sink
+  %add = add nsw i32 %conv2, 1
+  %conv8 = trunc i32 %add to i8
+  %sext = shl i32 %add, 24
+  %phitmp14 = icmp slt i32 %sext, 268435456
+  br i1 %phitmp14, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  store i8 %mul, i8* @b, align 1
+  ret void
+}
+
+; Check that the location of the new phi comes from %c.015 = phi i8
+; DEBUGLOC:         ![[DbgLoc]] = !DILocation(line: 5

Added: llvm/trunk/test/Transforms/LoopVectorize/icmp-uniforms.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/icmp-uniforms.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/icmp-uniforms.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/icmp-uniforms.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,35 @@
+; REQUIRES: asserts
+; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -instcombine -debug-only=loop-vectorize -disable-output -print-after=instcombine 2>&1 | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+
+; CHECK-LABEL: more_than_one_use
+;
+; PR30627. Check that a compare instruction with more than one use is not
+; recognized as uniform and is instead vectorized.
+;
+; CHECK-NOT: Found uniform instruction: %cond = icmp slt i64 %i.next, %n
+; CHECK:     vector.body
+; CHECK:       %[[I:.+]] = add nuw nsw <4 x i64> %vec.ind, <i64 1, i64 1, i64 1, i64 1>
+; CHECK:       icmp slt <4 x i64> %[[I]], %broadcast.splat
+; CHECK:       br i1 {{.*}}, label %middle.block, label %vector.body
+;
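+; A rough C equivalent of the loop below (a hypothetical reconstruction from
+; the IR):
+;
+;   long i = 0; int r = 0, cond;
+;   do {
+;     cond = i + 1 < n;           /* two uses: the select and the branch */
+;     r += a[cond ? i + 1 : 0];
+;     ++i;
+;   } while (cond);
+;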
+define i32 @more_than_one_use(i32* %a, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+  %r = phi i32 [ %tmp3, %for.body ], [ 0, %entry ]
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  %tmp0 = select i1 %cond, i64 %i.next, i64 0
+  %tmp1 = getelementptr inbounds i32, i32* %a, i64 %tmp0
+  %tmp2 = load i32, i32* %tmp1, align 8
+  %tmp3 = add i32 %r, %tmp2
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  %tmp4 = phi i32 [ %tmp3, %for.body ]
+  ret i32 %tmp4
+}

Added: llvm/trunk/test/Transforms/LoopVectorize/if-conv-crash.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/if-conv-crash.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/if-conv-crash.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/if-conv-crash.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,60 @@
+; RUN: opt < %s  -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -enable-if-conversion
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+define fastcc void @DD_dump() nounwind uwtable ssp {
+entry:
+  br i1 undef, label %lor.lhs.false, label %if.end25
+
+lor.lhs.false:                                    ; preds = %entry
+  br i1 undef, label %if.end21, label %if.else
+
+if.else:                                          ; preds = %lor.lhs.false
+  br i1 undef, label %num_q.exit, label %while.body.i.preheader
+
+while.body.i.preheader:                           ; preds = %if.else
+  br label %while.body.i
+
+while.body.i:                                     ; preds = %if.end.i, %while.body.i.preheader
+  switch i8 undef, label %if.end.i [
+    i8 39, label %if.then.i
+    i8 92, label %if.then.i
+  ]
+
+if.then.i:                                        ; preds = %while.body.i, %while.body.i
+  br label %if.end.i
+
+if.end.i:                                         ; preds = %if.then.i, %while.body.i
+  br i1 undef, label %num_q.exit, label %while.body.i
+
+num_q.exit:                                       ; preds = %if.end.i, %if.else
+  unreachable
+
+if.end21:                                         ; preds = %lor.lhs.false
+  unreachable
+
+if.end25:                                         ; preds = %entry
+  ret void
+}
+
+; PR15990
+; We can have basic blocks with single entry PHI nodes.
+define void @single_entry_phi(i32* %a, i32 *%b) {
+entry:
+  br label %for.cond1.preheader
+
+for.cond1.preheader:
+  %inc10 = phi i32 [ 0, %entry ], [ %inc, %for.end ]
+  br label %for.end
+
+for.end:
+  %malicious.phi = phi i32 [ 0, %for.cond1.preheader ]
+  %inc = add nsw i32 %inc10, 1
+  %tobool = icmp eq i32 %inc, 0
+  br i1 %tobool, label %for.cond.for.end5, label %for.cond1.preheader
+
+for.cond.for.end5:
+  %and.lcssa = phi i32 [ %malicious.phi, %for.end ]
+  store i32 %and.lcssa, i32* %a, align 4
+  ret void
+}

Added: llvm/trunk/test/Transforms/LoopVectorize/if-conversion-edgemasks.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/if-conversion-edgemasks.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/if-conversion-edgemasks.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/if-conversion-edgemasks.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,245 @@
+; RUN: opt -S -loop-vectorize < %s | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+ at a = global i32* null, align 8
+ at b = global i32* null, align 8
+ at c = global i32* null, align 8
+
+; Don't create exponential IR for the edge masks needed when if-converting
+; this code.
+
+; PR16472
+
+; CHECK-NOT: %6000000 =
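+;
+; Conceptually, the if-converter computes for every block B
+;
+;   blockMask(B) = OR over predecessors P of (blockMask(P) AND edgeCond(P, B))
+;
+; Expanding this recurrence without caching duplicates whole mask trees at
+; every if/else diamond, roughly doubling the IR per nested if; memoizing each
+; blockMask keeps the generated IR linear in the number of blocks.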
+
+define void @_Z3fn4i(i32 %p1) {
+entry:
+  %cmp88 = icmp sgt i32 %p1, 0
+  br i1 %cmp88, label %for.body.lr.ph, label %for.end
+
+for.body.lr.ph:
+  %0 = load i32*, i32** @b, align 8
+  %1 = load i32*, i32** @a, align 8
+  %2 = load i32*, i32** @c, align 8
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %_ZL3fn3ii.exit58 ]
+  %arrayidx = getelementptr inbounds i32, i32* %0, i64 %indvars.iv
+  %3 = load i32, i32* %arrayidx, align 4
+  %4 = trunc i64 %indvars.iv to i32
+  %and.i = and i32 %4, 1
+  %tobool.i.i = icmp eq i32 %and.i, 0
+  br i1 %tobool.i.i, label %if.end.i, label %if.then.i
+
+if.then.i:
+  %and.i.i = lshr i32 %3, 2
+  %and.lobit.i.i = and i32 %and.i.i, 1
+  %5 = xor i32 %and.lobit.i.i, 1
+  %or.i.i = or i32 %5, %3
+  %cmp.i = icmp sgt i32 %or.i.i, 0
+  %conv.i = zext i1 %cmp.i to i32
+  br label %if.end.i
+
+if.end.i:
+  %tobool.i87 = phi i1 [ true, %if.then.i ], [ false, %for.body ]
+  %p1.addr.0.i = phi i32 [ %conv.i, %if.then.i ], [ %3, %for.body ]
+  %6 = trunc i64 %indvars.iv to i32
+  %and1.i = and i32 %6, 7
+  %tobool2.i = icmp eq i32 %and1.i, 0
+  br i1 %tobool2.i, label %if.end7.i, label %if.then3.i
+
+if.then3.i:
+  %p1.addr.0.lobit.i = lshr i32 %p1.addr.0.i, 31
+  %and6.i = and i32 %p1.addr.0.i, 1
+  %or.i = or i32 %p1.addr.0.lobit.i, %and6.i
+  br label %if.end7.i
+
+if.end7.i:
+  %p1.addr.1.i = phi i32 [ %or.i, %if.then3.i ], [ %p1.addr.0.i, %if.end.i ]
+  br i1 %tobool.i87, label %if.then10.i, label %if.end13.i
+
+if.then10.i:
+  %cmp11.i = icmp sgt i32 %p1.addr.1.i, 0
+  %conv12.i = zext i1 %cmp11.i to i32
+  br label %if.end13.i
+
+if.end13.i:
+  %p1.addr.2.i = phi i32 [ %conv12.i, %if.then10.i ], [ %p1.addr.1.i, %if.end7.i ]
+  br i1 %tobool.i.i, label %_Z3fn2iii.exit, label %if.then16.i
+
+if.then16.i:
+  %and17.i = lshr i32 %p1.addr.2.i, 3
+  %and17.lobit.i = and i32 %and17.i, 1
+  br label %_Z3fn2iii.exit
+
+_Z3fn2iii.exit:
+  %p1.addr.3.i = phi i32 [ %and17.lobit.i, %if.then16.i ], [ %p1.addr.2.i, %if.end13.i ]
+  %7 = trunc i64 %indvars.iv to i32
+  %shr.i = ashr i32 %7, 1
+  %and.i18.i = and i32 %shr.i, 1
+  %tobool.i19.i = icmp ne i32 %and.i18.i, 0
+  br i1 %tobool.i19.i, label %if.then.i20.i, label %if.end.i.i
+
+if.then.i20.i:
+  %cmp.i.i = icmp sgt i32 %p1.addr.3.i, 0
+  %conv.i.i = zext i1 %cmp.i.i to i32
+  br label %if.end.i.i
+
+if.end.i.i:
+  %p1.addr.0.i21.i = phi i32 [ %conv.i.i, %if.then.i20.i ], [ %p1.addr.3.i, %_Z3fn2iii.exit ]
+  %and1.i.i = and i32 %shr.i, 7
+  %tobool2.i.i = icmp eq i32 %and1.i.i, 0
+  br i1 %tobool2.i.i, label %if.end7.i.i, label %if.then3.i.i
+
+if.then3.i.i:
+  %p1.addr.0.lobit.i.i = lshr i32 %p1.addr.0.i21.i, 31
+  %and6.i.i = and i32 %p1.addr.0.i21.i, 1
+  %or.i22.i = or i32 %p1.addr.0.lobit.i.i, %and6.i.i
+  br label %if.end7.i.i
+
+if.end7.i.i:
+  %p1.addr.1.i.i = phi i32 [ %or.i22.i, %if.then3.i.i ], [ %p1.addr.0.i21.i, %if.end.i.i ]
+  br i1 %tobool.i19.i, label %if.then10.i.i, label %if.end13.i.i
+
+if.then10.i.i:
+  %cmp11.i.i = icmp sgt i32 %p1.addr.1.i.i, 0
+  %conv12.i.i = zext i1 %cmp11.i.i to i32
+  br label %if.end13.i.i
+
+if.end13.i.i:
+  %p1.addr.2.i.i = phi i32 [ %conv12.i.i, %if.then10.i.i ], [ %p1.addr.1.i.i, %if.end7.i.i ]
+  %and14.i.i = and i32 %shr.i, 5
+  %tobool15.i.i = icmp eq i32 %and14.i.i, 0
+  br i1 %tobool15.i.i, label %_Z3fn2iii.exit.i, label %if.then16.i.i
+
+if.then16.i.i:
+  %and17.i.i = lshr i32 %p1.addr.2.i.i, 3
+  %and17.lobit.i.i = and i32 %and17.i.i, 1
+  br label %_Z3fn2iii.exit.i
+
+_Z3fn2iii.exit.i:
+  %p1.addr.3.i.i = phi i32 [ %and17.lobit.i.i, %if.then16.i.i ], [ %p1.addr.2.i.i, %if.end13.i.i ]
+  %8 = trunc i64 %indvars.iv to i32
+  %tobool.i11.i = icmp eq i32 %8, 0
+  br i1 %tobool.i11.i, label %_ZL3fn3ii.exit, label %if.then.i15.i
+
+if.then.i15.i:
+  %and.i12.i = lshr i32 %p1.addr.3.i.i, 2
+  %and.lobit.i13.i = and i32 %and.i12.i, 1
+  %9 = xor i32 %and.lobit.i13.i, 1
+  %or.i14.i = or i32 %9, %p1.addr.3.i.i
+  br label %_ZL3fn3ii.exit
+
+_ZL3fn3ii.exit:
+  %p1.addr.0.i16.i = phi i32 [ %or.i14.i, %if.then.i15.i ], [ %p1.addr.3.i.i, %_Z3fn2iii.exit.i ]
+  %arrayidx2 = getelementptr inbounds i32, i32* %1, i64 %indvars.iv
+  store i32 %p1.addr.0.i16.i, i32* %arrayidx2, align 4
+  %arrayidx4 = getelementptr inbounds i32, i32* %0, i64 %indvars.iv
+  %10 = load i32, i32* %arrayidx4, align 4
+  br i1 %tobool.i.i, label %_Z3fn1ii.exit.i26, label %if.then.i.i21
+
+if.then.i.i21:
+  %and.i.i18 = lshr i32 %10, 2
+  %and.lobit.i.i19 = and i32 %and.i.i18, 1
+  %11 = xor i32 %and.lobit.i.i19, 1
+  %or.i.i20 = or i32 %11, %10
+  br label %_Z3fn1ii.exit.i26
+
+_Z3fn1ii.exit.i26:
+  %p1.addr.0.i.i22 = phi i32 [ %or.i.i20, %if.then.i.i21 ], [ %10, %_ZL3fn3ii.exit ]
+  br i1 %tobool.i87, label %if.then.i63, label %if.end.i67
+
+if.then.i63:
+  %cmp.i61 = icmp sgt i32 %p1.addr.0.i.i22, 0
+  %conv.i62 = zext i1 %cmp.i61 to i32
+  br label %if.end.i67
+
+if.end.i67:
+  %p1.addr.0.i64 = phi i32 [ %conv.i62, %if.then.i63 ], [ %p1.addr.0.i.i22, %_Z3fn1ii.exit.i26 ]
+  br i1 %tobool2.i, label %if.end7.i73, label %if.then3.i71
+
+if.then3.i71:
+  %p1.addr.0.lobit.i68 = lshr i32 %p1.addr.0.i64, 31
+  %and6.i69 = and i32 %p1.addr.0.i64, 1
+  %or.i70 = or i32 %p1.addr.0.lobit.i68, %and6.i69
+  br label %if.end7.i73
+
+if.end7.i73:
+  %p1.addr.1.i72 = phi i32 [ %or.i70, %if.then3.i71 ], [ %p1.addr.0.i64, %if.end.i67 ]
+  br i1 %tobool.i87, label %if.then10.i76, label %if.end13.i80
+
+if.then10.i76:
+  %cmp11.i74 = icmp sgt i32 %p1.addr.1.i72, 0
+  %conv12.i75 = zext i1 %cmp11.i74 to i32
+  br label %if.end13.i80
+
+if.end13.i80:
+  %p1.addr.2.i77 = phi i32 [ %conv12.i75, %if.then10.i76 ], [ %p1.addr.1.i72, %if.end7.i73 ]
+  br i1 %tobool.i.i, label %_Z3fn2iii.exit85, label %if.then16.i83
+
+if.then16.i83:
+  %and17.i81 = lshr i32 %p1.addr.2.i77, 3
+  %and17.lobit.i82 = and i32 %and17.i81, 1
+  br label %_Z3fn2iii.exit85
+
+_Z3fn2iii.exit85:
+  %p1.addr.3.i84 = phi i32 [ %and17.lobit.i82, %if.then16.i83 ], [ %p1.addr.2.i77, %if.end13.i80 ]
+  br i1 %tobool.i19.i, label %if.then.i20.i29, label %if.end.i.i33
+
+if.then.i20.i29:
+  %cmp.i.i27 = icmp sgt i32 %p1.addr.3.i84, 0
+  %conv.i.i28 = zext i1 %cmp.i.i27 to i32
+  br label %if.end.i.i33
+
+if.end.i.i33:
+  %p1.addr.0.i21.i30 = phi i32 [ %conv.i.i28, %if.then.i20.i29 ], [ %p1.addr.3.i84, %_Z3fn2iii.exit85 ]
+  br i1 %tobool2.i.i, label %if.end7.i.i39, label %if.then3.i.i37
+
+if.then3.i.i37:
+  %p1.addr.0.lobit.i.i34 = lshr i32 %p1.addr.0.i21.i30, 31
+  %and6.i.i35 = and i32 %p1.addr.0.i21.i30, 1
+  %or.i22.i36 = or i32 %p1.addr.0.lobit.i.i34, %and6.i.i35
+  br label %if.end7.i.i39
+
+if.end7.i.i39:
+  %p1.addr.1.i.i38 = phi i32 [ %or.i22.i36, %if.then3.i.i37 ], [ %p1.addr.0.i21.i30, %if.end.i.i33 ]
+  br i1 %tobool.i19.i, label %if.then10.i.i42, label %if.end13.i.i46
+
+if.then10.i.i42:
+  %cmp11.i.i40 = icmp sgt i32 %p1.addr.1.i.i38, 0
+  %conv12.i.i41 = zext i1 %cmp11.i.i40 to i32
+  br label %if.end13.i.i46
+
+if.end13.i.i46:
+  %p1.addr.2.i.i43 = phi i32 [ %conv12.i.i41, %if.then10.i.i42 ], [ %p1.addr.1.i.i38, %if.end7.i.i39 ]
+  br i1 %tobool15.i.i, label %_Z3fn2iii.exit.i52, label %if.then16.i.i49
+
+if.then16.i.i49:
+  %and17.i.i47 = lshr i32 %p1.addr.2.i.i43, 3
+  %and17.lobit.i.i48 = and i32 %and17.i.i47, 1
+  br label %_Z3fn2iii.exit.i52
+
+_Z3fn2iii.exit.i52:
+  %p1.addr.3.i.i50 = phi i32 [ %and17.lobit.i.i48, %if.then16.i.i49 ], [ %p1.addr.2.i.i43, %if.end13.i.i46 ]
+  br i1 %tobool.i11.i, label %_ZL3fn3ii.exit58, label %if.then.i15.i56
+
+if.then.i15.i56:
+  %and.i12.i53 = lshr i32 %p1.addr.3.i.i50, 2
+  %and.lobit.i13.i54 = and i32 %and.i12.i53, 1
+  %12 = xor i32 %and.lobit.i13.i54, 1
+  %or.i14.i55 = or i32 %12, %p1.addr.3.i.i50
+  br label %_ZL3fn3ii.exit58
+
+_ZL3fn3ii.exit58:
+  %p1.addr.0.i16.i57 = phi i32 [ %or.i14.i55, %if.then.i15.i56 ], [ %p1.addr.3.i.i50, %_Z3fn2iii.exit.i52 ]
+  %arrayidx7 = getelementptr inbounds i32, i32* %2, i64 %indvars.iv
+  store i32 %p1.addr.0.i16.i57, i32* %arrayidx7, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp ne i32 %lftr.wideiv, %p1
+  br i1 %exitcond, label %for.body, label %for.cond.for.end_crit_edge
+
+for.cond.for.end_crit_edge:
+  br label %for.end
+
+for.end:
+  ret void
+}

Added: llvm/trunk/test/Transforms/LoopVectorize/if-conversion-nest.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/if-conversion-nest.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/if-conversion-nest.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/if-conversion-nest.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,118 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s  -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -enable-if-conversion -dce -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+define i32 @foo(i32* nocapture %A, i32* nocapture %B, i32 %n) {
+; CHECK-LABEL: @foo(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP26:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; CHECK-NEXT:    br i1 [[CMP26]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]]
+; CHECK:       for.body.preheader:
+; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[N]], -1
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
+; CHECK-NEXT:    [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
+; CHECK:       vector.memcheck:
+; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[N]], -1
+; CHECK-NEXT:    [[TMP4:%.*]] = zext i32 [[TMP3]] to i64
+; CHECK-NEXT:    [[TMP5:%.*]] = add nuw nsw i64 [[TMP4]], 1
+; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 [[TMP5]]
+; CHECK-NEXT:    [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[B:%.*]], i64 [[TMP5]]
+; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ugt i32* [[SCEVGEP4]], [[A]]
+; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ugt i32* [[SCEVGEP]], [[B]]
+; CHECK-NEXT:    [[MEMCHECK_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; CHECK-NEXT:    br i1 [[MEMCHECK_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[TMP2]], 8589934588
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i32* [[TMP7]] to <4 x i32>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP8]], align 4, !alias.scope !0, !noalias !3
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>*
+; CHECK-NEXT:    [[WIDE_LOAD6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP10]], align 4, !alias.scope !3
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], [[WIDE_LOAD6]]
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], <i32 19, i32 19, i32 19, i32 19>
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD6]], <i32 4, i32 4, i32 4, i32 4>
+; CHECK-NEXT:    [[TMP14:%.*]] = select <4 x i1> [[TMP13]], <4 x i32> <i32 4, i32 4, i32 4, i32 4>, <4 x i32> <i32 5, i32 5, i32 5, i32 5>
+; CHECK-NEXT:    [[TMP15:%.*]] = and <4 x i1> [[TMP12]], [[TMP11]]
+; CHECK-NEXT:    [[TMP16:%.*]] = xor <4 x i1> [[TMP12]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT:    [[TMP17:%.*]] = and <4 x i1> [[TMP11]], [[TMP16]]
+; CHECK-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[TMP15]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>, <4 x i32> <i32 9, i32 9, i32 9, i32 9>
+; CHECK-NEXT:    [[PREDPHI7:%.*]] = select <4 x i1> [[TMP17]], <4 x i32> [[TMP14]], <4 x i32> [[PREDPHI]]
+; CHECK-NEXT:    [[TMP18:%.*]] = bitcast i32* [[TMP7]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[PREDPHI7]], <4 x i32>* [[TMP18]], align 4, !alias.scope !0, !noalias !3
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !5
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ], [ 0, [[VECTOR_MEMCHECK]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[IF_END14:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP20:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP21:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4
+; CHECK-NEXT:    [[CMP3:%.*]] = icmp sgt i32 [[TMP20]], [[TMP21]]
+; CHECK-NEXT:    br i1 [[CMP3]], label [[IF_THEN:%.*]], label [[IF_END14]]
+; CHECK:       if.then:
+; CHECK-NEXT:    [[CMP6:%.*]] = icmp sgt i32 [[TMP20]], 19
+; CHECK-NEXT:    br i1 [[CMP6]], label [[IF_END14]], label [[IF_ELSE:%.*]]
+; CHECK:       if.else:
+; CHECK-NEXT:    [[CMP10:%.*]] = icmp slt i32 [[TMP21]], 4
+; CHECK-NEXT:    [[DOT:%.*]] = select i1 [[CMP10]], i32 4, i32 5
+; CHECK-NEXT:    br label [[IF_END14]]
+; CHECK:       if.end14:
+; CHECK-NEXT:    [[X_0:%.*]] = phi i32 [ 9, [[FOR_BODY]] ], [ 3, [[IF_THEN]] ], [ [[DOT]], [[IF_ELSE]] ]
+; CHECK-NEXT:    store i32 [[X_0]], i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop !7
+; CHECK:       for.end.loopexit:
+; CHECK-NEXT:    br label [[FOR_END]]
+; CHECK:       for.end:
+; CHECK-NEXT:    ret i32 undef
+;
+entry:
+  %cmp26 = icmp sgt i32 %n, 0
+  br i1 %cmp26, label %for.body, label %for.end
+
+for.body:
+  %indvars.iv = phi i64 [ %indvars.iv.next, %if.end14 ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
+  %1 = load i32, i32* %arrayidx2, align 4
+  %cmp3 = icmp sgt i32 %0, %1
+  br i1 %cmp3, label %if.then, label %if.end14
+
+if.then:
+  %cmp6 = icmp sgt i32 %0, 19
+  br i1 %cmp6, label %if.end14, label %if.else
+
+if.else:
+  %cmp10 = icmp slt i32 %1, 4
+  %. = select i1 %cmp10, i32 4, i32 5
+  br label %if.end14
+
+if.end14:
+  %x.0 = phi i32 [ 9, %for.body ], [ 3, %if.then ], [ %., %if.else ]  ; <------------- A PHI with 3 entries that we can still vectorize.
+  store i32 %x.0, i32* %arrayidx, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret i32 undef
+}
+

Added: llvm/trunk/test/Transforms/LoopVectorize/if-conversion-reduction.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/if-conversion-reduction.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/if-conversion-reduction.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/if-conversion-reduction.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,37 @@
+; RUN: opt < %s  -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -enable-if-conversion -dce -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+;CHECK-LABEL: @reduction_func(
+;CHECK-NOT: load <4 x i32>
+;CHECK: ret i32
+define i32 @reduction_func(i32* nocapture %A, i32 %n) nounwind uwtable readonly ssp {
+entry:
+  %cmp10 = icmp sgt i32 %n, 0
+  br i1 %cmp10, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.inc
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %entry ]
+  %sum.011 = phi i32 [ %sum.1, %for.inc ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %cmp1 = icmp sgt i32 %0, 30
+  br i1 %cmp1, label %if.then, label %for.inc
+
+if.then:                                          ; preds = %for.body
+  %add = add i32 %sum.011, 2
+  %add4 = add i32 %add, %0
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body, %if.then
+  %sum.1 = phi i32 [ %add4, %if.then ], [ %sum.011, %for.body ]
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.inc, %entry
+  %sum.0.lcssa = phi i32 [ 0, %entry ], [ 4, %for.inc ]
+  ret i32 %sum.0.lcssa
+}
+

Added: llvm/trunk/test/Transforms/LoopVectorize/if-conversion.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/if-conversion.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/if-conversion.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/if-conversion.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,197 @@
+; RUN: opt < %s  -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -enable-if-conversion -dce -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+; This is the loop in this example:
+;
+;int function0(int *a, int *b, int start, int end) {
+;
+;  for (int i=start; i<end; ++i) {
+;    unsigned k = a[i];
+;
+;    if (a[i] > b[i])   <------ notice the IF inside the loop.
+;      k = k * 5 + 3;
+;
+;    a[i] = k;  <---- K is a phi node that becomes vector-select.
+;  }
+;}
+
+;CHECK-LABEL: @function0(
+;CHECK: load <4 x i32>
+;CHECK: icmp sgt <4 x i32>
+;CHECK: mul <4 x i32>
+;CHECK: add <4 x i32>
+;CHECK: select <4 x i1>
+;CHECK: ret i32
+define i32 @function0(i32* nocapture %a, i32* nocapture %b, i32 %start, i32 %end) nounwind uwtable ssp {
+entry:
+  %cmp16 = icmp slt i32 %start, %end
+  br i1 %cmp16, label %for.body.lr.ph, label %for.end
+
+for.body.lr.ph:
+  %0 = sext i32 %start to i64
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ %0, %for.body.lr.ph ], [ %indvars.iv.next, %if.end ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+  %1 = load i32, i32* %arrayidx, align 4
+  %arrayidx4 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv
+  %2 = load i32, i32* %arrayidx4, align 4
+  %cmp5 = icmp sgt i32 %1, %2
+  br i1 %cmp5, label %if.then, label %if.end
+
+if.then:
+  %mul = mul i32 %1, 5
+  %add = add i32 %mul, 3
+  br label %if.end
+
+if.end:
+  %k.0 = phi i32 [ %add, %if.then ], [ %1, %for.body ]
+  store i32 %k.0, i32* %arrayidx, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %3 = trunc i64 %indvars.iv.next to i32
+  %cmp = icmp slt i32 %3, %end
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:
+  ret i32 undef
+}
+
+
+
+; int reduction_func(int *A, int n) {
+;   unsigned sum = 0;
+;   for (int i = 0; i < n; ++i)
+;     if (A[i] > 30)
+;       sum += A[i] + 2;
+;
+;   return sum;
+; }
+
+;CHECK-LABEL: @reduction_func(
+;CHECK: load <4 x i32>
+;CHECK: icmp slt <4 x i32>
+;CHECK: add <4 x i32>
+;CHECK: select <4 x i1>
+;CHECK: ret i32
+define i32 @reduction_func(i32* nocapture %A, i32 %n) nounwind uwtable readonly ssp {
+entry:
+  %cmp10 = icmp sgt i32 %n, 0
+  br i1 %cmp10, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.inc
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %entry ]
+  %sum.011 = phi i32 [ %sum.1, %for.inc ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %cmp1 = icmp sgt i32 %0, 30
+  br i1 %cmp1, label %if.then, label %for.inc
+
+if.then:                                          ; preds = %for.body
+  %add = add i32 %sum.011, 2
+  %add4 = add i32 %add, %0
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body, %if.then
+  %sum.1 = phi i32 [ %add4, %if.then ], [ %sum.011, %for.body ]
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.inc, %entry
+  %sum.0.lcssa = phi i32 [ 0, %entry ], [ %sum.1, %for.inc ]
+  ret i32 %sum.0.lcssa
+}
+
+ at a = common global [1 x i32*] zeroinitializer, align 8
+ at c = common global i32* null, align 8
+
+; We used to if-convert this loop. That is not safe here because of a trapping
+; constant expression: when the compare is false, the zext operand of the sdiv
+; is 0 and speculating the expression would divide by zero.
+; PR16729
+
+; CHECK-LABEL: trapping_constant_expression
+; CHECK-NOT: or <4 x i32>
+
+define i32 @trapping_constant_expression() {
+entry:
+  br label %for.body
+
+for.body:
+  %inc3 = phi i32 [ 0, %entry ], [ %inc, %cond.end ]
+  %or2 = phi i32 [ 0, %entry ], [ %or, %cond.end ]
+  br i1 icmp eq (i32** getelementptr inbounds ([1 x i32*], [1 x i32*]* @a, i64 0, i64 0), i32** @c), label %cond.false, label %cond.end
+
+cond.false:
+  br label %cond.end
+
+cond.end:
+  %cond = phi i32 [ sdiv (i32 1, i32 zext (i1 icmp eq (i32** getelementptr inbounds ([1 x i32*], [1 x i32*]* @a, i64 0, i64 0), i32** @c) to i32)), %cond.false ], [ 0, %for.body ]
+  %or = or i32 %or2, %cond
+  %inc = add nsw i32 %inc3, 1
+  %cmp = icmp slt i32 %inc, 128
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:
+  ret i32 %or
+}
+
+; Neither should we if-convert if there is an instruction operand that is a
+; trapping constant expression.
+; PR16729
+
+; CHECK-LABEL: trapping_constant_expression2
+; CHECK-NOT: or <4 x i32>
+
+define i32 @trapping_constant_expression2() {
+entry:
+  br label %for.body
+
+for.body:
+  %inc3 = phi i32 [ 0, %entry ], [ %inc, %cond.end ]
+  %or2 = phi i32 [ 0, %entry ], [ %or, %cond.end ]
+  br i1 icmp eq (i32** getelementptr inbounds ([1 x i32*], [1 x i32*]* @a, i64 0, i64 0), i32** @c), label %cond.false, label %cond.end
+
+cond.false:
+  %cond.1 = or i32 %inc3, sdiv (i32 1, i32 zext (i1 icmp eq (i32** getelementptr inbounds ([1 x i32*], [1 x i32*]* @a, i64 0, i64 1), i32** @c) to i32))
+  br label %cond.end
+
+cond.end:
+  %cond = phi i32 [ %cond.1, %cond.false ], [ %inc3, %for.body ]
+  %or = or i32 %or2, %cond
+  %inc = add nsw i32 %inc3, 1
+  %cmp = icmp slt i32 %inc, 128
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:
+  ret i32 %or
+}
+
+; Handle PHI with single incoming value having a full mask.
+; PR34523
+
+; CHECK-LABEL: PR34523
+; CHECK: vector.body
+
+define void @PR34523() {
+bb1:
+  br label %bb2
+
+bb2:                                             ; preds = %bb4, %bb1
+  %i = phi i16 [ undef, %bb1 ], [ %_tmp2, %bb4 ]
+  br label %bb3
+
+bb3:                                             ; preds = %bb2
+  %_tmp1 = phi [1 x [1 x i32]]* [ undef, %bb2 ]
+  br label %bb4
+
+bb4:                                             ; preds = %bb3
+  %_tmp2 = add i16 %i, 1
+  %_tmp3 = icmp slt i16 %_tmp2, 2
+  br i1 %_tmp3, label %bb2, label %bb5
+
+bb5:                                             ; preds = %bb4
+  unreachable
+}

Added: llvm/trunk/test/Transforms/LoopVectorize/if-pred-non-void.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/if-pred-non-void.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/if-pred-non-void.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/if-pred-non-void.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,277 @@
+; RUN: opt -S -force-vector-width=2 -force-vector-interleave=1 -loop-vectorize -verify-loop-info -simplifycfg < %s | FileCheck %s
+; RUN: opt -S -force-vector-width=1 -force-vector-interleave=2 -loop-vectorize -verify-loop-info < %s | FileCheck %s --check-prefix=UNROLL-NO-VF
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+; Test predication of non-void instructions, specifically (i) that these
+; instructions permit vectorization and (ii) that an insertelement and a Phi
+; node are created. We check the full 2-element sequence for the first
+; instruction; for the rest we just make sure they get predicated based on the
+; code generated for the first element.
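+;
+; In rough C-like pseudocode (hypothetical names), lane 0 of the predicated
+; sdiv becomes:
+;
+;   if (mask0) {                    /* extractelement of the <2 x i1> mask  */
+;     d0 = psd0 / lsd0;             /* the scalar sdiv runs only when safe  */
+;     v = insert(v, d0, 0);         /* insertelement                        */
+;   }
+;   /* a phi then merges v with the pass-through value on the skipped path */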
+define void @test(i32* nocapture %asd, i32* nocapture %aud,
+                  i32* nocapture %asr, i32* nocapture %aur) {
+entry:
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %if.end
+  ret void
+
+; CHECK-LABEL: test
+; CHECK: vector.body:
+; CHECK:   %[[SDEE:[a-zA-Z0-9]+]] = extractelement <2 x i1> %{{.*}}, i32 0
+; CHECK:   br i1 %[[SDEE]], label %[[CSD:[a-zA-Z0-9.]+]], label %[[ESD:[a-zA-Z0-9.]+]]
+; CHECK: [[CSD]]:
+; CHECK:   %[[SDA0:[a-zA-Z0-9]+]] = extractelement <2 x i32> %{{.*}}, i32 0
+; CHECK:   %[[SDA1:[a-zA-Z0-9]+]] = extractelement <2 x i32> %{{.*}}, i32 0
+; CHECK:   %[[SD0:[a-zA-Z0-9]+]] = sdiv i32 %[[SDA0]], %[[SDA1]]
+; CHECK:   %[[SD1:[a-zA-Z0-9]+]] = insertelement <2 x i32> undef, i32 %[[SD0]], i32 0
+; CHECK:   br label %[[ESD]]
+; CHECK: [[ESD]]:
+; CHECK:   %[[SDR:[a-zA-Z0-9]+]] = phi <2 x i32> [ undef, %vector.body ], [ %[[SD1]], %[[CSD]] ]
+; CHECK:   %[[SDEEH:[a-zA-Z0-9]+]] = extractelement <2 x i1> %{{.*}}, i32 1
+; CHECK:   br i1 %[[SDEEH]], label %[[CSDH:[a-zA-Z0-9.]+]], label %[[ESDH:[a-zA-Z0-9.]+]]
+; CHECK: [[CSDH]]:
+; CHECK:   %[[SDA0H:[a-zA-Z0-9]+]] = extractelement <2 x i32> %{{.*}}, i32 1
+; CHECK:   %[[SDA1H:[a-zA-Z0-9]+]] = extractelement <2 x i32> %{{.*}}, i32 1
+; CHECK:   %[[SD0H:[a-zA-Z0-9]+]] = sdiv i32 %[[SDA0H]], %[[SDA1H]]
+; CHECK:   %[[SD1H:[a-zA-Z0-9]+]] = insertelement <2 x i32> %[[SDR]], i32 %[[SD0H]], i32 1
+; CHECK:   br label %[[ESDH]]
+; CHECK: [[ESDH]]:
+; CHECK:   %{{.*}} = phi <2 x i32> [ %[[SDR]], %[[ESD]] ], [ %[[SD1H]], %[[CSDH]] ]
+
+; CHECK:   %[[UDEE:[a-zA-Z0-9]+]] = extractelement <2 x i1> %{{.*}}, i32 0
+; CHECK:   br i1 %[[UDEE]], label %[[CUD:[a-zA-Z0-9.]+]], label %[[EUD:[a-zA-Z0-9.]+]]
+; CHECK: [[CUD]]:
+; CHECK:   %[[UDA0:[a-zA-Z0-9]+]] = extractelement <2 x i32> %{{.*}}, i32 0
+; CHECK:   %[[UDA1:[a-zA-Z0-9]+]] = extractelement <2 x i32> %{{.*}}, i32 0
+; CHECK:   %[[UD0:[a-zA-Z0-9]+]] = udiv i32 %[[UDA0]], %[[UDA1]]
+; CHECK:   %[[UD1:[a-zA-Z0-9]+]] = insertelement <2 x i32> undef, i32 %[[UD0]], i32 0
+; CHECK:   br label %[[EUD]]
+; CHECK: [[EUD]]:
+; CHECK:   %{{.*}} = phi <2 x i32> [ undef, %{{.*}} ], [ %[[UD1]], %[[CUD]] ]
+
+; CHECK:   %[[SREE:[a-zA-Z0-9]+]] = extractelement <2 x i1> %{{.*}}, i32 0
+; CHECK:   br i1 %[[SREE]], label %[[CSR:[a-zA-Z0-9.]+]], label %[[ESR:[a-zA-Z0-9.]+]]
+; CHECK: [[CSR]]:
+; CHECK:   %[[SRA0:[a-zA-Z0-9]+]] = extractelement <2 x i32> %{{.*}}, i32 0
+; CHECK:   %[[SRA1:[a-zA-Z0-9]+]] = extractelement <2 x i32> %{{.*}}, i32 0
+; CHECK:   %[[SR0:[a-zA-Z0-9]+]] = srem i32 %[[SRA0]], %[[SRA1]]
+; CHECK:   %[[SR1:[a-zA-Z0-9]+]] = insertelement <2 x i32> undef, i32 %[[SR0]], i32 0
+; CHECK:   br label %[[ESR]]
+; CHECK: [[ESR]]:
+; CHECK:   %{{.*}} = phi <2 x i32> [ undef, %{{.*}} ], [ %[[SR1]], %[[CSR]] ]
+
+; CHECK:   %[[UREE:[a-zA-Z0-9]+]] = extractelement <2 x i1> %{{.*}}, i32 0
+; CHECK:   br i1 %[[UREE]], label %[[CUR:[a-zA-Z0-9.]+]], label %[[EUR:[a-zA-Z0-9.]+]]
+; CHECK: [[CUR]]:
+; CHECK:   %[[URA0:[a-zA-Z0-9]+]] = extractelement <2 x i32> %{{.*}}, i32 0
+; CHECK:   %[[URA1:[a-zA-Z0-9]+]] = extractelement <2 x i32> %{{.*}}, i32 0
+; CHECK:   %[[UR0:[a-zA-Z0-9]+]] = urem i32 %[[URA0]], %[[URA1]]
+; CHECK:   %[[UR1:[a-zA-Z0-9]+]] = insertelement <2 x i32> undef, i32 %[[UR0]], i32 0
+; CHECK:   br label %[[EUR]]
+; CHECK: [[EUR]]:
+; CHECK:   %{{.*}} = phi <2 x i32> [ undef, %{{.*}} ], [ %[[UR1]], %[[CUR]] ]
+
+for.body:                                         ; preds = %if.end, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %if.end ]
+  %isd = getelementptr inbounds i32, i32* %asd, i64 %indvars.iv
+  %iud = getelementptr inbounds i32, i32* %aud, i64 %indvars.iv
+  %isr = getelementptr inbounds i32, i32* %asr, i64 %indvars.iv
+  %iur = getelementptr inbounds i32, i32* %aur, i64 %indvars.iv
+  %lsd = load i32, i32* %isd, align 4
+  %lud = load i32, i32* %iud, align 4
+  %lsr = load i32, i32* %isr, align 4
+  %lur = load i32, i32* %iur, align 4
+  %psd = add nsw i32 %lsd, 23
+  %pud = add nsw i32 %lud, 24
+  %psr = add nsw i32 %lsr, 25
+  %pur = add nsw i32 %lur, 26
+  %cmp1 = icmp slt i32 %lsd, 100
+  br i1 %cmp1, label %if.then, label %if.end
+
+if.then:                                          ; preds = %for.body
+  %rsd = sdiv i32 %psd, %lsd
+  %rud = udiv i32 %pud, %lud
+  %rsr = srem i32 %psr, %lsr
+  %rur = urem i32 %pur, %lur
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %for.body
+  %ysd.0 = phi i32 [ %rsd, %if.then ], [ %psd, %for.body ]
+  %yud.0 = phi i32 [ %rud, %if.then ], [ %pud, %for.body ]
+  %ysr.0 = phi i32 [ %rsr, %if.then ], [ %psr, %for.body ]
+  %yur.0 = phi i32 [ %rur, %if.then ], [ %pur, %for.body ]
+  store i32 %ysd.0, i32* %isd, align 4
+  store i32 %yud.0, i32* %iud, align 4
+  store i32 %ysr.0, i32* %isr, align 4
+  store i32 %yur.0, i32* %iur, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 128
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+define void @test_scalar2scalar(i32* nocapture %asd, i32* nocapture %bsd) {
+entry:
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %if.end
+  ret void
+
+; CHECK-LABEL: test_scalar2scalar
+; CHECK: vector.body:
+; CHECK:   br i1 %{{.*}}, label %[[THEN:[a-zA-Z0-9.]+]], label %[[FI:[a-zA-Z0-9.]+]]
+; CHECK: [[THEN]]:
+; CHECK:   %[[PD:[a-zA-Z0-9]+]] = sdiv i32 %{{.*}}, %{{.*}}
+; CHECK:   br label %[[FI]]
+; CHECK: [[FI]]:
+; CHECK:   %{{.*}} = phi i32 [ undef, %vector.body ], [ %[[PD]], %[[THEN]] ]
+
+for.body:                                         ; preds = %if.end, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %if.end ]
+  %isd = getelementptr inbounds i32, i32* %asd, i64 %indvars.iv
+  %lsd = load i32, i32* %isd, align 4
+  %isd.b = getelementptr inbounds i32, i32* %bsd, i64 %indvars.iv
+  %lsd.b = load i32, i32* %isd.b, align 4
+  %psd = add nsw i32 %lsd, 23
+  %cmp1 = icmp slt i32 %lsd, 100
+  br i1 %cmp1, label %if.then, label %if.end
+
+if.then:                                          ; preds = %for.body
+  %sd1 = sdiv i32 %psd, %lsd
+  %rsd = sdiv i32 %lsd.b, %sd1
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %for.body
+  %ysd.0 = phi i32 [ %rsd, %if.then ], [ %psd, %for.body ]
+  store i32 %ysd.0, i32* %isd, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 128
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+define void @pr30172(i32* nocapture %asd, i32* nocapture %bsd) {
+entry:
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %if.end
+  ret void
+
+; CHECK-LABEL: pr30172
+; CHECK: vector.body:
+; CHECK: %[[CMP1:.+]] = icmp slt <2 x i32> %[[VAL:.+]], <i32 100, i32 100>
+; CHECK: %[[CMP2:.+]] = icmp sge <2 x i32> %[[VAL]], <i32 200, i32 200>
+; CHECK: %[[NOT:.+]] = xor <2 x i1> %[[CMP1]], <i1 true, i1 true>
+; CHECK: %[[AND:.+]] = and <2 x i1> %[[CMP2]], %[[NOT]]
+; CHECK: %[[OR:.+]] = or <2 x i1> %[[AND]], %[[CMP1]]
+; CHECK: %[[EXTRACT:.+]] = extractelement <2 x i1> %[[OR]], i32 0
+; CHECK: br i1 %[[EXTRACT]], label %[[THEN:[a-zA-Z0-9.]+]], label %[[FI:[a-zA-Z0-9.]+]]
+; CHECK: [[THEN]]:
+; CHECK:   %[[PD:[a-zA-Z0-9]+]] = sdiv i32 %{{.*}}, %{{.*}}
+; CHECK:   br label %[[FI]]
+; CHECK: [[FI]]:
+; CHECK:   %{{.*}} = phi i32 [ undef, %vector.body ], [ %[[PD]], %[[THEN]] ]
+
+
+for.body:                                         ; preds = %if.end, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %if.end ]
+  %isd = getelementptr inbounds i32, i32* %asd, i64 %indvars.iv
+  %lsd = load i32, i32* %isd, align 4
+  %isd.b = getelementptr inbounds i32, i32* %bsd, i64 %indvars.iv
+  %lsd.b = load i32, i32* %isd.b, align 4
+  %psd = add nsw i32 %lsd, 23
+  %cmp1 = icmp slt i32 %lsd, 100
+  br i1 %cmp1, label %if.then, label %check
+
+check:                                            ; preds = %for.body
+  %cmp2 = icmp sge i32 %lsd, 200
+  br i1 %cmp2, label %if.then, label %if.end
+
+if.then:                                          ; preds = %check, %for.body
+  %sd1 = sdiv i32 %psd, %lsd
+  %rsd = sdiv i32 %lsd.b, %sd1
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %check
+  %ysd.0 = phi i32 [ %rsd, %if.then ], [ %psd, %check ] 
+  store i32 %ysd.0, i32* %isd, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 128
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+
+define i32 @predicated_udiv_scalarized_operand(i32* %a, i1 %c, i32 %x, i64 %n) {
+entry:
+  br label %for.body
+
+; CHECK-LABEL: predicated_udiv_scalarized_operand
+; CHECK: vector.body:
+; CHECK:   %wide.load = load <2 x i32>, <2 x i32>* {{.*}}, align 4
+; CHECK:   br i1 {{.*}}, label %[[IF0:.+]], label %[[CONT0:.+]]
+; CHECK: [[IF0]]:
+; CHECK:   %[[T00:.+]] = extractelement <2 x i32> %wide.load, i32 0
+; CHECK:   %[[T01:.+]] = add nsw i32 %[[T00]], %x
+; CHECK:   %[[T02:.+]] = extractelement <2 x i32> %wide.load, i32 0
+; CHECK:   %[[T03:.+]] = udiv i32 %[[T02]], %[[T01]]
+; CHECK:   %[[T04:.+]] = insertelement <2 x i32> undef, i32 %[[T03]], i32 0
+; CHECK:   br label %[[CONT0]]
+; CHECK: [[CONT0]]:
+; CHECK:   %[[T05:.+]] = phi <2 x i32> [ undef, %vector.body ], [ %[[T04]], %[[IF0]] ]
+; CHECK:   br i1 {{.*}}, label %[[IF1:.+]], label %[[CONT1:.+]]
+; CHECK: [[IF1]]:
+; CHECK:   %[[T06:.+]] = extractelement <2 x i32> %wide.load, i32 1
+; CHECK:   %[[T07:.+]] = add nsw i32 %[[T06]], %x
+; CHECK:   %[[T08:.+]] = extractelement <2 x i32> %wide.load, i32 1
+; CHECK:   %[[T09:.+]] = udiv i32 %[[T08]], %[[T07]]
+; CHECK:   %[[T10:.+]] = insertelement <2 x i32> %[[T05]], i32 %[[T09]], i32 1
+; CHECK:   br label %[[CONT1]]
+; CHECK: [[CONT1]]:
+; CHECK:   phi <2 x i32> [ %[[T05]], %[[CONT0]] ], [ %[[T10]], %[[IF1]] ]
+; CHECK:   br i1 {{.*}}, label %middle.block, label %vector.body
+
+; Test predicating an instruction that feeds a vectorizable use, when unrolled
+; but not vectorized. Derived from the pr34248 reproducer; a rough C sketch
+; follows the UNROLL-NO-VF checks below.
+;
+; UNROLL-NO-VF-LABEL: predicated_udiv_scalarized_operand
+; UNROLL-NO-VF: vector.body:
+; UNROLL-NO-VF:   %[[LOAD0:.+]] = load i32, i32*
+; UNROLL-NO-VF:   %[[LOAD1:.+]] = load i32, i32*
+; UNROLL-NO-VF:   br i1 {{.*}}, label %[[IF0:.+]], label %[[CONT0:.+]]
+; UNROLL-NO-VF: [[IF0]]:
+; UNROLL-NO-VF:   %[[ADD0:.+]] = add nsw i32 %[[LOAD0]], %x
+; UNROLL-NO-VF:   %[[DIV0:.+]] = udiv i32 %[[LOAD0]], %[[ADD0]]
+; UNROLL-NO-VF:   br label %[[CONT0]]
+; UNROLL-NO-VF: [[CONT0]]:
+; UNROLL-NO-VF:   phi i32 [ undef, %vector.body ], [ %[[DIV0]], %[[IF0]] ]
+; UNROLL-NO-VF:   br i1 {{.*}}, label %[[IF1:.+]], label %[[CONT1:.+]]
+; UNROLL-NO-VF: [[IF1]]:
+; UNROLL-NO-VF:   %[[ADD1:.+]] = add nsw i32 %[[LOAD1]], %x
+; UNROLL-NO-VF:   %[[DIV1:.+]] = udiv i32 %[[LOAD1]], %[[ADD1]]
+; UNROLL-NO-VF:   br label %[[CONT1]]
+; UNROLL-NO-VF: [[CONT1]]:
+; UNROLL-NO-VF:   phi i32 [ undef, %[[CONT0]] ], [ %[[DIV1]], %[[IF1]] ]
+; UNROLL-NO-VF:   br i1 {{.*}}, label %middle.block, label %vector.body
+;
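+; A rough C sketch of the loop (illustrative; %c is a loop-invariant flag, and
+; the udiv executes only when it holds):
+;
+; int predicated_udiv_scalarized_operand(int *a, int c, int x, long n) {
+;   int r = 0;
+;   for (long i = 0; i < n; i++) {
+;     int t = a[i];
+;     if (c)
+;       t = (unsigned)t / (unsigned)(t + x);  /* predicated udiv */
+;     r += t;
+;   }
+;   return r;
+; }
+;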
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.inc ]
+  %r = phi i32 [ 0, %entry ], [ %tmp6, %for.inc ]
+  %tmp0 = getelementptr inbounds i32, i32* %a, i64 %i
+  %tmp2 = load i32, i32* %tmp0, align 4
+  br i1 %c, label %if.then, label %for.inc
+
+if.then:
+  %tmp3 = add nsw i32 %tmp2, %x
+  %tmp4 = udiv i32 %tmp2, %tmp3
+  br label %for.inc
+
+for.inc:
+  %tmp5 = phi i32 [ %tmp2, %for.body ], [ %tmp4, %if.then]
+  %tmp6 = add i32 %r, %tmp5
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  %tmp7 = phi i32 [ %tmp6, %for.inc ]
+  ret i32 %tmp7
+}

Added: llvm/trunk/test/Transforms/LoopVectorize/if-pred-not-when-safe.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/if-pred-not-when-safe.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/if-pred-not-when-safe.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/if-pred-not-when-safe.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,89 @@
+; RUN: opt -S -force-vector-width=2 -force-vector-interleave=1 -loop-vectorize -verify-loop-info -simplifycfg < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+; Test no-predication of instructions that are provably safe, e.g. dividing by
+; a non-zero constant.
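+;
+; A rough C sketch of the pattern (illustrative; only the sdiv pair is shown,
+; the udiv/srem/urem cases follow the same shape):
+;
+;   for (int i = 0; i < 128; i++) {
+;     int sd = asd[i] + 23, sd0 = asd[i] + 27;
+;     if (asd[i] < 100) {
+;       sd /= 11;  /* non-zero constant divisor: safe, no predication needed */
+;       sd0 /= 0;  /* zero divisor (UB in C): must stay predicated in the IR */
+;     }
+;     asd[i] = sd; asd0[i] = sd0;
+;   }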
+define void @test(i32* nocapture %asd, i32* nocapture %aud,
+                  i32* nocapture %asr, i32* nocapture %aur,
+                  i32* nocapture %asd0, i32* nocapture %aud0,
+                  i32* nocapture %asr0, i32* nocapture %aur0
+) {
+entry:
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %if.end
+  ret void
+
+; CHECK-LABEL: test
+; CHECK: vector.body:
+; CHECK: %{{.*}} = sdiv <2 x i32> %{{.*}}, <i32 11, i32 11>
+; CHECK: %{{.*}} = udiv <2 x i32> %{{.*}}, <i32 13, i32 13>
+; CHECK: %{{.*}} = srem <2 x i32> %{{.*}}, <i32 17, i32 17>
+; CHECK: %{{.*}} = urem <2 x i32> %{{.*}}, <i32 19, i32 19>
+; CHECK-NOT: %{{.*}} = sdiv <2 x i32> %{{.*}}, <i32 0, i32 0>
+; CHECK-NOT: %{{.*}} = udiv <2 x i32> %{{.*}}, <i32 0, i32 0>
+; CHECK-NOT: %{{.*}} = srem <2 x i32> %{{.*}}, <i32 0, i32 0>
+; CHECK-NOT: %{{.*}} = urem <2 x i32> %{{.*}}, <i32 0, i32 0>
+
+for.body:                                         ; preds = %if.end, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %if.end ]
+  %isd = getelementptr inbounds i32, i32* %asd, i64 %indvars.iv
+  %iud = getelementptr inbounds i32, i32* %aud, i64 %indvars.iv
+  %isr = getelementptr inbounds i32, i32* %asr, i64 %indvars.iv
+  %iur = getelementptr inbounds i32, i32* %aur, i64 %indvars.iv
+  %lsd = load i32, i32* %isd, align 4
+  %lud = load i32, i32* %iud, align 4
+  %lsr = load i32, i32* %isr, align 4
+  %lur = load i32, i32* %iur, align 4
+  %psd = add nsw i32 %lsd, 23
+  %pud = add nsw i32 %lud, 24
+  %psr = add nsw i32 %lsr, 25
+  %pur = add nsw i32 %lur, 26
+  %isd0 = getelementptr inbounds i32, i32* %asd0, i64 %indvars.iv
+  %iud0 = getelementptr inbounds i32, i32* %aud0, i64 %indvars.iv
+  %isr0 = getelementptr inbounds i32, i32* %asr0, i64 %indvars.iv
+  %iur0 = getelementptr inbounds i32, i32* %aur0, i64 %indvars.iv
+  %lsd0 = load i32, i32* %isd0, align 4
+  %lud0 = load i32, i32* %iud0, align 4
+  %lsr0 = load i32, i32* %isr0, align 4
+  %lur0 = load i32, i32* %iur0, align 4
+  %psd0 = add nsw i32 %lsd, 27
+  %pud0 = add nsw i32 %lud, 28
+  %psr0 = add nsw i32 %lsr, 29
+  %pur0 = add nsw i32 %lur, 30
+  %cmp1 = icmp slt i32 %lsd, 100
+  br i1 %cmp1, label %if.then, label %if.end
+
+if.then:                                          ; preds = %for.body
+  %rsd = sdiv i32 %psd, 11
+  %rud = udiv i32 %pud, 13
+  %rsr = srem i32 %psr, 17
+  %rur = urem i32 %pur, 19
+  %rsd0 = sdiv i32 %psd0, 0
+  %rud0 = udiv i32 %pud0, 0
+  %rsr0 = srem i32 %psr0, 0
+  %rur0 = urem i32 %pur0, 0
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %for.body
+  %ysd.0 = phi i32 [ %rsd, %if.then ], [ %psd, %for.body ]
+  %yud.0 = phi i32 [ %rud, %if.then ], [ %pud, %for.body ]
+  %ysr.0 = phi i32 [ %rsr, %if.then ], [ %psr, %for.body ]
+  %yur.0 = phi i32 [ %rur, %if.then ], [ %pur, %for.body ]
+  %ysd0.0 = phi i32 [ %rsd0, %if.then ], [ %psd0, %for.body ]
+  %yud0.0 = phi i32 [ %rud0, %if.then ], [ %pud0, %for.body ]
+  %ysr0.0 = phi i32 [ %rsr0, %if.then ], [ %psr0, %for.body ]
+  %yur0.0 = phi i32 [ %rur0, %if.then ], [ %pur0, %for.body ]
+  store i32 %ysd.0, i32* %isd, align 4
+  store i32 %yud.0, i32* %iud, align 4
+  store i32 %ysr.0, i32* %isr, align 4
+  store i32 %yur.0, i32* %iur, align 4
+  store i32 %ysd0.0, i32* %isd0, align 4
+  store i32 %yud0.0, i32* %iud0, align 4
+  store i32 %ysr0.0, i32* %isr0, align 4
+  store i32 %yur0.0, i32* %iur0, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 128
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}

Added: llvm/trunk/test/Transforms/LoopVectorize/if-pred-stores.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/if-pred-stores.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/if-pred-stores.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/if-pred-stores.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,178 @@
+; RUN: opt -S -vectorize-num-stores-pred=1 -force-vector-width=1 -force-vector-interleave=2 -loop-vectorize -verify-loop-info -simplifycfg < %s | FileCheck %s --check-prefix=UNROLL
+; RUN: opt -S -vectorize-num-stores-pred=1 -force-vector-width=1 -force-vector-interleave=2 -loop-vectorize -verify-loop-info < %s | FileCheck %s --check-prefix=UNROLL-NOSIMPLIFY
+; RUN: opt -S -vectorize-num-stores-pred=1 -force-vector-width=2 -force-vector-interleave=1 -loop-vectorize -verify-loop-info -simplifycfg < %s | FileCheck %s --check-prefix=VEC
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+; Test predication of stores.
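+;
+; A rough C equivalent (illustrative):
+;
+; int test(int *f) {
+;   for (int i = 0; i < 128; i++)
+;     if (f[i] > 100)
+;       f[i] += 20;  /* the store happens only when the compare is true */
+;   return 0;
+; }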
+define i32 @test(i32* nocapture %f) #0 {
+entry:
+  br label %for.body
+
+; VEC-LABEL: test
+; VEC:   %[[v0:.+]] = add i64 %index, 0
+; VEC:   %[[v2:.+]] = getelementptr inbounds i32, i32* %f, i64 %[[v0]]
+; VEC:   %[[v8:.+]] = icmp sgt <2 x i32> %{{.*}}, <i32 100, i32 100>
+; VEC:   %[[v11:.+]] = extractelement <2 x i1> %[[v8]], i32 0
+; VEC:   br i1 %[[v11]], label %[[cond:.+]], label %[[else:.+]]
+;
+; VEC: [[cond]]:
+; VEC:   %[[v13:.+]] = extractelement <2 x i32> %wide.load, i32 0
+; VEC:   %[[v9a:.+]] = add nsw i32 %[[v13]], 20
+; VEC:   store i32 %[[v9a]], i32* %[[v2]], align 4
+; VEC:   br label %[[else:.+]]
+;
+; VEC: [[else]]:
+; VEC:   %[[v15:.+]] = extractelement <2 x i1> %[[v8]], i32 1
+; VEC:   br i1 %[[v15]], label %[[cond2:.+]], label %[[else2:.+]]
+;
+; VEC: [[cond2]]:
+; VEC:   %[[v17:.+]] = extractelement <2 x i32> %wide.load, i32 1
+; VEC:   %[[v9b:.+]] = add nsw i32 %[[v17]], 20
+; VEC:   %[[v1:.+]] = add i64 %index, 1
+; VEC:   %[[v4:.+]] = getelementptr inbounds i32, i32* %f, i64 %[[v1]]
+; VEC:   store i32 %[[v9b]], i32* %[[v4]], align 4
+; VEC:   br label %[[else2:.+]]
+;
+; VEC: [[else2]]:
+
+; UNROLL-LABEL: test
+; UNROLL: vector.body:
+; UNROLL:   %[[IND:[a-zA-Z0-9]+]] = add i64 %{{.*}}, 0
+; UNROLL:   %[[IND1:[a-zA-Z0-9]+]] = add i64 %{{.*}}, 1
+; UNROLL:   %[[v0:[a-zA-Z0-9]+]] = getelementptr inbounds i32, i32* %f, i64 %[[IND]]
+; UNROLL:   %[[v1:[a-zA-Z0-9]+]] = getelementptr inbounds i32, i32* %f, i64 %[[IND1]]
+; UNROLL:   %[[v2:[a-zA-Z0-9]+]] = load i32, i32* %[[v0]], align 4
+; UNROLL:   %[[v3:[a-zA-Z0-9]+]] = load i32, i32* %[[v1]], align 4
+; UNROLL:   %[[v4:[a-zA-Z0-9]+]] = icmp sgt i32 %[[v2]], 100
+; UNROLL:   %[[v5:[a-zA-Z0-9]+]] = icmp sgt i32 %[[v3]], 100
+; UNROLL:   br i1 %[[v4]], label %[[cond:[a-zA-Z0-9.]+]], label %[[else:[a-zA-Z0-9.]+]]
+;
+; UNROLL: [[cond]]:
+; UNROLL:   %[[v6:[a-zA-Z0-9]+]] = add nsw i32 %[[v2]], 20
+; UNROLL:   store i32 %[[v6]], i32* %[[v0]], align 4
+; UNROLL:   br label %[[else]]
+;
+; UNROLL: [[else]]:
+; UNROLL:   br i1 %[[v5]], label %[[cond2:[a-zA-Z0-9.]+]], label %[[else2:[a-zA-Z0-9.]+]]
+;
+; UNROLL: [[cond2]]:
+; UNROLL:   %[[v7:[a-zA-Z0-9]+]] = add nsw i32 %[[v3]], 20
+; UNROLL:   store i32 %[[v7]], i32* %[[v1]], align 4
+; UNROLL:   br label %[[else2]]
+;
+; UNROLL: [[else2]]:
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ]
+  %arrayidx = getelementptr inbounds i32, i32* %f, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %cmp1 = icmp sgt i32 %0, 100
+  br i1 %cmp1, label %if.then, label %for.inc
+
+if.then:
+  %add = add nsw i32 %0, 20
+  store i32 %add, i32* %arrayidx, align 4
+  br label %for.inc
+
+for.inc:
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 128
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret i32 0
+}
+
+; Track basic blocks when unrolling conditional blocks. This code used to assert
+; because we did not update the phi nodes with the proper predecessor in the
+; vectorized loop body.
+; PR18724
+
+; UNROLL-NOSIMPLIFY-LABEL: bug18724
+; UNROLL-NOSIMPLIFY: store i32
+; UNROLL-NOSIMPLIFY: store i32
+
+define void @bug18724() {
+entry:
+  br label %for.body9
+
+for.body9:
+  br i1 undef, label %for.inc26, label %for.body14
+
+for.body14:
+  %indvars.iv3 = phi i64 [ %indvars.iv.next4, %for.inc23 ], [ undef, %for.body9 ]
+  %iNewChunks.120 = phi i32 [ %iNewChunks.2, %for.inc23 ], [ undef, %for.body9 ]
+  %arrayidx16 = getelementptr inbounds [768 x i32], [768 x i32]* undef, i64 0, i64 %indvars.iv3
+  %tmp = load i32, i32* %arrayidx16, align 4
+  br i1 undef, label %if.then18, label %for.inc23
+
+if.then18:
+  store i32 2, i32* %arrayidx16, align 4
+  %inc21 = add nsw i32 %iNewChunks.120, 1
+  br label %for.inc23
+
+for.inc23:
+  %iNewChunks.2 = phi i32 [ %inc21, %if.then18 ], [ %iNewChunks.120, %for.body14 ]
+  %indvars.iv.next4 = add nsw i64 %indvars.iv3, 1
+  %tmp1 = trunc i64 %indvars.iv3 to i32
+  %cmp13 = icmp slt i32 %tmp1, 0
+  br i1 %cmp13, label %for.body14, label %for.inc26
+
+for.inc26:
+  %iNewChunks.1.lcssa = phi i32 [ undef, %for.body9 ], [ %iNewChunks.2, %for.inc23 ]
+  unreachable
+}
+
+; VEC-LABEL: @minimal_bit_widths(
+;
+; In the test below, it's more profitable for the expression feeding the
+; conditional store to remain scalar. Since we can only type-shrink vector
+; types, we shouldn't try to represent the expression in a smaller type. A
+; rough C sketch follows the VEC checks below.
+;
+; VEC: vector.body:
+; VEC:   %wide.load = load <2 x i8>, <2 x i8>* {{.*}}, align 1
+; VEC:   br i1 {{.*}}, label %[[IF0:.+]], label %[[CONT0:.+]]
+; VEC: [[IF0]]:
+; VEC:   %[[E0:.+]] = extractelement <2 x i8> %wide.load, i32 0
+; VEC:   %[[Z0:.+]] = zext i8 %[[E0]] to i32
+; VEC:   %[[T0:.+]] = trunc i32 %[[Z0]] to i8
+; VEC:   store i8 %[[T0]], i8* {{.*}}, align 1
+; VEC:   br label %[[CONT0]]
+; VEC: [[CONT0]]:
+; VEC:   br i1 {{.*}}, label %[[IF1:.+]], label %[[CONT1:.+]]
+; VEC: [[IF1]]:
+; VEC:   %[[E1:.+]] = extractelement <2 x i8> %wide.load, i32 1
+; VEC:   %[[Z1:.+]] = zext i8 %[[E1]] to i32
+; VEC:   %[[T1:.+]] = trunc i32 %[[Z1]] to i8
+; VEC:   store i8 %[[T1]], i8* {{.*}}, align 1
+; VEC:   br label %[[CONT1]]
+; VEC: [[CONT1]]:
+; VEC:   br i1 {{.*}}, label %middle.block, label %vector.body
+;
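+; A rough C sketch of @minimal_bit_widths below (illustrative; the pointer and
+; trip count are undef in the reduced IR, so the names here are made up):
+;
+;   for (long i = 0; n != 0; i++, n--) {
+;     unsigned char t = p[i];
+;     if (c)
+;       p[i] = (unsigned char)(unsigned int)t;  /* widen, then narrow again */
+;   }
+;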
+define void @minimal_bit_widths(i1 %c) {
+entry:
+  br label %for.body
+
+for.body:
+  %tmp0 = phi i64 [ %tmp6, %for.inc ], [ 0, %entry ]
+  %tmp1 = phi i64 [ %tmp7, %for.inc ], [ undef, %entry ]
+  %tmp2 = getelementptr i8, i8* undef, i64 %tmp0
+  %tmp3 = load i8, i8* %tmp2, align 1
+  br i1 %c, label %if.then, label %for.inc
+
+if.then:
+  %tmp4 = zext i8 %tmp3 to i32
+  %tmp5 = trunc i32 %tmp4 to i8
+  store i8 %tmp5, i8* %tmp2, align 1
+  br label %for.inc
+
+for.inc:
+  %tmp6 = add nuw nsw i64 %tmp0, 1
+  %tmp7 = add i64 %tmp1, -1
+  %tmp8 = icmp eq i64 %tmp7, 0
+  br i1 %tmp8, label %for.end, label %for.body
+
+for.end:
+  ret void
+}

Added: llvm/trunk/test/Transforms/LoopVectorize/if-reduction.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/if-reduction.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/if-reduction.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/if-reduction.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,821 @@
+; RUN: opt -S -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 < %s | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+
+; Float pattern:
+;   Check vectorization of reduction code which has an fadd instruction after
+;   an fcmp instruction which compares an array element and 0.
+;
+; float fcmp_0_fadd_select1(float * restrict x, const int N) {
+;   float sum = 0.0;
+;   for (int i = 0; i < N; ++i)
+;     if (x[i] > (float)0.)
+;       sum += x[i];
+;   return sum;
+; }
+
+; CHECK-LABEL: @fcmp_0_fadd_select1(
+; CHECK: %[[V1:.*]] = fcmp fast ogt <4 x float> %[[V0:.*]], zeroinitializer
+; CHECK: %[[V3:.*]] = fadd fast <4 x float> %[[V0]], %[[V2:.*]]
+; CHECK: select <4 x i1> %[[V1]], <4 x float> %[[V3]], <4 x float> %[[V2]]
+define float @fcmp_0_fadd_select1(float* noalias %x, i32 %N) nounwind readonly {
+entry:
+  %cmp.1 = icmp sgt i32 %N, 0
+  br i1 %cmp.1, label %for.header, label %for.end
+
+for.header:                                       ; preds = %entry
+  %zext = zext i32 %N to i64
+  br label %for.body
+
+for.body:                                         ; preds = %for.header, %for.body
+  %indvars.iv = phi i64 [ 0, %for.header ], [ %indvars.iv.next, %for.body ]
+  %sum.1 = phi float [ 0.000000e+00, %for.header ], [ %sum.2, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %x, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %cmp.2 = fcmp fast ogt float %0, 0.000000e+00
+  %add = fadd fast float %0, %sum.1
+  %sum.2 = select i1 %cmp.2, float %add, float %sum.1
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %zext
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  %1 = phi float [ 0.000000e+00, %entry ], [ %sum.2, %for.body ]
+  ret float %1
+}
+
+; Double pattern:
+;   Check vectorization of reduction code which has an fadd instruction after
+;   an fcmp instruction which compares an array element and 0.
+;
+; double fcmp_0_fadd_select2(double * restrict x, const int N) {
+;   double sum = 0.0;
+;   for (int i = 0; i < N; ++i)
+;     if (x[i] > 0.)
+;       sum += x[i];
+;   return sum;
+; }
+
+; CHECK-LABEL: @fcmp_0_fadd_select2(
+; CHECK: %[[V1:.*]] = fcmp fast ogt <4 x double> %[[V0:.*]], zeroinitializer
+; CHECK: %[[V3:.*]] = fadd fast <4 x double> %[[V0]], %[[V2:.*]]
+; CHECK: select <4 x i1> %[[V1]], <4 x double> %[[V3]], <4 x double> %[[V2]]
+define double @fcmp_0_fadd_select2(double* noalias %x, i32 %N) nounwind readonly {
+entry:
+  %cmp.1 = icmp sgt i32 %N, 0
+  br i1 %cmp.1, label %for.header, label %for.end
+
+for.header:                                       ; preds = %entry
+  %zext = zext i32 %N to i64
+  br label %for.body
+
+for.body:                                         ; preds = %for.header, %for.body
+  %indvars.iv = phi i64 [ 0, %for.header ], [ %indvars.iv.next, %for.body ]
+  %sum.1 = phi double [ 0.000000e+00, %for.header ], [ %sum.2, %for.body ]
+  %arrayidx = getelementptr inbounds double, double* %x, i64 %indvars.iv
+  %0 = load double, double* %arrayidx, align 4
+  %cmp.2 = fcmp fast ogt double %0, 0.000000e+00
+  %add = fadd fast double %0, %sum.1
+  %sum.2 = select i1 %cmp.2, double %add, double %sum.1
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %zext
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  %1 = phi double [ 0.000000e+00, %entry ], [ %sum.2, %for.body ]
+  ret double %1
+}
+
+; Float pattern:
+;   Check vectorization of reduction code which has an fadd instruction after
+;   an fcmp instruction which compares an array element and a floating-point
+;   value.
+;
+; float fcmp_val_fadd_select1(float * restrict x, float y, const int N) {
+;   float sum = 0.0;
+;   for (int i = 0; i < N; ++i)
+;     if (x[i] > y)
+;       sum += x[i];
+;   return sum;
+; }
+
+; CHECK-LABEL: @fcmp_val_fadd_select1(
+; CHECK: %[[V1:.*]] = fcmp fast ogt <4 x float> %[[V0:.*]], %broadcast.splat2
+; CHECK: %[[V3:.*]] = fadd fast <4 x float> %[[V0]], %[[V2:.*]]
+; CHECK: select <4 x i1> %[[V1]], <4 x float> %[[V3]], <4 x float> %[[V2]]
+define float @fcmp_val_fadd_select1(float* noalias %x, float %y, i32 %N) nounwind readonly {
+entry:
+  %cmp.1 = icmp sgt i32 %N, 0
+  br i1 %cmp.1, label %for.header, label %for.end
+
+for.header:                                       ; preds = %entry
+  %zext = zext i32 %N to i64
+  br label %for.body
+
+for.body:                                         ; preds = %for.header, %for.body
+  %indvars.iv = phi i64 [ 0, %for.header ], [ %indvars.iv.next, %for.body ]
+  %sum.1 = phi float [ 0.000000e+00, %for.header ], [ %sum.2, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %x, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %cmp.2 = fcmp fast ogt float %0, %y
+  %add = fadd fast float %0, %sum.1
+  %sum.2 = select i1 %cmp.2, float %add, float %sum.1
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %zext
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  %1 = phi float [ 0.000000e+00, %entry ], [ %sum.2, %for.body ]
+  ret float %1
+}
+
+; Double pattern:
+;   Check vectorization of reduction code which has an fadd instruction after
+;   an fcmp instruction which compares an array element and a floating-point
+;   value.
+;
+; double fcmp_val_fadd_select2(double * restrict x, double y, const int N) {
+;   double sum = 0.0;
+;   for (int i = 0; i < N; ++i)
+;     if (x[i] > y)
+;       sum += x[i];
+;   return sum;
+; }
+
+; CHECK-LABEL: @fcmp_val_fadd_select2(
+; CHECK: %[[V1:.*]] = fcmp fast ogt <4 x double> %[[V0:.*]], %broadcast.splat2
+; CHECK: %[[V3:.*]] = fadd fast <4 x double> %[[V0]], %[[V2:.*]]
+; CHECK: select <4 x i1> %[[V1]], <4 x double> %[[V3]], <4 x double> %[[V2]]
+define double @fcmp_val_fadd_select2(double* noalias %x, double %y, i32 %N) nounwind readonly {
+entry:
+  %cmp.1 = icmp sgt i32 %N, 0
+  br i1 %cmp.1, label %for.header, label %for.end
+
+for.header:                                       ; preds = %entry
+  %zext = zext i32 %N to i64
+  br label %for.body
+
+for.body:                                         ; preds = %for.header, %for.body
+  %indvars.iv = phi i64 [ 0, %for.header ], [ %indvars.iv.next, %for.body ]
+  %sum.1 = phi double [ 0.000000e+00, %for.header ], [ %sum.2, %for.body ]
+  %arrayidx = getelementptr inbounds double, double* %x, i64 %indvars.iv
+  %0 = load double, double* %arrayidx, align 4
+  %cmp.2 = fcmp fast ogt double %0, %y
+  %add = fadd fast double %0, %sum.1
+  %sum.2 = select i1 %cmp.2, double %add, double %sum.1
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %zext
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  %1 = phi double [ 0.000000e+00, %entry ], [ %sum.2, %for.body ]
+  ret double %1
+}
+
+; Float pattern:
+;   Check vectorization of reduction code which has an fadd instruction after
+;   an fcmp instruction which compares an array element and another array
+;   element.
+;
+; float fcmp_array_elm_fadd_select1(float * restrict x, float * restrict y,
+;                                   const int N) {
+;   float sum = 0.0;
+;   for (int i = 0; i < N; ++i)
+;     if (x[i] > y[i])
+;       sum += x[i];
+;   return sum;
+; }
+
+; CHECK-LABEL: @fcmp_array_elm_fadd_select1(
+; CHECK: %[[V2:.*]] = fcmp fast ogt <4 x float> %[[V0:.*]], %[[V1:.*]]
+; CHECK: %[[V4:.*]] = fadd fast <4 x float> %[[V0]], %[[V3:.*]]
+; CHECK: select <4 x i1> %[[V2]], <4 x float> %[[V4]], <4 x float> %[[V3]]
+define float @fcmp_array_elm_fadd_select1(float* noalias %x, float* noalias %y, i32 %N) nounwind readonly {
+entry:
+  %cmp.1 = icmp sgt i32 %N, 0
+  br i1 %cmp.1, label %for.header, label %for.end
+
+for.header:                                       ; preds = %entry
+  %zext = zext i32 %N to i64
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %for.header
+  %indvars.iv = phi i64 [ 0, %for.header ], [ %indvars.iv.next, %for.body ]
+  %sum.1 = phi float [ 0.000000e+00, %for.header ], [ %sum.2, %for.body ]
+  %arrayidx.1 = getelementptr inbounds float, float* %x, i64 %indvars.iv
+  %0 = load float, float* %arrayidx.1, align 4
+  %arrayidx.2 = getelementptr inbounds float, float* %y, i64 %indvars.iv
+  %1 = load float, float* %arrayidx.2, align 4
+  %cmp.2 = fcmp fast ogt float %0, %1
+  %add = fadd fast float %0, %sum.1
+  %sum.2 = select i1 %cmp.2, float %add, float %sum.1
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %zext
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  %2 = phi float [ 0.000000e+00, %entry ], [ %sum.2, %for.body ]
+  ret float %2
+}
+
+; Double pattern:
+;   Check vectorization of reduction code which has an fadd instruction after
+;   an fcmp instruction which compares an array element and another array
+;   element.
+;
+; double fcmp_array_elm_fadd_select2(double * restrict x, double * restrict y,
+;                                    const int N) {
+;   double sum = 0.0;
+;   for (int i = 0; i < N; ++i)
+;     if (x[i] > y[i])
+;       sum += x[i];
+;   return sum;
+; }
+
+; CHECK-LABEL: @fcmp_array_elm_fadd_select2(
+; CHECK: %[[V2:.*]] = fcmp fast ogt <4 x double> %[[V0:.*]], %[[V1:.*]]
+; CHECK: %[[V4:.*]] = fadd fast <4 x double> %[[V0]], %[[V3:.*]]
+; CHECK: select <4 x i1> %[[V2]], <4 x double> %[[V4]], <4 x double> %[[V3]]
+define double @fcmp_array_elm_fadd_select2(double* noalias %x, double* noalias %y, i32 %N) nounwind readonly {
+entry:
+  %cmp.1 = icmp sgt i32 %N, 0
+  br i1 %cmp.1, label %for.header, label %for.end
+
+for.header:                                       ; preds = %entry
+  %zext = zext i32 %N to i64
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %for.header
+  %indvars.iv = phi i64 [ 0, %for.header ], [ %indvars.iv.next, %for.body ]
+  %sum.1 = phi double [ 0.000000e+00, %for.header ], [ %sum.2, %for.body ]
+  %arrayidx.1 = getelementptr inbounds double, double* %x, i64 %indvars.iv
+  %0 = load double, double* %arrayidx.1, align 4
+  %arrayidx.2 = getelementptr inbounds double, double* %y, i64 %indvars.iv
+  %1 = load double, double* %arrayidx.2, align 4
+  %cmp.2 = fcmp fast ogt double %0, %1
+  %add = fadd fast double %0, %sum.1
+  %sum.2 = select i1 %cmp.2, double %add, double %sum.1
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %zext
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  %2 = phi double [ 0.000000e+00, %entry ], [ %sum.2, %for.body ]
+  ret double %2
+}
+
+; Float pattern:
+;   Check vectorization of reduction code which has an fsub instruction after
+;   an fcmp instruction which compares an array element and 0.
+;
+; float fcmp_0_fsub_select1(float * restrict x, const int N) {
+;   float sum = 0.0;
+;   for (int i = 0; i < N; ++i)
+;     if (x[i] > (float)0.)
+;       sum -= x[i];
+;   return sum;
+; }
+
+; CHECK-LABEL: @fcmp_0_fsub_select1(
+; CHECK: %[[V1:.*]] = fcmp fast ogt <4 x float> %[[V0:.*]], zeroinitializer
+; CHECK: %[[V3:.*]] = fsub fast <4 x float> %[[V2:.*]], %[[V0]]
+; CHECK: select <4 x i1> %[[V1]], <4 x float> %[[V3]], <4 x float> %[[V2]]
+define float @fcmp_0_fsub_select1(float* noalias %x, i32 %N) nounwind readonly {
+entry:
+  %cmp.1 = icmp sgt i32 %N, 0
+  br i1 %cmp.1, label %for.header, label %for.end
+
+for.header:                                       ; preds = %entry
+  %zext = zext i32 %N to i64
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %for.header
+  %indvars.iv = phi i64 [ 0, %for.header ], [ %indvars.iv.next, %for.body ]
+  %sum.1 = phi float [ 0.000000e+00, %for.header ], [ %sum.2, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %x, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %cmp.2 = fcmp fast ogt float %0, 0.000000e+00
+  %sub = fsub fast float %sum.1, %0
+  %sum.2 = select i1 %cmp.2, float %sub, float %sum.1
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %zext
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  %1 = phi float [ 0.000000e+00, %entry ], [ %sum.2, %for.body ]
+  ret float %1
+}
+
+; Float pattern:
+;   Check that it is not vectorized if the fp instruction has no fast-math property.
+; float fcmp_0_fsub_select1_novectorize(float * restrict x, const int N) {
+;   float sum = 0.0;
+;   for (int i = 0; i < N; ++i)
+;     if (x[i] > (float)0.)
+;       sum -= x[i];
+;   return sum;
+; }
+
+; CHECK-LABEL: @fcmp_0_fsub_select1_novectorize(
+; CHECK-NOT: <4 x float>
+define float @fcmp_0_fsub_select1_novectorize(float* noalias %x, i32 %N) nounwind readonly {
+entry:
+  %cmp.1 = icmp sgt i32 %N, 0
+  br i1 %cmp.1, label %for.header, label %for.end
+
+for.header:                                       ; preds = %entry
+  %zext = zext i32 %N to i64
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %for.header
+  %indvars.iv = phi i64 [ 0, %for.header ], [ %indvars.iv.next, %for.body ]
+  %sum.1 = phi float [ 0.000000e+00, %for.header ], [ %sum.2, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %x, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %cmp.2 = fcmp ogt float %0, 0.000000e+00
+  %sub = fsub float %sum.1, %0
+  %sum.2 = select i1 %cmp.2, float %sub, float %sum.1
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %zext
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  %1 = phi float [ 0.000000e+00, %entry ], [ %sum.2, %for.body ]
+  ret float %1
+}
+
+; Double pattern:
+;   Check vectorization of reduction code which has an fsub instruction after
+;   an fcmp instruction which compares an array element and 0.
+;
+; double fcmp_0_fsub_select2(double * restrict x, const int N) {
+;   double sum = 0.0;
+;   for (int i = 0; i < N; ++i)
+;     if (x[i] > 0.)
+;       sum -= x[i];
+;   return sum;
+; }
+
+; CHECK-LABEL: @fcmp_0_fsub_select2(
+; CHECK: %[[V1:.*]] = fcmp fast ogt <4 x double> %[[V0:.*]], zeroinitializer
+; CHECK: %[[V3:.*]] = fsub fast <4 x double> %[[V2:.*]], %[[V0]]
+; CHECK: select <4 x i1> %[[V1]], <4 x double> %[[V3]], <4 x double> %[[V2]]
+define double @fcmp_0_fsub_select2(double* noalias %x, i32 %N) nounwind readonly {
+entry:
+  %cmp.1 = icmp sgt i32 %N, 0
+  br i1 %cmp.1, label %for.header, label %for.end
+
+for.header:                                       ; preds = %entry
+  %zext = zext i32 %N to i64
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %for.header
+  %indvars.iv = phi i64 [ 0, %for.header ], [ %indvars.iv.next, %for.body ]
+  %sum.1 = phi double [ 0.000000e+00, %for.header ], [ %sum.2, %for.body ]
+  %arrayidx = getelementptr inbounds double, double* %x, i64 %indvars.iv
+  %0 = load double, double* %arrayidx, align 4
+  %cmp.2 = fcmp fast ogt double %0, 0.000000e+00
+  %sub = fsub fast double %sum.1, %0
+  %sum.2 = select i1 %cmp.2, double %sub, double %sum.1
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %zext
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  %1 = phi double [ 0.000000e+00, %entry ], [ %sum.2, %for.body ]
+  ret double %1
+}
+
+; Double pattern:
+; Check that it is not vectorized if the fp instruction has no fast-math property.
+;
+; double fcmp_0_fsub_select2_notvectorize(double * restrict x, const int N) {
+;   double sum = 0.0;
+;   for (int i = 0; i < N; ++i)
+;     if (x[i] > 0.)
+;       sum -= x[i];
+;   return sum;
+; }
+
+; CHECK-LABEL: @fcmp_0_fsub_select2_notvectorize(
+; CHECK-NOT: <4 x double>
+define double @fcmp_0_fsub_select2_notvectorize(double* noalias %x, i32 %N) nounwind readonly {
+entry:
+  %cmp.1 = icmp sgt i32 %N, 0
+  br i1 %cmp.1, label %for.header, label %for.end
+
+for.header:                                       ; preds = %entry
+  %zext = zext i32 %N to i64
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %for.header
+  %indvars.iv = phi i64 [ 0, %for.header ], [ %indvars.iv.next, %for.body ]
+  %sum.1 = phi double [ 0.000000e+00, %for.header ], [ %sum.2, %for.body ]
+  %arrayidx = getelementptr inbounds double, double* %x, i64 %indvars.iv
+  %0 = load double, double* %arrayidx, align 4
+  %cmp.2 = fcmp ogt double %0, 0.000000e+00
+  %sub = fsub double %sum.1, %0
+  %sum.2 = select i1 %cmp.2, double %sub, double %sum.1
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %zext
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  %1 = phi double [ 0.000000e+00, %entry ], [ %sum.2, %for.body ]
+  ret double %1
+}
+
+; Float pattern:
+;   Check vectorization of reduction code which has an fmul instruction after
+;   an fcmp instruction which compares an array element and 0.
+;
+; float fcmp_0_fmult_select1(float * restrict x, const int N) {
+;   float sum = 0.0;
+;   for (int i = 0; i < N; ++i)
+;     if (x[i] > (float)0.)
+;       sum *= x[i];
+;   return sum;
+; }
+
+; CHECK-LABEL: @fcmp_0_fmult_select1(
+; CHECK: %[[V1:.*]] = fcmp fast ogt <4 x float> %[[V0:.*]], zeroinitializer
+; CHECK: %[[V3:.*]] = fmul fast <4 x float> %[[V2:.*]], %[[V0]]
+; CHECK: select <4 x i1> %[[V1]], <4 x float> %[[V3]], <4 x float> %[[V2]]
+define float @fcmp_0_fmult_select1(float* noalias %x, i32 %N) nounwind readonly {
+entry:
+  %cmp.1 = icmp sgt i32 %N, 0
+  br i1 %cmp.1, label %for.header, label %for.end
+
+for.header:                                       ; preds = %entry
+  %zext = zext i32 %N to i64
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %for.header
+  %indvars.iv = phi i64 [ 0, %for.header ], [ %indvars.iv.next, %for.body ]
+  %sum.1 = phi float [ 0.000000e+00, %for.header ], [ %sum.2, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %x, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %cmp.2 = fcmp fast ogt float %0, 0.000000e+00
+  %mult = fmul fast float %sum.1, %0
+  %sum.2 = select i1 %cmp.2, float %mult, float %sum.1
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %zext
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  %1 = phi float [ 0.000000e+00, %entry ], [ %sum.2, %for.body ]
+  ret float %1
+}
+
+; Float pattern:
+;   Check that it is not vectorized if the fp instruction has no fast-math property.
+;
+; float fcmp_0_fmult_select1_notvectorize(float * restrict x, const int N) {
+;   float sum = 0.0;
+;   for (int i = 0; i < N; ++i)
+;     if (x[i] > (float)0.)
+;       sum *= x[i];
+;   return sum;
+; }
+
+; CHECK-LABEL: @fcmp_0_fmult_select1_notvectorize(
+; CHECK-NOT: <4 x float>
+define float @fcmp_0_fmult_select1_notvectorize(float* noalias %x, i32 %N) nounwind readonly {
+entry:
+  %cmp.1 = icmp sgt i32 %N, 0
+  br i1 %cmp.1, label %for.header, label %for.end
+
+for.header:                                       ; preds = %entry
+  %zext = zext i32 %N to i64
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %for.header
+  %indvars.iv = phi i64 [ 0, %for.header ], [ %indvars.iv.next, %for.body ]
+  %sum.1 = phi float [ 0.000000e+00, %for.header ], [ %sum.2, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %x, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %cmp.2 = fcmp ogt float %0, 0.000000e+00
+  %mult = fmul float %sum.1, %0
+  %sum.2 = select i1 %cmp.2, float %mult, float %sum.1
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %zext
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  %1 = phi float [ 0.000000e+00, %entry ], [ %sum.2, %for.body ]
+  ret float %1
+}
+
+; Double pattern:
+;   Check vectorization of reduction code which has an fmul instruction after
+;   an fcmp instruction which compares an array element and 0.
+;
+; double fcmp_0_fmult_select2(double * restrict x, const int N) {
+;   double sum = 0.0;
+;   for (int i = 0; i < N; ++i)
+;     if (x[i] > 0.)
+;       sum *= x[i];
+;   return sum;
+; }
+
+; CHECK-LABEL: @fcmp_0_fmult_select2(
+; CHECK: %[[V1:.*]] = fcmp fast ogt <4 x double> %[[V0:.*]], zeroinitializer
+; CHECK: %[[V3:.*]] = fmul fast <4 x double> %[[V2:.*]], %[[V0]]
+; CHECK: select <4 x i1> %[[V1]], <4 x double> %[[V3]], <4 x double> %[[V2]]
+define double @fcmp_0_fmult_select2(double* noalias %x, i32 %N) nounwind readonly {
+entry:
+  %cmp.1 = icmp sgt i32 %N, 0
+  br i1 %cmp.1, label %for.header, label %for.end
+
+for.header:                                       ; preds = %entry
+  %zext = zext i32 %N to i64
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %for.header
+  %indvars.iv = phi i64 [ 0, %for.header ], [ %indvars.iv.next, %for.body ]
+  %sum.1 = phi double [ 0.000000e+00, %for.header ], [ %sum.2, %for.body ]
+  %arrayidx = getelementptr inbounds double, double* %x, i64 %indvars.iv
+  %0 = load double, double* %arrayidx, align 4
+  %cmp.2 = fcmp fast ogt double %0, 0.000000e+00
+  %mult = fmul fast double %sum.1, %0
+  %sum.2 = select i1 %cmp.2, double %mult, double %sum.1
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %zext
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  %1 = phi double [ 0.000000e+00, %entry ], [ %sum.2, %for.body ]
+  ret double %1
+}
+
+; Double pattern:
+;   Check that it is not vectorized if the fp instruction has no fast-math property.
+;
+; double fcmp_0_fmult_select2_notvectorize(double * restrict x, const int N) {
+;   double sum = 0.0;
+;   for (int i = 0; i < N; ++i)
+;     if (x[i] > 0.)
+;       sum *= x[i];
+;   return sum;
+; }
+
+; CHECK-LABEL: @fcmp_0_fmult_select2_notvectorize(
+; CHECK-NOT: <4 x double>
+define double @fcmp_0_fmult_select2_notvectorize(double* noalias %x, i32 %N) nounwind readonly {
+entry:
+  %cmp.1 = icmp sgt i32 %N, 0
+  br i1 %cmp.1, label %for.header, label %for.end
+
+for.header:                                       ; preds = %entry
+  %zext = zext i32 %N to i64
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %for.header
+  %indvars.iv = phi i64 [ 0, %for.header ], [ %indvars.iv.next, %for.body ]
+  %sum.1 = phi double [ 0.000000e+00, %for.header ], [ %sum.2, %for.body ]
+  %arrayidx = getelementptr inbounds double, double* %x, i64 %indvars.iv
+  %0 = load double, double* %arrayidx, align 4
+  %cmp.2 = fcmp ogt double %0, 0.000000e+00
+  %mult = fmul double %sum.1, %0
+  %sum.2 = select i1 %cmp.2, double %mult, double %sum.1
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %zext
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  %1 = phi double [ 0.000000e+00, %entry ], [ %sum.2, %for.body ]
+  ret double %1
+}
+
+; Float multi pattern
+;   Check vectorisation of reduction code with a pair of selects to different
+;   fadd patterns.
+;
+; float fcmp_multi(float *a, int n) {
+;   float sum=0.0;
+;   for (int i=0;i<n;i++) {
+;     if (a[i]>1.0)
+;       sum+=a[i];
+;     else if (a[i]<3.0)
+;       sum+=2*a[i];
+;     else
+;       sum+=3*a[i];
+;   }
+;   return sum;
+; }
+
+; CHECK-LABEL: @fcmp_multi(
+; CHECK: %[[C1:.*]] = fcmp ogt <4 x float> %[[V0:.*]], <float 1.000000e+00,
+; CHECK: %[[C2:.*]] = fcmp olt <4 x float> %[[V0]], <float 3.000000e+00,
+; CHECK-DAG: %[[M1:.*]] = fmul fast <4 x float> %[[V0]], <float 3.000000e+00,
+; CHECK-DAG: %[[M2:.*]] = fmul fast <4 x float> %[[V0]], <float 2.000000e+00,
+; CHECK: %[[C11:.*]] = xor <4 x i1> %[[C1]], <i1 true,
+; CHECK-DAG: %[[C12:.*]] = and <4 x i1> %[[C2]], %[[C11]]
+; CHECK-DAG: %[[C21:.*]] = xor <4 x i1> %[[C2]], <i1 true,
+; CHECK: %[[C22:.*]] = and <4 x i1> %[[C21]], %[[C11]]
+; CHECK: %[[S1:.*]] = select <4 x i1> %[[C22]], <4 x float> %[[M1]], <4 x float> %[[M2]]
+; CHECK: %[[S2:.*]] = select <4 x i1> %[[C1]], <4 x float> %[[V0]], <4 x float> %[[S1]]
+; CHECK: fadd fast <4 x float> %[[S2]],
+define float @fcmp_multi(float* nocapture readonly %a, i32 %n) nounwind readonly {
+entry:
+  %cmp10 = icmp sgt i32 %n, 0
+  br i1 %cmp10, label %for.body.preheader, label %for.end
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %n to i64
+  br label %for.body
+
+for.body:                                         ; preds = %for.inc, %for.body.preheader
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
+  %sum.011 = phi float [ 0.000000e+00, %for.body.preheader ], [ %sum.1, %for.inc ]
+  %arrayidx = getelementptr inbounds float, float* %a, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %cmp1 = fcmp ogt float %0, 1.000000e+00
+  br i1 %cmp1, label %for.inc, label %if.else
+
+if.else:                                          ; preds = %for.body
+  %cmp8 = fcmp olt float %0, 3.000000e+00
+  br i1 %cmp8, label %if.then10, label %if.else14
+
+if.then10:                                        ; preds = %if.else
+  %mul = fmul fast float %0, 2.000000e+00
+  br label %for.inc
+
+if.else14:                                        ; preds = %if.else
+  %mul17 = fmul fast float %0, 3.000000e+00
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body, %if.else14, %if.then10
+  %.pn = phi float [ %mul, %if.then10 ], [ %mul17, %if.else14 ], [ %0, %for.body ]
+  %sum.1 = fadd fast float %.pn, %sum.011
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.inc, %entry
+  %sum.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %sum.1, %for.inc ]
+  ret float %sum.0.lcssa
+}
+
+; Float fadd + fsub patterns
+;   Check vectorisation of reduction code with a pair of selects to different
+;   but compatible instructions { fadd, fsub } (an fsub is equivalent to an
+;   fadd of the negated operand).
+;
+; float fcmp_fadd_fsub(float *a, int n) {
+;   float sum=0.0;
+;   for (int i=0;i<n;i++) {
+;     if (a[i]>1.0)
+;       sum+=a[i];
+;     else if (a[i]<3.0)
+;       sum-=a[i];
+;   }
+;   return sum;
+; }
+
+; CHECK-LABEL: @fcmp_fadd_fsub(
+; CHECK: %[[C1:.*]] = fcmp ogt <4 x float> %[[V0:.*]], <float 1.000000e+00,
+; CHECK: %[[C2:.*]] = fcmp olt <4 x float> %[[V0]], <float 3.000000e+00,
+; CHECK-DAG: %[[SUB:.*]] = fsub fast <4 x float>
+; CHECK-DAG: %[[ADD:.*]] = fadd fast <4 x float>
+; CHECK: %[[C11:.*]] = xor <4 x i1> %[[C1]], <i1 true,
+; CHECK-DAG: %[[C12:.*]] = and <4 x i1> %[[C2]], %[[C11]]
+; CHECK-DAG: %[[C21:.*]] = xor <4 x i1> %[[C2]], <i1 true,
+; CHECK: %[[C22:.*]] = and <4 x i1> %[[C21]], %[[C11]]
+; CHECK: %[[S1:.*]] = select <4 x i1> %[[C12]], <4 x float> %[[SUB]], <4 x float> %[[ADD]]
+; CHECK: %[[S2:.*]] = select <4 x i1> %[[C22]], {{.*}} <4 x float> %[[S1]]
+define float @fcmp_fadd_fsub(float* nocapture readonly %a, i32 %n) nounwind readonly {
+entry:
+  %cmp9 = icmp sgt i32 %n, 0
+  br i1 %cmp9, label %for.body.preheader, label %for.end
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %n to i64
+  br label %for.body
+
+for.body:                                         ; preds = %for.inc, %for.body.preheader
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
+  %sum.010 = phi float [ 0.000000e+00, %for.body.preheader ], [ %sum.1, %for.inc ]
+  %arrayidx = getelementptr inbounds float, float* %a, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %cmp1 = fcmp ogt float %0, 1.000000e+00
+  br i1 %cmp1, label %if.then, label %if.else
+
+if.then:                                          ; preds = %for.body
+  %add = fadd fast float %0, %sum.010
+  br label %for.inc
+
+if.else:                                          ; preds = %for.body
+  %cmp8 = fcmp olt float %0, 3.000000e+00
+  br i1 %cmp8, label %if.then10, label %for.inc
+
+if.then10:                                        ; preds = %if.else
+  %sub = fsub fast float %sum.010, %0
+  br label %for.inc
+
+for.inc:                                          ; preds = %if.then, %if.then10, %if.else
+  %sum.1 = phi float [ %add, %if.then ], [ %sub, %if.then10 ], [ %sum.010, %if.else ]
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.inc, %entry
+  %sum.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %sum.1, %for.inc ]
+  ret float %sum.0.lcssa
+}
+
+; Float fadd + fmul patterns
+;   Check lack of vectorisation of reduction code with a pair of incompatible
+;   instructions { fadd, fmul }.
+;
+; float fcmp_fadd_fmul(float *a, int n) {
+;   float sum=0.0;
+;   for (int i=0;i<n;i++) {
+;     if (a[i]>1.0)
+;       sum+=a[i];
+;     else if (a[i]<3.0)
+;       sum*=a[i];
+;   }
+;   return sum;
+; }
+
+; CHECK-LABEL: @fcmp_fadd_fmul(
+; CHECK-NOT: <4 x float>
+define float @fcmp_fadd_fmul(float* nocapture readonly %a, i32 %n) nounwind readonly {
+entry:
+  %cmp9 = icmp sgt i32 %n, 0
+  br i1 %cmp9, label %for.body.preheader, label %for.end
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %n to i64
+  br label %for.body
+
+for.body:                                         ; preds = %for.inc, %for.body.preheader
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
+  %sum.010 = phi float [ 0.000000e+00, %for.body.preheader ], [ %sum.1, %for.inc ]
+  %arrayidx = getelementptr inbounds float, float* %a, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %cmp1 = fcmp ogt float %0, 1.000000e+00
+  br i1 %cmp1, label %if.then, label %if.else
+
+if.then:                                          ; preds = %for.body
+  %add = fadd fast float %0, %sum.010
+  br label %for.inc
+
+if.else:                                          ; preds = %for.body
+  %cmp8 = fcmp olt float %0, 3.000000e+00
+  br i1 %cmp8, label %if.then10, label %for.inc
+
+if.then10:                                        ; preds = %if.else
+  %mul = fmul fast float %0, %sum.010
+  br label %for.inc
+
+for.inc:                                          ; preds = %if.then, %if.then10, %if.else
+  %sum.1 = phi float [ %add, %if.then ], [ %mul, %if.then10 ], [ %sum.010, %if.else ]
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.inc, %entry
+  %sum.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %sum.1, %for.inc ]
+  ret float %sum.0.lcssa
+}
+
+; Float fadd + store patterns
+;   Check lack of vectorisation of reduction code with a store back, given it
+;   has a loop-carried dependency on a[i].
+;
+; float fcmp_store_back(float a[], int LEN) {
+;     float sum = 0.0;
+;     for (int i = 0; i < LEN; i++) {
+;       sum += a[i];
+;       a[i] = sum;
+;     }
+;     return sum;
+; }
+
+; CHECK-LABEL: @fcmp_store_back(
+; CHECK-NOT: <4 x float>
+define float @fcmp_store_back(float* nocapture %a, i32 %LEN) nounwind readonly {
+entry:
+  %cmp7 = icmp sgt i32 %LEN, 0
+  br i1 %cmp7, label %for.body.preheader, label %for.end
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %LEN to i64
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %for.body.preheader
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %sum.08 = phi float [ 0.000000e+00, %for.body.preheader ], [ %add, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %a, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %add = fadd fast float %0, %sum.08
+  store float %add, float* %arrayidx, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  %sum.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %add, %for.body ]
+  ret float %sum.0.lcssa
+}

Added: llvm/trunk/test/Transforms/LoopVectorize/incorrect-dom-info.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/incorrect-dom-info.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/incorrect-dom-info.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/incorrect-dom-info.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,142 @@
+; This test is based on one of the benchmarks from SPEC2006. It exposes a bug
+; with incorrect updating of the dom-tree.
+; RUN: opt < %s -loop-vectorize -verify-dom-info
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+ at PL_utf8skip = external constant [0 x i8]
+
+; Function Attrs: nounwind ssp uwtable
+define void @Perl_pp_quotemeta() #0 {
+  %len = alloca i64, align 8
+  br i1 undef, label %2, label %1
+
+; <label>:1                                       ; preds = %0
+  br label %3
+
+; <label>:2                                       ; preds = %0
+  br label %3
+
+; <label>:3                                       ; preds = %2, %1
+  br i1 undef, label %34, label %4
+
+; <label>:4                                       ; preds = %3
+  br i1 undef, label %5, label %6
+
+; <label>:5                                       ; preds = %4
+  br label %6
+
+; <label>:6                                       ; preds = %5, %4
+  br i1 undef, label %7, label %8
+
+; <label>:7                                       ; preds = %6
+  br label %8
+
+; <label>:8                                       ; preds = %7, %6
+  br i1 undef, label %.preheader, label %9
+
+.preheader:                                       ; preds = %9, %8
+  br i1 undef, label %.loopexit, label %.lr.ph
+
+; <label>:9                                       ; preds = %8
+  br i1 undef, label %thread-pre-split.preheader, label %.preheader
+
+thread-pre-split.preheader:                       ; preds = %9
+  br i1 undef, label %thread-pre-split._crit_edge, label %.lr.ph21
+
+.thread-pre-split.loopexit_crit_edge:             ; preds = %19
+  %scevgep.sum = xor i64 %umax, -1
+  %scevgep45 = getelementptr i8, i8* %d.020, i64 %scevgep.sum
+  br label %thread-pre-split.loopexit
+
+thread-pre-split.loopexit:                        ; preds = %11, %.thread-pre-split.loopexit_crit_edge
+  %d.1.lcssa = phi i8* [ %scevgep45, %.thread-pre-split.loopexit_crit_edge ], [ %d.020, %11 ]
+  br i1 false, label %thread-pre-split._crit_edge, label %.lr.ph21
+
+.lr.ph21:                                         ; preds = %26, %thread-pre-split.loopexit, %thread-pre-split.preheader
+  %d.020 = phi i8* [ undef, %26 ], [ %d.1.lcssa, %thread-pre-split.loopexit ], [ undef, %thread-pre-split.preheader ]
+  %10 = phi i64 [ %28, %26 ], [ undef, %thread-pre-split.loopexit ], [ undef, %thread-pre-split.preheader ]
+  br i1 undef, label %11, label %22
+
+; <label>:11                                      ; preds = %.lr.ph21
+  %12 = getelementptr inbounds [0 x i8], [0 x i8]* @PL_utf8skip, i64 0, i64 undef
+  %13 = load i8, i8* %12, align 1
+  %14 = zext i8 %13 to i64
+  %15 = icmp ugt i64 %14, %10
+  %. = select i1 %15, i64 %10, i64 %14
+  br i1 undef, label %thread-pre-split.loopexit, label %.lr.ph28
+
+.lr.ph28:                                         ; preds = %11
+  %16 = xor i64 %10, -1
+  %17 = xor i64 %14, -1
+  %18 = icmp ugt i64 %16, %17
+  %umax = select i1 %18, i64 %16, i64 %17
+  br label %19
+
+; <label>:19                                      ; preds = %19, %.lr.ph28
+  %ulen.126 = phi i64 [ %., %.lr.ph28 ], [ %20, %19 ]
+  %20 = add i64 %ulen.126, -1
+  %21 = icmp eq i64 %20, 0
+  br i1 %21, label %.thread-pre-split.loopexit_crit_edge, label %19
+
+; <label>:22                                      ; preds = %.lr.ph21
+  br i1 undef, label %26, label %23
+
+; <label>:23                                      ; preds = %22
+  br i1 undef, label %26, label %24
+
+; <label>:24                                      ; preds = %23
+  br i1 undef, label %26, label %25
+
+; <label>:25                                      ; preds = %24
+  br label %26
+
+; <label>:26                                      ; preds = %25, %24, %23, %22
+  %27 = load i64, i64* %len, align 8
+  %28 = add i64 %27, -1
+  br i1 undef, label %thread-pre-split._crit_edge, label %.lr.ph21
+
+thread-pre-split._crit_edge:                      ; preds = %26, %thread-pre-split.loopexit, %thread-pre-split.preheader
+  br label %.loopexit
+
+.lr.ph:                                           ; preds = %33, %.preheader
+  br i1 undef, label %29, label %thread-pre-split5
+
+; <label>:29                                      ; preds = %.lr.ph
+  br i1 undef, label %33, label %30
+
+; <label>:30                                      ; preds = %29
+  br i1 undef, label %33, label %31
+
+thread-pre-split5:                                ; preds = %.lr.ph
+  br i1 undef, label %33, label %31
+
+; <label>:31                                      ; preds = %thread-pre-split5, %30
+  br i1 undef, label %33, label %32
+
+; <label>:32                                      ; preds = %31
+  br label %33
+
+; <label>:33                                      ; preds = %32, %31, %thread-pre-split5, %30, %29
+  br i1 undef, label %.loopexit, label %.lr.ph
+
+.loopexit:                                        ; preds = %33, %thread-pre-split._crit_edge, %.preheader
+  br label %35
+
+; <label>:34                                      ; preds = %3
+  br label %35
+
+; <label>:35                                      ; preds = %34, %.loopexit
+  br i1 undef, label %37, label %36
+
+; <label>:36                                      ; preds = %35
+  br label %37
+
+; <label>:37                                      ; preds = %36, %35
+  ret void
+}
+
+attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!llvm.ident = !{!0}
+
+!0 = !{!"clang version 3.6.0 "}

Added: llvm/trunk/test/Transforms/LoopVectorize/increment.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/increment.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/increment.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/increment.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,65 @@
+; RUN: opt < %s  -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+ at a = common global [2048 x i32] zeroinitializer, align 16
+
+; This is the loop.
+;  for (i=0; i<n; i++){
+;    a[i] += i;
+;  }
+;CHECK-LABEL: @inc(
+;CHECK: load <4 x i32>
+;CHECK: add nsw <4 x i32>
+;CHECK: store <4 x i32>
+;CHECK: ret void
+define void @inc(i32 %n) nounwind uwtable noinline ssp {
+  %1 = icmp sgt i32 %n, 0
+  br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph:                                           ; preds = %0, %.lr.ph
+  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
+  %2 = getelementptr inbounds [2048 x i32], [2048 x i32]* @a, i64 0, i64 %indvars.iv
+  %3 = load i32, i32* %2, align 4
+  %4 = trunc i64 %indvars.iv to i32
+  %5 = add nsw i32 %3, %4
+  store i32 %5, i32* %2, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge:                                      ; preds = %.lr.ph, %0
+  ret void
+}
+
+; Can't vectorize this loop because the access to A[X] is non-linear.
+;
+;  for (i = 0; i < n; ++i) {
+;    A[B[i]]++;
+;  }
+;
+;CHECK-LABEL: @histogram(
+;CHECK-NOT: <4 x i32>
+;CHECK: ret i32
+define i32 @histogram(i32* nocapture noalias %A, i32* nocapture noalias %B, i32 %n) nounwind uwtable ssp {
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %idxprom1 = sext i32 %0 to i64
+  %arrayidx2 = getelementptr inbounds i32, i32* %A, i64 %idxprom1
+  %1 = load i32, i32* %arrayidx2, align 4
+  %inc = add nsw i32 %1, 1
+  store i32 %inc, i32* %arrayidx2, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret i32 0
+}

Added: llvm/trunk/test/Transforms/LoopVectorize/induction-step.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/induction-step.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/induction-step.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/induction-step.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,201 @@
+; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=8 -S | FileCheck %s
+
+; int int_inc;
+;
+;int induction_with_global(int init, int *restrict A, int N) {
+;  int x = init;
+;  for (int i=0;i<N;i++){
+;    A[i] = x;
+;    x += int_inc;
+;  }
+;  return x;
+;}
+
+; CHECK-LABEL: @induction_with_global(
+; CHECK:       for.body.lr.ph:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* @int_inc, align 4
+; CHECK:       vector.ph:
+; CHECK:         [[DOTSPLATINSERT:%.*]] = insertelement <8 x i32> undef, i32 %init, i32 0
+; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <8 x i32> [[DOTSPLATINSERT]], <8 x i32> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[DOTSPLATINSERT2:%.*]] = insertelement <8 x i32> undef, i32 [[TMP0]], i32 0
+; CHECK-NEXT:    [[DOTSPLAT3:%.*]] = shufflevector <8 x i32> [[DOTSPLATINSERT2]], <8 x i32> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = mul <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[DOTSPLAT3]]
+; CHECK-NEXT:    [[INDUCTION4:%.*]] = add <8 x i32> [[DOTSPLAT]], [[TMP6]]
+; CHECK-NEXT:    [[TMP7:%.*]] = mul i32 [[TMP0]], 8
+; CHECK-NEXT:    [[DOTSPLATINSERT5:%.*]] = insertelement <8 x i32> undef, i32 [[TMP7]], i32 0
+; CHECK-NEXT:    [[DOTSPLAT6:%.*]] = shufflevector <8 x i32> [[DOTSPLATINSERT5]], <8 x i32> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT:    br label %vector.body
+; CHECK:       vector.body:
+; CHECK-NEXT:    %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; CHECK-NEXT:    %vec.ind = phi <8 x i32> [ [[INDUCTION4]], %vector.ph ], [ %vec.ind.next, %vector.body ]
+; CHECK:         [[TMP8:%.*]] = add i64 %index, 0
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP9]], i32 0
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <8 x i32>*
+; CHECK-NEXT:    store <8 x i32> %vec.ind, <8 x i32>* [[TMP11]], align 4
+; CHECK:         %index.next = add i64 %index, 8
+; CHECK-NEXT:    %vec.ind.next = add <8 x i32> %vec.ind, [[DOTSPLAT6]]
+; CHECK:         br i1 {{.*}}, label %middle.block, label %vector.body
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+
+ at int_inc = common global i32 0, align 4
+
+define i32 @induction_with_global(i32 %init, i32* noalias nocapture %A, i32 %N) {
+entry:
+  %cmp4 = icmp sgt i32 %N, 0
+  br i1 %cmp4, label %for.body.lr.ph, label %for.end
+
+for.body.lr.ph:                                   ; preds = %entry
+  %0 = load i32, i32* @int_inc, align 4
+  %1 = mul i32 %0, %N
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %for.body.lr.ph
+  %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ]
+  %x.05 = phi i32 [ %init, %for.body.lr.ph ], [ %add, %for.body ]
+  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+  store i32 %x.05, i32* %arrayidx, align 4
+  %add = add nsw i32 %0, %x.05
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %N
+  br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:                                 ; preds = %for.body
+  %2 = add i32 %1, %init
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %entry
+  %x.0.lcssa = phi i32 [ %init, %entry ], [ %2, %for.end.loopexit ]
+  ret i32 %x.0.lcssa
+}
+
+
+;int induction_with_loop_inv(int init, int *restrict A, int N, int M) {
+;  int x = init;
+;  for (int j = 0; j < M; j++) {
+;    for (int i=0; i<N; i++){
+;      A[i] = x;
+;      x += j; // induction step is a loop invariant variable
+;    }
+;  }
+;  return x;
+;}
+
+; CHECK-LABEL: @induction_with_loop_inv(
+; CHECK:       vector.ph:
+; CHECK:         [[DOTSPLATINSERT:%.*]] = insertelement <8 x i32> undef, i32 %x.011, i32 0
+; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <8 x i32> [[DOTSPLATINSERT]], <8 x i32> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[DOTSPLATINSERT2:%.*]] = insertelement <8 x i32> undef, i32 %j.012, i32 0
+; CHECK-NEXT:    [[DOTSPLAT3:%.*]] = shufflevector <8 x i32> [[DOTSPLATINSERT2]], <8 x i32> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = mul <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[DOTSPLAT3]]
+; CHECK-NEXT:    [[INDUCTION4:%.*]] = add <8 x i32> [[DOTSPLAT]], [[TMP4]]
+; CHECK-NEXT:    [[TMP5:%.*]] = mul i32 %j.012, 8
+; CHECK-NEXT:    [[DOTSPLATINSERT5:%.*]] = insertelement <8 x i32> undef, i32 [[TMP5]], i32 0
+; CHECK-NEXT:    [[DOTSPLAT6:%.*]] = shufflevector <8 x i32> [[DOTSPLATINSERT5]], <8 x i32> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT:    br label %vector.body
+; CHECK:       vector.body:
+; CHECK-NEXT:    %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; CHECK-NEXT:    %vec.ind = phi <8 x i32> [ [[INDUCTION4]], %vector.ph ], [ %vec.ind.next, %vector.body ]
+; CHECK:         [[TMP6:%.*]] = add i64 %index, 0
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP7]], i32 0
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <8 x i32>*
+; CHECK-NEXT:    store <8 x i32> %vec.ind, <8 x i32>* [[TMP9]], align 4
+; CHECK:         %index.next = add i64 %index, 8
+; CHECK-NEXT:    %vec.ind.next = add <8 x i32> %vec.ind, [[DOTSPLAT6]]
+; CHECK:         br i1 {{.*}}, label %middle.block, label %vector.body
+
+define i32 @induction_with_loop_inv(i32 %init, i32* noalias nocapture %A, i32 %N, i32 %M) {
+entry:
+  %cmp10 = icmp sgt i32 %M, 0
+  br i1 %cmp10, label %for.cond1.preheader.lr.ph, label %for.end6
+
+for.cond1.preheader.lr.ph:                        ; preds = %entry
+  %cmp27 = icmp sgt i32 %N, 0
+  br label %for.cond1.preheader
+
+for.cond1.preheader:                              ; preds = %for.inc4, %for.cond1.preheader.lr.ph
+  %indvars.iv15 = phi i32 [ 0, %for.cond1.preheader.lr.ph ], [ %indvars.iv.next16, %for.inc4 ]
+  %j.012 = phi i32 [ 0, %for.cond1.preheader.lr.ph ], [ %inc5, %for.inc4 ]
+  %x.011 = phi i32 [ %init, %for.cond1.preheader.lr.ph ], [ %x.1.lcssa, %for.inc4 ]
+  br i1 %cmp27, label %for.body3.preheader, label %for.inc4
+
+for.body3.preheader:                              ; preds = %for.cond1.preheader
+  br label %for.body3
+
+for.body3:                                        ; preds = %for.body3.preheader, %for.body3
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body3 ], [ 0, %for.body3.preheader ]
+  %x.18 = phi i32 [ %add, %for.body3 ], [ %x.011, %for.body3.preheader ]
+  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+  store i32 %x.18, i32* %arrayidx, align 4
+  %add = add nsw i32 %x.18, %j.012
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %N
+  br i1 %exitcond, label %for.inc4.loopexit, label %for.body3
+
+for.inc4.loopexit:                                ; preds = %for.body3
+  %0 = add i32 %x.011, %indvars.iv15
+  br label %for.inc4
+
+for.inc4:                                         ; preds = %for.inc4.loopexit, %for.cond1.preheader
+  %x.1.lcssa = phi i32 [ %x.011, %for.cond1.preheader ], [ %0, %for.inc4.loopexit ]
+  %inc5 = add nuw nsw i32 %j.012, 1
+  %indvars.iv.next16 = add i32 %indvars.iv15, %N
+  %exitcond17 = icmp eq i32 %inc5, %M
+  br i1 %exitcond17, label %for.end6.loopexit, label %for.cond1.preheader
+
+for.end6.loopexit:                                ; preds = %for.inc4
+  %x.1.lcssa.lcssa = phi i32 [ %x.1.lcssa, %for.inc4 ]
+  br label %for.end6
+
+for.end6:                                         ; preds = %for.end6.loopexit, %entry
+  %x.0.lcssa = phi i32 [ %init, %entry ], [ %x.1.lcssa.lcssa, %for.end6.loopexit ]
+  ret i32 %x.0.lcssa
+}
+
+
+; CHECK-LABEL: @non_primary_iv_loop_inv_trunc(
+; CHECK:       vector.ph:
+; CHECK:         [[TMP3:%.*]] = trunc i64 %step to i32
+; CHECK-NEXT:    [[DOTSPLATINSERT5:%.*]] = insertelement <8 x i32> undef, i32 [[TMP3]], i32 0
+; CHECK-NEXT:    [[DOTSPLAT6:%.*]] = shufflevector <8 x i32> [[DOTSPLATINSERT5]], <8 x i32> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = mul <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[DOTSPLAT6]]
+; CHECK-NEXT:    [[INDUCTION7:%.*]] = add <8 x i32> zeroinitializer, [[TMP4]]
+; CHECK-NEXT:    [[TMP5:%.*]] = mul i32 [[TMP3]], 8
+; CHECK-NEXT:    [[DOTSPLATINSERT8:%.*]] = insertelement <8 x i32> undef, i32 [[TMP5]], i32 0
+; CHECK-NEXT:    [[DOTSPLAT9:%.*]] = shufflevector <8 x i32> [[DOTSPLATINSERT8]], <8 x i32> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT:    br label %vector.body
+; CHECK:       vector.body:
+; CHECK-NEXT:    %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; CHECK:         [[VEC_IND10:%.*]] = phi <8 x i32> [ [[INDUCTION7]], %vector.ph ], [ [[VEC_IND_NEXT11:%.*]], %vector.body ]
+; CHECK:         [[TMP6:%.*]] = add i64 %index, 0
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP7]], i32 0
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <8 x i32>*
+; CHECK-NEXT:    store <8 x i32> [[VEC_IND10]], <8 x i32>* [[TMP9]], align 4
+; CHECK-NEXT:    %index.next = add i64 %index, 8
+; CHECK:         [[VEC_IND_NEXT11]] = add <8 x i32> [[VEC_IND10]], [[DOTSPLAT9]]
+; CHECK:         br i1 {{.*}}, label %middle.block, label %vector.body
+
+define void @non_primary_iv_loop_inv_trunc(i32* %a, i64 %n, i64 %step) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+  %j = phi i64 [ %j.next, %for.body ], [ 0, %entry ]
+  %tmp0 = getelementptr inbounds i32, i32* %a, i64 %i
+  %tmp1 = trunc i64 %j to i32
+  store i32 %tmp1, i32* %tmp0, align 4
+  %i.next = add nuw nsw i64 %i, 1
+  %j.next = add nuw nsw i64 %j, %step
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}

Added: llvm/trunk/test/Transforms/LoopVectorize/induction.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/induction.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/induction.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/induction.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,896 @@
+; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=2 -S | FileCheck %s
+; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=2 -instcombine -S | FileCheck %s --check-prefix=IND
+; RUN: opt < %s -loop-vectorize -force-vector-interleave=2 -force-vector-width=2 -instcombine -S | FileCheck %s --check-prefix=UNROLL
+; RUN: opt < %s -loop-vectorize -force-vector-interleave=2 -force-vector-width=2 -S | FileCheck %s --check-prefix=UNROLL-NO-IC
+; RUN: opt < %s -loop-vectorize -force-vector-interleave=2 -force-vector-width=4 -enable-interleaved-mem-accesses -instcombine -S | FileCheck %s --check-prefix=INTERLEAVE
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+; Make sure that we can handle multiple integer induction variables.
+;
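+; In C terms the loop is roughly (an illustrative sketch, not part of the
+; original test source):
+;
+;   int count = 190;
+;   for (int i = 0; i < N; i++)
+;     A[i] = count++;
+;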
+; CHECK-LABEL: @multi_int_induction(
+; CHECK:       vector.body:
+; CHECK-NEXT:    %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; CHECK-NEXT:    %vec.ind = phi <2 x i32> [ <i32 190, i32 191>, %vector.ph ], [ %vec.ind.next, %vector.body ]
+; CHECK:         [[TMP3:%.*]] = add i64 %index, 0
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* %A, i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <2 x i32>*
+; CHECK-NEXT:    store <2 x i32> %vec.ind, <2 x i32>* [[TMP6]], align 4
+; CHECK:         %index.next = add i64 %index, 2
+; CHECK-NEXT:    %vec.ind.next = add <2 x i32> %vec.ind, <i32 2, i32 2>
+; CHECK:         br i1 {{.*}}, label %middle.block, label %vector.body
+define void @multi_int_induction(i32* %A, i32 %N) {
+for.body.lr.ph:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ]
+  %count.09 = phi i32 [ 190, %for.body.lr.ph ], [ %inc, %for.body ]
+  %arrayidx2 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+  store i32 %count.09, i32* %arrayidx2, align 4
+  %inc = add nsw i32 %count.09, 1
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp ne i32 %lftr.wideiv, %N
+  br i1 %exitcond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
+
+; Make sure we remove unneeded vectorization of induction variables.
+; In order for instcombine to clean up the vectorized induction variables that
+; we create in the loop vectorizer, we need to perform some form of redundancy
+; elimination to get rid of multiple uses.
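+;
+; In C terms the loop below is roughly (an illustrative sketch):
+;
+;   for (long i = 0; i != n; i++)
+;     a[i + offset] += b * a[i + offset2];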
+
+; IND-LABEL: scalar_use
+
+; IND:     br label %vector.body
+; IND:     vector.body:
+;   Vectorized induction variable.
+; IND-NOT:  insertelement <2 x i64>
+; IND-NOT:  shufflevector <2 x i64>
+; IND:     br {{.*}}, label %vector.body
+
+define void @scalar_use(float* %a, float %b, i64 %offset, i64 %offset2, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %ind.sum = add i64 %iv, %offset
+  %arr.idx = getelementptr inbounds float, float* %a, i64 %ind.sum
+  %l1 = load float, float* %arr.idx, align 4
+  %ind.sum2 = add i64 %iv, %offset2
+  %arr.idx2 = getelementptr inbounds float, float* %a, i64 %ind.sum2
+  %l2 = load float, float* %arr.idx2, align 4
+  %m = fmul fast float %b, %l2
+  %ad = fadd fast float %l1, %m
+  store float %ad, float* %arr.idx, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, %n
+  br i1 %exitcond, label %loopexit, label %for.body
+
+loopexit:
+  ret void
+}
+
+; Make sure we don't create a vector induction phi node that is unused.
+; Scalarize the step vectors instead.
+;
+; for (int i = 0; i < n; ++i)
+;   sum += a[i];
+;
+; CHECK-LABEL: @scalarize_induction_variable_01(
+; CHECK: vector.body:
+; CHECK:   %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; CHECK:   %[[i0:.+]] = add i64 %index, 0
+; CHECK:   getelementptr inbounds i64, i64* %a, i64 %[[i0]]
+;
+; UNROLL-NO-IC-LABEL: @scalarize_induction_variable_01(
+; UNROLL-NO-IC: vector.body:
+; UNROLL-NO-IC:   %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; UNROLL-NO-IC:   %[[i0:.+]] = add i64 %index, 0
+; UNROLL-NO-IC:   %[[i2:.+]] = add i64 %index, 2
+; UNROLL-NO-IC:   getelementptr inbounds i64, i64* %a, i64 %[[i0]]
+; UNROLL-NO-IC:   getelementptr inbounds i64, i64* %a, i64 %[[i2]]
+;
+; IND-LABEL: @scalarize_induction_variable_01(
+; IND:     vector.body:
+; IND:       %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; IND-NOT:   add i64 {{.*}}, 2
+; IND:       getelementptr inbounds i64, i64* %a, i64 %index
+;
+; UNROLL-LABEL: @scalarize_induction_variable_01(
+; UNROLL:     vector.body:
+; UNROLL:       %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; UNROLL-NOT:   add i64 {{.*}}, 4
+; UNROLL:       %[[g1:.+]] = getelementptr inbounds i64, i64* %a, i64 %index
+; UNROLL:       getelementptr inbounds i64, i64* %[[g1]], i64 2
+
+define i64 @scalarize_induction_variable_01(i64 *%a, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+  %sum = phi i64 [ %2, %for.body ], [ 0, %entry ]
+  %0 = getelementptr inbounds i64, i64* %a, i64 %i
+  %1 = load i64, i64* %0, align 8
+  %2 = add i64 %1, %sum
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  %3  = phi i64 [ %2, %for.body ]
+  ret i64 %3
+}
+
+; Make sure we scalarize the step vectors used for the pointer arithmetic. We
+; can't easily simplify vectorized step vectors.
+;
+; float s = 0;
+; for (int i = 0; i < n; i += 8)
+;   s += (a[i] + b[i] + 1.0f);
+;
+; CHECK-LABEL: @scalarize_induction_variable_02(
+; CHECK: vector.body:
+; CHECK:   %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; CHECK:   %offset.idx = mul i64 %index, 8
+; CHECK:   %[[i0:.+]] = add i64 %offset.idx, 0
+; CHECK:   %[[i1:.+]] = add i64 %offset.idx, 8
+; CHECK:   getelementptr inbounds float, float* %a, i64 %[[i0]]
+; CHECK:   getelementptr inbounds float, float* %a, i64 %[[i1]]
+; CHECK:   getelementptr inbounds float, float* %b, i64 %[[i0]]
+; CHECK:   getelementptr inbounds float, float* %b, i64 %[[i1]]
+;
+; UNROLL-NO-IC-LABEL: @scalarize_induction_variable_02(
+; UNROLL-NO-IC: vector.body:
+; UNROLL-NO-IC:   %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; UNROLL-NO-IC:   %offset.idx = mul i64 %index, 8
+; UNROLL-NO-IC:   %[[i0:.+]] = add i64 %offset.idx, 0
+; UNROLL-NO-IC:   %[[i1:.+]] = add i64 %offset.idx, 8
+; UNROLL-NO-IC:   %[[i2:.+]] = add i64 %offset.idx, 16
+; UNROLL-NO-IC:   %[[i3:.+]] = add i64 %offset.idx, 24
+; UNROLL-NO-IC:   getelementptr inbounds float, float* %a, i64 %[[i0]]
+; UNROLL-NO-IC:   getelementptr inbounds float, float* %a, i64 %[[i1]]
+; UNROLL-NO-IC:   getelementptr inbounds float, float* %a, i64 %[[i2]]
+; UNROLL-NO-IC:   getelementptr inbounds float, float* %a, i64 %[[i3]]
+; UNROLL-NO-IC:   getelementptr inbounds float, float* %b, i64 %[[i0]]
+; UNROLL-NO-IC:   getelementptr inbounds float, float* %b, i64 %[[i1]]
+; UNROLL-NO-IC:   getelementptr inbounds float, float* %b, i64 %[[i2]]
+; UNROLL-NO-IC:   getelementptr inbounds float, float* %b, i64 %[[i3]]
+;
+; IND-LABEL: @scalarize_induction_variable_02(
+; IND: vector.body:
+; IND:   %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; IND:   %[[i0:.+]] = shl i64 %index, 3
+; IND:   %[[i1:.+]] = or i64 %[[i0]], 8
+; IND:   getelementptr inbounds float, float* %a, i64 %[[i0]]
+; IND:   getelementptr inbounds float, float* %a, i64 %[[i1]]
+;
+; UNROLL-LABEL: @scalarize_induction_variable_02(
+; UNROLL: vector.body:
+; UNROLL:   %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; UNROLL:   %[[i0:.+]] = shl i64 %index, 3
+; UNROLL:   %[[i1:.+]] = or i64 %[[i0]], 8
+; UNROLL:   %[[i2:.+]] = or i64 %[[i0]], 16
+; UNROLL:   %[[i3:.+]] = or i64 %[[i0]], 24
+; UNROLL:   getelementptr inbounds float, float* %a, i64 %[[i0]]
+; UNROLL:   getelementptr inbounds float, float* %a, i64 %[[i1]]
+; UNROLL:   getelementptr inbounds float, float* %a, i64 %[[i2]]
+; UNROLL:   getelementptr inbounds float, float* %a, i64 %[[i3]]
+
+define float @scalarize_induction_variable_02(float* %a, float* %b, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %s = phi float [ 0.0, %entry ], [ %6, %for.body ]
+  %0 = getelementptr inbounds float, float* %a, i64 %i
+  %1 = load float, float* %0, align 4
+  %2 = getelementptr inbounds float, float* %b, i64 %i
+  %3 = load float, float* %2, align 4
+  %4 = fadd fast float %s, 1.0
+  %5 = fadd fast float %4, %1
+  %6 = fadd fast float %5, %3
+  %i.next = add nuw nsw i64 %i, 8
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  %s.lcssa = phi float [ %6, %for.body ]
+  ret float %s.lcssa
+}
+
+; Make sure we scalarize the step vectors used for the pointer arithmetic. We
+; can't easily simplify vectorized step vectors. (Interleaved accesses.)
+;
+; for (int i = 0; i < n; ++i)
+;   a[i].f ^= y;
+;
+; INTERLEAVE-LABEL: @scalarize_induction_variable_03(
+; INTERLEAVE: vector.body:
+; INTERLEAVE:   %[[i0:.+]] = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; INTERLEAVE:   %[[i1:.+]] = or i64 %[[i0]], 1
+; INTERLEAVE:   %[[i2:.+]] = or i64 %[[i0]], 2
+; INTERLEAVE:   %[[i3:.+]] = or i64 %[[i0]], 3
+; INTERLEAVE:   %[[i4:.+]] = or i64 %[[i0]], 4
+; INTERLEAVE:   %[[i5:.+]] = or i64 %[[i0]], 5
+; INTERLEAVE:   %[[i6:.+]] = or i64 %[[i0]], 6
+; INTERLEAVE:   %[[i7:.+]] = or i64 %[[i0]], 7
+; INTERLEAVE:   getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %[[i0]], i32 1
+; INTERLEAVE:   getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %[[i1]], i32 1
+; INTERLEAVE:   getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %[[i2]], i32 1
+; INTERLEAVE:   getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %[[i3]], i32 1
+; INTERLEAVE:   getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %[[i4]], i32 1
+; INTERLEAVE:   getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %[[i5]], i32 1
+; INTERLEAVE:   getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %[[i6]], i32 1
+; INTERLEAVE:   getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %[[i7]], i32 1
+
+%pair.i32 = type { i32, i32 }
+define void @scalarize_induction_variable_03(%pair.i32 *%p, i32 %y, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i  = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+  %f = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 1
+  %0 = load i32, i32* %f, align 8
+  %1 = xor i32 %0, %y
+  store i32 %1, i32* %f, align 8
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
+
+; Make sure we scalarize the step vectors used for the pointer arithmetic. We
+; can't easily simplify vectorized step vectors. (Interleaved accesses.)
+;
+; for (int i = 0; i < n; ++i)
+;   p[i].f = a[i * 4];
+;
+; INTERLEAVE-LABEL: @scalarize_induction_variable_04(
+; INTERLEAVE: vector.body:
+; INTERLEAVE:   %[[i0:.+]] = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; INTERLEAVE:   %[[i1:.+]] = or i64 %[[i0]], 1
+; INTERLEAVE:   %[[i2:.+]] = or i64 %[[i0]], 2
+; INTERLEAVE:   %[[i3:.+]] = or i64 %[[i0]], 3
+; INTERLEAVE:   %[[i4:.+]] = or i64 %[[i0]], 4
+; INTERLEAVE:   %[[i5:.+]] = or i64 %[[i0]], 5
+; INTERLEAVE:   %[[i6:.+]] = or i64 %[[i0]], 6
+; INTERLEAVE:   %[[i7:.+]] = or i64 %[[i0]], 7
+; INTERLEAVE:   getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %[[i0]], i32 1
+; INTERLEAVE:   getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %[[i1]], i32 1
+; INTERLEAVE:   getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %[[i2]], i32 1
+; INTERLEAVE:   getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %[[i3]], i32 1
+; INTERLEAVE:   getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %[[i4]], i32 1
+; INTERLEAVE:   getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %[[i5]], i32 1
+; INTERLEAVE:   getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %[[i6]], i32 1
+; INTERLEAVE:   getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %[[i7]], i32 1
+
+define void @scalarize_induction_variable_04(i32* %a, %pair.i32* %p, i32 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry]
+  %0 = shl nsw i64 %i, 2
+  %1 = getelementptr inbounds i32, i32* %a, i64 %0
+  %2 = load i32, i32* %1, align 1
+  %3 = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 1
+  store i32 %2, i32* %3, align 1
+  %i.next = add nuw nsw i64 %i, 1
+  %4 = trunc i64 %i.next to i32
+  %cond = icmp eq i32 %4, %n
+  br i1 %cond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+; PR30542. Ensure we generate all the scalar steps for the induction variable.
+; The scalar induction variable is used by a getelementptr instruction
+; (uniform), and a udiv (non-uniform).
+;
+; int sum = 0;
+; for (int i = 0; i < n; ++i) {
+;   int x = a[i];
+;   if (c)
+;     x /= i;
+;   sum += x;
+; }
+;
+; CHECK-LABEL: @scalarize_induction_variable_05(
+; CHECK: vector.body:
+; CHECK:   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %pred.udiv.continue{{[0-9]+}} ]
+; CHECK:   %[[I0:.+]] = add i32 %index, 0
+; CHECK:   getelementptr inbounds i32, i32* %a, i32 %[[I0]]
+; CHECK: pred.udiv.if:
+; CHECK:   udiv i32 {{.*}}, %[[I0]]
+; CHECK: pred.udiv.if{{[0-9]+}}:
+; CHECK:   %[[I1:.+]] = add i32 %index, 1
+; CHECK:   udiv i32 {{.*}}, %[[I1]]
+;
+; UNROLL-NO-IC-LABEL: @scalarize_induction_variable_05(
+; UNROLL-NO-IC: vector.body:
+; UNROLL-NO-IC:   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %pred.udiv.continue{{[0-9]+}} ]
+; UNROLL-NO-IC:   %[[I0:.+]] = add i32 %index, 0
+; UNROLL-NO-IC:   %[[I2:.+]] = add i32 %index, 2
+; UNROLL-NO-IC:   getelementptr inbounds i32, i32* %a, i32 %[[I0]]
+; UNROLL-NO-IC:   getelementptr inbounds i32, i32* %a, i32 %[[I2]]
+; UNROLL-NO-IC: pred.udiv.if:
+; UNROLL-NO-IC:   udiv i32 {{.*}}, %[[I0]]
+; UNROLL-NO-IC: pred.udiv.if{{[0-9]+}}:
+; UNROLL-NO-IC:   %[[I1:.+]] = add i32 %index, 1
+; UNROLL-NO-IC:   udiv i32 {{.*}}, %[[I1]]
+; UNROLL-NO-IC: pred.udiv.if{{[0-9]+}}:
+; UNROLL-NO-IC:   udiv i32 {{.*}}, %[[I2]]
+; UNROLL-NO-IC: pred.udiv.if{{[0-9]+}}:
+; UNROLL-NO-IC:   %[[I3:.+]] = add i32 %index, 3
+; UNROLL-NO-IC:   udiv i32 {{.*}}, %[[I3]]
+;
+; IND-LABEL: @scalarize_induction_variable_05(
+; IND: vector.body:
+; IND:   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %pred.udiv.continue{{[0-9]+}} ]
+; IND:   %[[E0:.+]] = sext i32 %index to i64
+; IND:   getelementptr inbounds i32, i32* %a, i64 %[[E0]]
+; IND: pred.udiv.if:
+; IND:   udiv i32 {{.*}}, %index
+; IND: pred.udiv.if{{[0-9]+}}:
+; IND:   %[[I1:.+]] = or i32 %index, 1
+; IND:   udiv i32 {{.*}}, %[[I1]]
+;
+; UNROLL-LABEL: @scalarize_induction_variable_05(
+; UNROLL: vector.body:
+; UNROLL:   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %pred.udiv.continue{{[0-9]+}} ]
+; UNROLL:   %[[I2:.+]] = or i32 %index, 2
+; UNROLL:   %[[E0:.+]] = sext i32 %index to i64
+; UNROLL:   %[[G0:.+]] = getelementptr inbounds i32, i32* %a, i64 %[[E0]]
+; UNROLL:   getelementptr inbounds i32, i32* %[[G0]], i64 2
+; UNROLL: pred.udiv.if:
+; UNROLL:   udiv i32 {{.*}}, %index
+; UNROLL: pred.udiv.if{{[0-9]+}}:
+; UNROLL:   %[[I1:.+]] = or i32 %index, 1
+; UNROLL:   udiv i32 {{.*}}, %[[I1]]
+; UNROLL: pred.udiv.if{{[0-9]+}}:
+; UNROLL:   udiv i32 {{.*}}, %[[I2]]
+; UNROLL: pred.udiv.if{{[0-9]+}}:
+; UNROLL:   %[[I3:.+]] = or i32 %index, 3
+; UNROLL:   udiv i32 {{.*}}, %[[I3]]
+
+define i32 @scalarize_induction_variable_05(i32* %a, i32 %x, i1 %c, i32 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i32 [ 0, %entry ], [ %i.next, %if.end ]
+  %sum = phi i32 [ 0, %entry ], [ %tmp4, %if.end ]
+  %tmp0 = getelementptr inbounds i32, i32* %a, i32 %i
+  %tmp1 = load i32, i32* %tmp0, align 4
+  br i1 %c, label %if.then, label %if.end
+
+if.then:
+  %tmp2 = udiv i32 %tmp1, %i
+  br label %if.end
+
+if.end:
+  %tmp3 = phi i32 [ %tmp2, %if.then ], [ %tmp1, %for.body ]
+  %tmp4 = add i32 %tmp3, %sum
+  %i.next = add nuw nsw i32 %i, 1
+  %cond = icmp slt i32 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  %tmp5  = phi i32 [ %tmp4, %if.end ]
+  ret i32 %tmp5
+}
+
+; Ensure we generate both a vector and a scalar induction variable. In this
+; test, the induction variable is used by an instruction that will be
+; vectorized (trunc) as well as an instruction that will remain in scalar form
+; (getelementptr).
+;
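+; In C terms (an illustrative sketch; the field name .y is hypothetical and
+; stands for the second i16 member of the pair):
+;
+;   for (long i = 0; (int)(i + 1) != n; i++)
+;     p[i].y = (short)(a + (int)i);
+;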
+; CHECK-LABEL: @iv_vector_and_scalar_users(
+; CHECK: vector.body:
+; CHECK:   %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; CHECK:   %vec.ind = phi <2 x i64> [ <i64 0, i64 1>, %vector.ph ], [ %vec.ind.next, %vector.body ]
+; CHECK:   %vec.ind1 = phi <2 x i32> [ <i32 0, i32 1>, %vector.ph ], [ %vec.ind.next2, %vector.body ]
+; CHECK:   %[[i0:.+]] = add i64 %index, 0
+; CHECK:   %[[i1:.+]] = add i64 %index, 1
+; CHECK:   getelementptr inbounds %pair.i16, %pair.i16* %p, i64 %[[i0]], i32 1
+; CHECK:   getelementptr inbounds %pair.i16, %pair.i16* %p, i64 %[[i1]], i32 1
+; CHECK:   %index.next = add i64 %index, 2
+; CHECK:   %vec.ind.next = add <2 x i64> %vec.ind, <i64 2, i64 2>
+; CHECK:   %vec.ind.next2 = add <2 x i32> %vec.ind1, <i32 2, i32 2>
+;
+; IND-LABEL: @iv_vector_and_scalar_users(
+; IND: vector.body:
+; IND:   %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; IND:   %vec.ind1 = phi <2 x i32> [ <i32 0, i32 1>, %vector.ph ], [ %vec.ind.next2, %vector.body ]
+; IND:   %[[i1:.+]] = or i64 %index, 1
+; IND:   getelementptr inbounds %pair.i16, %pair.i16* %p, i64 %index, i32 1
+; IND:   getelementptr inbounds %pair.i16, %pair.i16* %p, i64 %[[i1]], i32 1
+; IND:   %index.next = add i64 %index, 2
+; IND:   %vec.ind.next2 = add <2 x i32> %vec.ind1, <i32 2, i32 2>
+;
+; UNROLL-LABEL: @iv_vector_and_scalar_users(
+; UNROLL: vector.body:
+; UNROLL:   %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; UNROLL:   %vec.ind2 = phi <2 x i32> [ <i32 0, i32 1>, %vector.ph ], [ %vec.ind.next5, %vector.body ]
+; UNROLL:   %[[i1:.+]] = or i64 %index, 1
+; UNROLL:   %[[i2:.+]] = or i64 %index, 2
+; UNROLL:   %[[i3:.+]] = or i64 %index, 3
+; UNROLL:   %step.add3 = add <2 x i32> %vec.ind2, <i32 2, i32 2>
+; UNROLL:   getelementptr inbounds %pair.i16, %pair.i16* %p, i64 %index, i32 1
+; UNROLL:   getelementptr inbounds %pair.i16, %pair.i16* %p, i64 %[[i1]], i32 1
+; UNROLL:   getelementptr inbounds %pair.i16, %pair.i16* %p, i64 %[[i2]], i32 1
+; UNROLL:   getelementptr inbounds %pair.i16, %pair.i16* %p, i64 %[[i3]], i32 1
+; UNROLL:   %index.next = add i64 %index, 4
+; UNROLL:   %vec.ind.next5 = add <2 x i32> %vec.ind2, <i32 4, i32 4>
+
+%pair.i16 = type { i16, i16 }
+define void @iv_vector_and_scalar_users(%pair.i16* %p, i32 %a, i32 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+  %0 = trunc i64 %i to i32
+  %1 = add i32 %a, %0
+  %2 = trunc i32 %1 to i16
+  %3 = getelementptr inbounds %pair.i16, %pair.i16* %p, i64 %i, i32 1
+  store i16 %2, i16* %3, align 2
+  %i.next = add nuw nsw i64 %i, 1
+  %4 = trunc i64 %i.next to i32
+  %cond = icmp eq i32 %4, %n
+  br i1 %cond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+; Make sure that the loop exit count computation does not overflow for i8 and
+; i16. The exit count of these loops is i8/i16 max + 1. If we don't cast the
+; induction variable to a bigger type the exit count computation will overflow
+; to 0.
+; PR17532
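+;
+; As a concrete illustration (not from the original test): an i8 counter that
+; starts at 0 and is decremented each iteration only returns to 0 after 256
+; iterations, and 256 is not representable in i8:
+;
+;   unsigned char b = 0;
+;   do { /* body */ } while (--b != 0);   /* executes 256 times */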
+
+; CHECK-LABEL: i8_loop
+; CHECK: icmp eq i32 {{.*}}, 256
+define i32 @i8_loop() nounwind readnone ssp uwtable {
+  br label %1
+
+; <label>:1                                       ; preds = %1, %0
+  %a.0 = phi i32 [ 1, %0 ], [ %2, %1 ]
+  %b.0 = phi i8 [ 0, %0 ], [ %3, %1 ]
+  %2 = and i32 %a.0, 4
+  %3 = add i8 %b.0, -1
+  %4 = icmp eq i8 %3, 0
+  br i1 %4, label %5, label %1
+
+; <label>:5                                       ; preds = %1
+  ret i32 %2
+}
+
+; CHECK-LABEL: i16_loop
+; CHECK: icmp eq i32 {{.*}}, 65536
+
+define i32 @i16_loop() nounwind readnone ssp uwtable {
+  br label %1
+
+; <label>:1                                       ; preds = %1, %0
+  %a.0 = phi i32 [ 1, %0 ], [ %2, %1 ]
+  %b.0 = phi i16 [ 0, %0 ], [ %3, %1 ]
+  %2 = and i32 %a.0, 4
+  %3 = add i16 %b.0, -1
+  %4 = icmp eq i16 %3, 0
+  br i1 %4, label %5, label %1
+
+; <label>:5                                       ; preds = %1
+  ret i32 %2
+}
+
+; This loop has a backedge taken count of i32_max. We need to check for this
+; condition and branch directly to the scalar loop.
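+;
+; This is the i32 version of the wrap-around pattern above (an illustrative
+; sketch): the trip count is backedge-taken-count + 1 = 2^32, which wraps to 0
+; when computed in i32:
+;
+;   unsigned b = 0;
+;   do { /* body */ } while (--b != 0);   /* 2^32 iterations */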
+
+; CHECK-LABEL: max_i32_backedgetaken
+; CHECK:  br i1 true, label %scalar.ph, label %vector.ph
+
+; CHECK: middle.block:
+; CHECK:  %[[v9:.+]] = extractelement <2 x i32> %bin.rdx, i32 0
+; CHECK: scalar.ph:
+; CHECK:  %bc.resume.val = phi i32 [ 0, %middle.block ], [ 0, %[[v0:.+]] ]
+; CHECK:  %bc.merge.rdx = phi i32 [ 1, %[[v0:.+]] ], [ %[[v9]], %middle.block ]
+
+define i32 @max_i32_backedgetaken() nounwind readnone ssp uwtable {
+
+  br label %1
+
+; <label>:1                                       ; preds = %1, %0
+  %a.0 = phi i32 [ 1, %0 ], [ %2, %1 ]
+  %b.0 = phi i32 [ 0, %0 ], [ %3, %1 ]
+  %2 = and i32 %a.0, 4
+  %3 = add i32 %b.0, -1
+  %4 = icmp eq i32 %3, 0
+  br i1 %4, label %5, label %1
+
+; <label>:5                                       ; preds = %1
+  ret i32 %2
+}
+
+; When generating the overflow check we must make sure that the induction
+; start value is defined before the branch to the scalar preheader.
+
+; CHECK-LABEL: testoverflowcheck
+; CHECK: entry
+; CHECK: %[[LOAD:.*]] = load i8
+; CHECK: br
+
+; CHECK: scalar.ph
+; CHECK: phi i8 [ %{{.*}}, %middle.block ], [ %[[LOAD]], %entry ]
+
+ at e = global i8 1, align 1
+ at d = common global i32 0, align 4
+ at c = common global i32 0, align 4
+define i32 @testoverflowcheck() {
+entry:
+  %.pr.i = load i8, i8* @e, align 1
+  %0 = load i32, i32* @d, align 4
+  %c.promoted.i = load i32, i32* @c, align 4
+  br label %cond.end.i
+
+cond.end.i:
+  %inc4.i = phi i8 [ %.pr.i, %entry ], [ %inc.i, %cond.end.i ]
+  %and3.i = phi i32 [ %c.promoted.i, %entry ], [ %and.i, %cond.end.i ]
+  %and.i = and i32 %0, %and3.i
+  %inc.i = add i8 %inc4.i, 1
+  %tobool.i = icmp eq i8 %inc.i, 0
+  br i1 %tobool.i, label %loopexit, label %cond.end.i
+
+loopexit:
+  ret i32 %and.i
+}
+
+; The SCEV expression of %sphi is (zext i8 {%t,+,1}<%loop> to i32)
+; In order to recognize %sphi as an induction PHI and vectorize this loop,
+; we need to convert the SCEV expression into an AddRecExpr.
+; The expression gets converted to {zext i8 %t to i32,+,1}.
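+;
+; Schematically, in the SCEV notation used above (the equality only holds when
+; the narrow recurrence does not wrap, which is what vector.scevcheck verifies
+; at runtime):
+;
+;   zext i8 {%t,+,1} to i32  ==>  {(zext i8 %t to i32),+,1}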
+
+; CHECK-LABEL: wrappingindvars1
+; CHECK-LABEL: vector.scevcheck
+; CHECK-LABEL: vector.ph
+; CHECK: %[[START:.*]] = add <2 x i32> %{{.*}}, <i32 0, i32 1>
+; CHECK-LABEL: vector.body
+; CHECK: %[[PHI:.*]] = phi <2 x i32> [ %[[START]], %vector.ph ], [ %[[STEP:.*]], %vector.body ]
+; CHECK: %[[STEP]] = add <2 x i32> %[[PHI]], <i32 2, i32 2>
+define void @wrappingindvars1(i8 %t, i32 %len, i32 *%A) {
+ entry:
+  %st = zext i8 %t to i16
+  %ext = zext i8 %t to i32
+  %ecmp = icmp ult i16 %st, 42
+  br i1 %ecmp, label %loop, label %exit
+
+ loop:
+
+  %idx = phi i8 [ %t, %entry ], [ %idx.inc, %loop ]
+  %idx.b = phi i32 [ 0, %entry ], [ %idx.b.inc, %loop ]
+  %sphi = phi i32 [ %ext, %entry ], [%idx.inc.ext, %loop]
+
+  %ptr = getelementptr inbounds i32, i32* %A, i8 %idx
+  store i32 %sphi, i32* %ptr
+
+  %idx.inc = add i8 %idx, 1
+  %idx.inc.ext = zext i8 %idx.inc to i32
+  %idx.b.inc = add nuw nsw i32 %idx.b, 1
+
+  %c = icmp ult i32 %idx.b, %len
+  br i1 %c, label %loop, label %exit
+
+ exit:
+  ret void
+}
+
+; The SCEV expression of %sphi is (4 * (zext i8 {%t,+,1}<%loop> to i32))
+; In order to recognize %sphi as an induction PHI and vectorize this loop,
+; we need to convert the SCEV expression into an AddRecExpr.
+; The expression gets converted to ({4 * (zext %t to i32),+,4}).
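+;
+; Schematically, as above (again relying on the runtime vector.scevcheck):
+;
+;   4 * (zext i8 {%t,+,1} to i32)  ==>  {4 * (zext i8 %t to i32),+,4}
+;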
+; CHECK-LABEL: wrappingindvars2
+; CHECK-LABEL: vector.scevcheck
+; CHECK-LABEL: vector.ph
+; CHECK: %[[START:.*]] = add <2 x i32> %{{.*}}, <i32 0, i32 4>
+; CHECK-LABEL: vector.body
+; CHECK: %[[PHI:.*]] = phi <2 x i32> [ %[[START]], %vector.ph ], [ %[[STEP:.*]], %vector.body ]
+; CHECK: %[[STEP]] = add <2 x i32> %[[PHI]], <i32 8, i32 8>
+define void @wrappingindvars2(i8 %t, i32 %len, i32 *%A) {
+
+entry:
+  %st = zext i8 %t to i16
+  %ext = zext i8 %t to i32
+  %ext.mul = mul i32 %ext, 4
+
+  %ecmp = icmp ult i16 %st, 42
+  br i1 %ecmp, label %loop, label %exit
+
+ loop:
+
+  %idx = phi i8 [ %t, %entry ], [ %idx.inc, %loop ]
+  %sphi = phi i32 [ %ext.mul, %entry ], [%mul, %loop]
+  %idx.b = phi i32 [ 0, %entry ], [ %idx.b.inc, %loop ]
+
+  %ptr = getelementptr inbounds i32, i32* %A, i8 %idx
+  store i32 %sphi, i32* %ptr
+
+  %idx.inc = add i8 %idx, 1
+  %idx.inc.ext = zext i8 %idx.inc to i32
+  %mul = mul i32 %idx.inc.ext, 4
+  %idx.b.inc = add nuw nsw i32 %idx.b, 1
+
+  %c = icmp ult i32 %idx.b, %len
+  br i1 %c, label %loop, label %exit
+
+ exit:
+  ret void
+}
+
+; Check that we generate vectorized IVs in the pre-header
+; instead of widening the scalar IV inside the loop, when
+; we know how to do that.
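+; In C terms the loop below is roughly a[i] = i for 0 <= i < k (an
+; illustrative sketch):
+;
+;   for (int i = 0; i != k; i++)
+;     a[i] = i;
+;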
+; IND-LABEL: veciv
+; IND: vector.body:
+; IND: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; IND: %vec.ind = phi <2 x i32> [ <i32 0, i32 1>, %vector.ph ], [ %vec.ind.next, %vector.body ]
+; IND: %index.next = add i32 %index, 2
+; IND: %vec.ind.next = add <2 x i32> %vec.ind, <i32 2, i32 2>
+; IND: %[[CMP:.*]] = icmp eq i32 %index.next
+; IND: br i1 %[[CMP]]
+; UNROLL-LABEL: veciv
+; UNROLL: vector.body:
+; UNROLL: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; UNROLL: %vec.ind = phi <2 x i32> [ <i32 0, i32 1>, %vector.ph ], [ %vec.ind.next, %vector.body ]
+; UNROLL: %step.add = add <2 x i32> %vec.ind, <i32 2, i32 2>
+; UNROLL: %index.next = add i32 %index, 4
+; UNROLL: %vec.ind.next = add <2 x i32> %vec.ind, <i32 4, i32 4>
+; UNROLL: %[[CMP:.*]] = icmp eq i32 %index.next
+; UNROLL: br i1 %[[CMP]]
+define void @veciv(i32* nocapture %a, i32 %start, i32 %k) {
+for.body.preheader:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i32 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i32 %indvars.iv
+  store i32 %indvars.iv, i32* %arrayidx, align 4
+  %indvars.iv.next = add nuw nsw i32 %indvars.iv, 1
+  %exitcond = icmp eq i32 %indvars.iv.next, %k
+  br i1 %exitcond, label %exit, label %for.body
+
+exit:
+  ret void
+}
+
+; IND-LABEL: trunciv
+; IND: vector.body:
+; IND: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; IND: %[[VECIND:.*]] = phi <2 x i32> [ <i32 0, i32 1>, %vector.ph ], [ %[[STEPADD:.*]], %vector.body ]
+; IND: %index.next = add i64 %index, 2
+; IND: %[[STEPADD]] = add <2 x i32> %[[VECIND]], <i32 2, i32 2>
+; IND: %[[CMP:.*]] = icmp eq i64 %index.next
+; IND: br i1 %[[CMP]]
+define void @trunciv(i32* nocapture %a, i32 %start, i64 %k) {
+for.body.preheader:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
+  %trunc.iv = trunc i64 %indvars.iv to i32
+  %arrayidx = getelementptr inbounds i32, i32* %a, i32 %trunc.iv
+  store i32 %trunc.iv, i32* %arrayidx, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %k
+  br i1 %exitcond, label %exit, label %for.body
+
+exit:
+  ret void
+}
+
+; CHECK-LABEL: @nonprimary(
+; CHECK: vector.ph:
+; CHECK:   %[[INSERT:.*]] = insertelement <2 x i32> undef, i32 %i, i32 0
+; CHECK:   %[[SPLAT:.*]] = shufflevector <2 x i32> %[[INSERT]], <2 x i32> undef, <2 x i32> zeroinitializer
+; CHECK:   %[[START:.*]] = add <2 x i32> %[[SPLAT]], <i32 0, i32 1>
+; CHECK: vector.body:
+; CHECK:   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; CHECK:   %vec.ind = phi <2 x i32> [ %[[START]], %vector.ph ], [ %vec.ind.next, %vector.body ]
+; CHECK:   %offset.idx = add i32 %i, %index
+; CHECK:   %[[A1:.*]] = add i32 %offset.idx, 0
+; CHECK:   %[[G1:.*]] = getelementptr inbounds i32, i32* %a, i32 %[[A1]]
+; CHECK:   %[[G3:.*]] = getelementptr inbounds i32, i32* %[[G1]], i32 0
+; CHECK:   %[[B1:.*]] = bitcast i32* %[[G3]] to <2 x i32>*
+; CHECK:   store <2 x i32> %vec.ind, <2 x i32>* %[[B1]]
+; CHECK:   %index.next = add i32 %index, 2
+; CHECK:   %vec.ind.next = add <2 x i32> %vec.ind, <i32 2, i32 2>
+; CHECK:   %[[CMP:.*]] = icmp eq i32 %index.next, %n.vec
+; CHECK:   br i1 %[[CMP]]
+;
+; IND-LABEL: @nonprimary(
+; IND: vector.ph:
+; IND:   %[[INSERT:.*]] = insertelement <2 x i32> undef, i32 %i, i32 0
+; IND:   %[[SPLAT:.*]] = shufflevector <2 x i32> %[[INSERT]], <2 x i32> undef, <2 x i32> zeroinitializer
+; IND:   %[[START:.*]] = add <2 x i32> %[[SPLAT]], <i32 0, i32 1>
+; IND: vector.body:
+; IND:   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; IND:   %vec.ind = phi <2 x i32> [ %[[START]], %vector.ph ], [ %vec.ind.next, %vector.body ]
+; IND:   %[[A1:.*]] = add i32 %index, %i
+; IND:   %[[S1:.*]] = sext i32 %[[A1]] to i64
+; IND:   %[[G1:.*]] = getelementptr inbounds i32, i32* %a, i64 %[[S1]]
+; IND:   %[[B1:.*]] = bitcast i32* %[[G1]] to <2 x i32>*
+; IND:   store <2 x i32> %vec.ind, <2 x i32>* %[[B1]]
+; IND:   %index.next = add i32 %index, 2
+; IND:   %vec.ind.next = add <2 x i32> %vec.ind, <i32 2, i32 2>
+; IND:   %[[CMP:.*]] = icmp eq i32 %index.next, %n.vec
+; IND:   br i1 %[[CMP]]
+;
+; UNROLL-LABEL: @nonprimary(
+; UNROLL: vector.ph:
+; UNROLL:   %[[INSERT:.*]] = insertelement <2 x i32> undef, i32 %i, i32 0
+; UNROLL:   %[[SPLAT:.*]] = shufflevector <2 x i32> %[[INSERT]], <2 x i32> undef, <2 x i32> zeroinitializer
+; UNROLL:   %[[START:.*]] = add <2 x i32> %[[SPLAT]], <i32 0, i32 1>
+; UNROLL: vector.body:
+; UNROLL:   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; UNROLL:   %vec.ind = phi <2 x i32> [ %[[START]], %vector.ph ], [ %vec.ind.next, %vector.body ]
+; UNROLL:   %step.add = add <2 x i32> %vec.ind, <i32 2, i32 2>
+; UNROLL:   %[[A1:.*]] = add i32 %index, %i
+; UNROLL:   %[[S1:.*]] = sext i32 %[[A1]] to i64
+; UNROLL:   %[[G1:.*]] = getelementptr inbounds i32, i32* %a, i64 %[[S1]]
+; UNROLL:   %[[B1:.*]] = bitcast i32* %[[G1]] to <2 x i32>*
+; UNROLL:   store <2 x i32> %vec.ind, <2 x i32>* %[[B1]]
+; UNROLL:   %[[G2:.*]] = getelementptr inbounds i32, i32* %[[G1]], i64 2
+; UNROLL:   %[[B2:.*]] = bitcast i32* %[[G2]] to <2 x i32>*
+; UNROLL:   store <2 x i32> %step.add, <2 x i32>* %[[B2]]
+; UNROLL:   %index.next = add i32 %index, 4
+; UNROLL:   %vec.ind.next = add <2 x i32> %vec.ind, <i32 4, i32 4>
+; UNROLL:   %[[CMP:.*]] = icmp eq i32 %index.next, %n.vec
+; UNROLL:   br i1 %[[CMP]]
+define void @nonprimary(i32* nocapture %a, i32 %start, i32 %i, i32 %k) {
+for.body.preheader:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i32 [ %indvars.iv.next, %for.body ], [ %i, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i32 %indvars.iv
+  store i32 %indvars.iv, i32* %arrayidx, align 4
+  %indvars.iv.next = add nuw nsw i32 %indvars.iv, 1
+  %exitcond = icmp eq i32 %indvars.iv.next, %k
+  br i1 %exitcond, label %exit, label %for.body
+
+exit:
+  ret void
+}
+
+; CHECK-LABEL: @non_primary_iv_trunc(
+; CHECK:       vector.body:
+; CHECK-NEXT:    %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; CHECK:         [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 0, i32 2>, %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %vector.body ]
+; CHECK:         [[TMP3:%.*]] = add i64 %index, 0
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* %a, i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <2 x i32>*
+; CHECK-NEXT:    store <2 x i32> [[VEC_IND]], <2 x i32>* [[TMP6]], align 4
+; CHECK-NEXT:    %index.next = add i64 %index, 2
+; CHECK:         [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], <i32 4, i32 4>
+; CHECK:         br i1 {{.*}}, label %middle.block, label %vector.body
+define void @non_primary_iv_trunc(i32* %a, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+  %j = phi i64 [ %j.next, %for.body ], [ 0, %entry ]
+  %tmp0 = getelementptr inbounds i32, i32* %a, i64 %i
+  %tmp1 = trunc i64 %j to i32
+  store i32 %tmp1, i32* %tmp0, align 4
+  %i.next = add nuw nsw i64 %i, 1
+  %j.next = add nuw nsw i64 %j, 2
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
+
+; PR32419. Ensure we transform truncated non-primary induction variables. In
+; the test case below we replace %tmp1 with a new induction variable. Because
+; the truncated value is non-primary, we must compute an offset from the
+; primary induction variable.
+;
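+; Schematically (an illustrative sketch): the truncated IV is rebuilt from the
+; primary counter as
+;
+;   short tmp1 = (short)(-20 + index);   /* offset from the primary IV */
+;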
+; CHECK-LABEL: @PR32419(
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %[[PRED_UREM_CONTINUE4:.*]] ]
+; CHECK:         [[OFFSET_IDX:%.*]] = add i32 -20, [[INDEX]]
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc i32 [[OFFSET_IDX]] to i16
+; CHECK:         [[TMP8:%.*]] = add i16 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP9:%.*]] = urem i16 %b, [[TMP8]]
+; CHECK:         [[TMP15:%.*]] = add i16 [[TMP1]], 1
+; CHECK-NEXT:    [[TMP16:%.*]] = urem i16 %b, [[TMP15]]
+; CHECK:       [[PRED_UREM_CONTINUE4]]:
+; CHECK:         br i1 {{.*}}, label %middle.block, label %vector.body
+;
+define i32 @PR32419(i32 %a, i16 %b) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i32 [ -20, %entry ], [ %i.next, %for.inc ]
+  %tmp0 = phi i32 [ %a, %entry ], [ %tmp6, %for.inc ]
+  %tmp1 = trunc i32 %i to i16
+  %tmp2 = icmp eq i16 %tmp1, 0
+  br i1 %tmp2, label %for.inc, label %for.cond
+
+for.cond:
+  %tmp3 = urem i16 %b, %tmp1
+  br label %for.inc
+
+for.inc:
+  %tmp4 = phi i16 [ %tmp3, %for.cond ], [ 0, %for.body ]
+  %tmp5 = sext i16 %tmp4 to i32
+  %tmp6 = or i32 %tmp0, %tmp5
+  %i.next = add nsw i32 %i, 1
+  %cond = icmp eq i32 %i.next, 0
+  br i1 %cond, label %for.end, label %for.body
+
+for.end:
+  %tmp7 = phi i32 [ %tmp6, %for.inc ]
+  ret i32 %tmp7
+}
+
+; Ensure that the shuffle vector for the first-order recurrence is inserted
+; correctly after all the phis. These new phis correspond to new IVs that are
+; generated by optimizing non-free truncs of IVs to IVs themselves.
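+;
+; In C terms, %y below is the first-order recurrence (an illustrative sketch):
+;
+;   int y = 42;
+;   for (long i = 1; i != 114; i++) {
+;     use(y);        /* previous iteration's (int)i */
+;     y = (int)i;
+;   }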
+define i64 @trunc_with_first_order_recurrence() {
+; CHECK-LABEL: trunc_with_first_order_recurrence
+; CHECK-LABEL: vector.body:
+; CHECK-NEXT:    %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; CHECK-NEXT:    %vec.phi = phi <2 x i64>
+; CHECK-NEXT:    %vec.ind = phi <2 x i64> [ <i64 1, i64 2>, %vector.ph ], [ %vec.ind.next, %vector.body ]
+; CHECK-NEXT:    %vec.ind2 = phi <2 x i32> [ <i32 1, i32 2>, %vector.ph ], [ %vec.ind.next3, %vector.body ]
+; CHECK-NEXT:    %vector.recur = phi <2 x i32> [ <i32 undef, i32 42>, %vector.ph ], [ %vec.ind5, %vector.body ]
+; CHECK-NEXT:    %vec.ind5 = phi <2 x i32> [ <i32 1, i32 2>, %vector.ph ], [ %vec.ind.next6, %vector.body ]
+; CHECK-NEXT:    %vec.ind7 = phi <2 x i32> [ <i32 1, i32 2>, %vector.ph ], [ %vec.ind.next8, %vector.body ]
+; CHECK-NEXT:    shufflevector <2 x i32> %vector.recur, <2 x i32> %vec.ind5, <2 x i32> <i32 1, i32 2>
+entry:
+  br label %loop
+
+exit:                                        ; preds = %loop
+  %.lcssa = phi i64 [ %c23, %loop ]
+  ret i64 %.lcssa
+
+loop:                                         ; preds = %loop, %entry
+  %c5 = phi i64 [ %c23, %loop ], [ 0, %entry ]
+  %indvars.iv = phi i64 [ %indvars.iv.next, %loop ], [ 1, %entry ]
+  %x = phi i32 [ %c24, %loop ], [ 1, %entry ]
+  %y = phi i32 [ %c6, %loop ], [ 42, %entry ]
+  %c6 = trunc i64 %indvars.iv to i32
+  %c8 = mul i32 %x, %c6
+  %c9 = add i32 %c8, 42
+  %c10 = add i32 %y, %c6
+  %c11 = add i32 %c10, %c9
+  %c12 = sext i32 %c11 to i64
+  %c13 = add i64 %c5, %c12
+  %indvars.iv.tr = trunc i64 %indvars.iv to i32
+  %c14 = shl i32 %indvars.iv.tr, 1
+  %c15 = add i32 %c9, %c14
+  %c16 = sext i32 %c15 to i64
+  %c23 = add i64 %c13, %c16
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %c24 = add nuw nsw i32 %x, 1
+  %exitcond.i = icmp eq i64 %indvars.iv.next, 114
+  br i1 %exitcond.i, label %exit, label %loop
+
+}

Added: llvm/trunk/test/Transforms/LoopVectorize/induction_plus.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/induction_plus.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/induction_plus.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/induction_plus.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,34 @@
+; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+ at array = common global [1024 x i32] zeroinitializer, align 16
+
+;CHECK-LABEL: @array_at_plus_one(
+;CHECK: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+;CHECK: %vec.ind = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %vector.ph ], [ %vec.ind.next, %vector.body ]
+;CHECK: %vec.ind1 = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %vector.ph ], [ %vec.ind.next2, %vector.body ]
+;CHECK: %[[T1:.+]] = add i64 %index, 0
+;CHECK: %[[T2:.+]] = add nsw i64 %[[T1]], 12
+;CHECK: getelementptr inbounds [1024 x i32], [1024 x i32]* @array, i64 0, i64 %[[T2]]
+;CHECK: %vec.ind.next = add <4 x i64> %vec.ind, <i64 4, i64 4, i64 4, i64 4>
+;CHECK: %vec.ind.next2 = add <4 x i32> %vec.ind1, <i32 4, i32 4, i32 4, i32 4>
+;CHECK: ret i32
+define i32 @array_at_plus_one(i32 %n) nounwind uwtable ssp {
+  %1 = icmp sgt i32 %n, 0
+  br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph:                                           ; preds = %0, %.lr.ph
+  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
+  %2 = add nsw i64 %indvars.iv, 12
+  %3 = getelementptr inbounds [1024 x i32], [1024 x i32]* @array, i64 0, i64 %2
+  %4 = trunc i64 %indvars.iv to i32
+  store i32 %4, i32* %3, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge:                                      ; preds = %.lr.ph, %0
+  ret i32 undef
+}

Added: llvm/trunk/test/Transforms/LoopVectorize/infiniteloop.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/infiniteloop.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/infiniteloop.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/infiniteloop.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,34 @@
+; RUN: opt -S -indvars -loop-vectorize -force-vector-width=2  < %s | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32-n8:16:32-S128"
+
+ at a = common global i64 0, align 8
+ at x = common global i32 0, align 4
+
+; We used to assert on this loop because we could not find an induction
+; variable but assumed there must be one. Scalar evolution returned an exit
+; count for the loop below, and from there on we assumed that there must be an
+; induction variable. This is not a valid assumption:
+;   // getExitCount - Get the expression for the number of loop iterations for
+;   // which this loop is *guaranteed not to exit* via ExitingBlock. Otherwise
+;   // return SCEVCouldNotCompute.
+; For an infinite loop SE can return any number.
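+;
+; For reference, a rough C equivalent of @fn1 below, reconstructed from the
+; IR (the bottom-tested exit condition only ever sees non-negative values,
+; so the loop can never exit):
+;
+; long a;
+; volatile int x;
+; void fn1(void) {
+;   a = 0;
+;   long i = 0;
+;   do {
+;     x = 0;
+;   } while (i++ > -2);  // 0, 1, 2, ... is always > -2: infinite loop
+;   a = i;               // unreachable; matches the store in %for.end
+; }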
+
+; CHECK-LABEL: @fn1(
+define void @fn1()  {
+entry:
+  store i64 0, i64* @a, align 8
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %inc1 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+  store volatile i32 0, i32* @x, align 4
+  %inc = add nsw i64 %inc1, 1
+  %cmp = icmp sgt i64 %inc1, -2
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  %inc.lcssa = phi i64 [ %inc, %for.body ]
+  store i64 %inc.lcssa, i64* @a, align 8
+  ret void
+}

Added: llvm/trunk/test/Transforms/LoopVectorize/int_sideeffect.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/int_sideeffect.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/int_sideeffect.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/int_sideeffect.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,24 @@
+; RUN: opt -S < %s -loop-vectorize -force-vector-width=4 | FileCheck %s
+
+declare void @llvm.sideeffect()
+
+; Vectorization across a @llvm.sideeffect.
+
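+; @llvm.sideeffect does nothing by itself, but optimizers must assume it has
+; side effects; frontends may emit it so that loops with no other observable
+; effects are not deleted. A rough C equivalent of the loop below (the
+; intrinsic has no direct C spelling, so the call site is shown as a comment):
+;
+; void store_ones(float *p, unsigned long n) {
+;   unsigned long i = 0;
+;   do {
+;     /* @llvm.sideeffect() sits here in the IR */
+;     p[i] = 1.0f;
+;   } while (++i < n);
+; }
+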
+; CHECK-LABEL: store_ones
+; CHECK: store <4 x float>
+define void @store_ones(float* %p, i64 %n) nounwind {
+bb7.lr.ph:
+  br label %bb7
+
+bb7:
+  %i.02 = phi i64 [ 0, %bb7.lr.ph ], [ %tmp13, %bb7 ]
+  call void @llvm.sideeffect()
+  %tmp10 = getelementptr inbounds float, float* %p, i64 %i.02
+  store float 1.0, float* %tmp10, align 4
+  %tmp13 = add i64 %i.02, 1
+  %tmp6 = icmp ult i64 %tmp13, %n
+  br i1 %tmp6, label %bb7, label %bb14
+
+bb14:
+  ret void
+}

Added: llvm/trunk/test/Transforms/LoopVectorize/interleaved-accesses-1.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/interleaved-accesses-1.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/interleaved-accesses-1.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/interleaved-accesses-1.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,78 @@
+; RUN: opt -S -loop-vectorize -instcombine -force-vector-width=4 -force-vector-interleave=1 -enable-interleaved-mem-accesses=true < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+; Check that the interleaved-mem-access analysis identifies the access
+; to array 'in' as interleaved, despite the possibly wrapping unsigned
+; 'out_ix' index.
+;
+; In this test the interleave-groups are full (have no gaps), so no wrapping
+; checks are necessary. We can call getPtrStride with Assume=false and
+; ShouldCheckWrap=false to safely figure out that the stride is 2.
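+;
+; A simplified sketch of the query the analysis makes (the actual declaration
+; lives in llvm/Analysis/LoopAccessAnalysis.h; the variable names here are
+; illustrative):
+;
+;   int64_t Stride = getPtrStride(PSE, Ptr, TheLoop, Strides,
+;                                 /*Assume=*/false, /*ShouldCheckWrap=*/false);
+;   // For the accesses to 'in' below this returns 2, i.e. an interleave
+;   // factor of 2.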
+
+; #include <stdlib.h>
+; class Complex {
+; private:
+;  float real_;
+;  float imaginary_;
+;
+;public:
+; Complex() : real_(0), imaginary_(0) { }
+; Complex(float real, float imaginary) : real_(real), imaginary_(imaginary) { }
+; Complex(const Complex &rhs) : real_(rhs.real()), imaginary_(rhs.imaginary()) { }
+;
+; inline float real() const { return real_; }
+; inline float imaginary() const { return imaginary_; }
+;};
+;
+;void test(Complex * __restrict__ out, Complex * __restrict__ in, size_t out_start, size_t size)
+;{
+;   for (size_t out_offset = 0; out_offset < size; ++out_offset)
+;     {
+;       size_t out_ix = out_start + out_offset;
+;       Complex t0 = in[out_ix];
+;       out[out_ix] = t0;
+;     }
+;}
+
+; CHECK: vector.body:
+; CHECK: %wide.vec = load <8 x i32>, <8 x i32>* {{.*}}, align 4
+; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+
+%class.Complex = type { float, float }
+
+define void @_Z4testP7ComplexS0_mm(%class.Complex* noalias nocapture %out, %class.Complex* noalias nocapture readonly %in, i64 %out_start, i64 %size) local_unnamed_addr {
+entry:
+  %cmp9 = icmp eq i64 %size, 0
+  br i1 %cmp9, label %for.cond.cleanup, label %for.body.preheader
+
+for.body.preheader:
+  br label %for.body
+
+for.cond.cleanup.loopexit:
+  br label %for.cond.cleanup
+
+for.cond.cleanup:
+  ret void
+
+for.body:
+  %out_offset.010 = phi i64 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+  %add = add i64 %out_offset.010, %out_start
+  %arrayidx = getelementptr inbounds %class.Complex, %class.Complex* %in, i64 %add
+  %0 = bitcast %class.Complex* %arrayidx to i32*
+  %1 = load i32, i32* %0, align 4
+  %imaginary_.i.i = getelementptr inbounds %class.Complex, %class.Complex* %in, i64 %add, i32 1
+  %2 = bitcast float* %imaginary_.i.i to i32*
+  %3 = load i32, i32* %2, align 4
+  %arrayidx1 = getelementptr inbounds %class.Complex, %class.Complex* %out, i64 %add
+  %4 = bitcast %class.Complex* %arrayidx1 to i64*
+  %t0.sroa.4.0.insert.ext = zext i32 %3 to i64
+  %t0.sroa.4.0.insert.shift = shl nuw i64 %t0.sroa.4.0.insert.ext, 32
+  %t0.sroa.0.0.insert.ext = zext i32 %1 to i64
+  %t0.sroa.0.0.insert.insert = or i64 %t0.sroa.4.0.insert.shift, %t0.sroa.0.0.insert.ext
+  store i64 %t0.sroa.0.0.insert.insert, i64* %4, align 4
+  %inc = add nuw i64 %out_offset.010, 1
+  %exitcond = icmp eq i64 %inc, %size
+  br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
+}

Added: llvm/trunk/test/Transforms/LoopVectorize/interleaved-accesses-2.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/interleaved-accesses-2.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/interleaved-accesses-2.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/interleaved-accesses-2.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,58 @@
+; RUN: opt -S -loop-vectorize -instcombine -force-vector-width=4 -force-vector-interleave=1 -enable-interleaved-mem-accesses=true < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+; Check that the interleaved-mem-access analysis currently does not create an 
+; interleave group for the access to array 'in' due to the possibly wrapping 
+; unsigned 'out_ix' index.
+;
+; In this test the interleave-group of the loads is not full (has gaps), so 
+; the wrapping checks are necessary. Here they cannot be done statically, so
+; runtime checks are needed; but with Assume=false getPtrStride cannot add
+; runtime checks, and as a result we can't create the interleave-group.
+;
+; FIXME: This is currently a missed optimization until we can use Assume=true 
+; with proper threshold checks. Once we do that the candidate interleave-group
+; will not be invalidated by the wrapping checks.
+
+; #include <stdlib.h>
+; void test(float * __restrict__ out, float * __restrict__ in, size_t size)
+; {
+;    for (size_t out_offset = 0; out_offset < size; ++out_offset)
+;      {
+;        float t0 = in[2*out_offset];
+;        out[out_offset] = t0;
+;      }
+; }
+
+; CHECK: vector.body:
+; CHECK-NOT: %wide.vec = load <8 x i32>, <8 x i32>* {{.*}}, align 4
+; CHECK-NOT: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+
+define void @_Z4testPfS_m(float* noalias nocapture %out, float* noalias nocapture readonly %in, i64 %size) local_unnamed_addr {
+entry:
+  %cmp7 = icmp eq i64 %size, 0
+  br i1 %cmp7, label %for.cond.cleanup, label %for.body.preheader
+
+for.body.preheader:
+  br label %for.body
+
+for.cond.cleanup.loopexit:
+  br label %for.cond.cleanup
+
+for.cond.cleanup:
+  ret void
+
+for.body:
+  %out_offset.08 = phi i64 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+  %mul = shl i64 %out_offset.08, 1
+  %arrayidx = getelementptr inbounds float, float* %in, i64 %mul
+  %0 = bitcast float* %arrayidx to i32*
+  %1 = load i32, i32* %0, align 4
+  %arrayidx1 = getelementptr inbounds float, float* %out, i64 %out_offset.08
+  %2 = bitcast float* %arrayidx1 to i32*
+  store i32 %1, i32* %2, align 4
+  %inc = add nuw i64 %out_offset.08, 1
+  %exitcond = icmp eq i64 %inc, %size
+  br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
+}

Added: llvm/trunk/test/Transforms/LoopVectorize/interleaved-accesses-3.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/interleaved-accesses-3.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/interleaved-accesses-3.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/interleaved-accesses-3.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,57 @@
+; RUN: opt -S -loop-vectorize -instcombine -force-vector-width=4 -force-vector-interleave=1 -enable-interleaved-mem-accesses=true < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+; Check that the interleaved-mem-access analysis currently does not create an 
+; interleave group for access 'a' due to the possible pointer wrap-around.
+;
+; To begin with, in this test the candidate interleave group can be created 
+; only when getPtrStride is called with Assume=true. Next, because
+; the interleave-group of the loads is not full (has gaps), we also need to check 
+; for possible pointer wrapping. Here we currently use Assume=false and as a 
+; result cannot prove the transformation is safe and therefore invalidate the
+; candidate interleave group.
+;
+; FIXME: This is a missed optimization. Once we use Assume=true here, we will
+; not have to invalidate the group.
+
+; void func(unsigned * __restrict a, unsigned * __restrict b, unsigned char x, unsigned char y) {
+;  int i = 0;
+;  for (unsigned char index = x; i < y; index +=2, ++i)
+;    b[i] = a[index] * 2;
+;
+; }
+
+; CHECK: vector.body:
+; CHECK-NOT: %wide.vec = load <8 x i32>, <8 x i32>* {{.*}}, align 4
+; CHECK-NOT: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+
+define void @_Z4funcPjS_hh(i32* noalias nocapture readonly %a, i32* noalias nocapture %b, i8 zeroext %x, i8 zeroext %y) local_unnamed_addr {
+entry:
+  %cmp9 = icmp eq i8 %y, 0
+  br i1 %cmp9, label %for.cond.cleanup, label %for.body.preheader
+
+for.body.preheader:
+  %wide.trip.count = zext i8 %y to i64
+  br label %for.body
+
+for.cond.cleanup.loopexit:
+  br label %for.cond.cleanup
+
+for.cond.cleanup:
+  ret void
+
+for.body:
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
+  %index.011 = phi i8 [ %add, %for.body ], [ %x, %for.body.preheader ]
+  %idxprom = zext i8 %index.011 to i64
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %idxprom
+  %0 = load i32, i32* %arrayidx, align 4
+  %mul = shl i32 %0, 1
+  %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv
+  store i32 %mul, i32* %arrayidx2, align 4
+  %add = add i8 %index.011, 2
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
+}

Added: llvm/trunk/test/Transforms/LoopVectorize/interleaved-accesses-alias.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/interleaved-accesses-alias.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/interleaved-accesses-alias.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/interleaved-accesses-alias.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,63 @@
+; RUN: opt -S -loop-vectorize -force-vector-width=2 -force-vector-interleave=1 -enable-interleaved-mem-accesses=true < %s | FileCheck %s
+
+; When merging two stores with interleaved access vectorization, make sure we
+; propagate the alias information from all scalar stores to form the most
+; generic alias info.
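+;
+; For reference, a rough C equivalent of @foobar below, reconstructed from
+; the IR and its TBAA metadata (the declarations are assumptions):
+;
+; struct Vec4r { double x, y, z, w; };
+; struct Vec2r { double x, y; };
+; void foobar(const struct Vec4r *p, int i) {
+;   struct Vec2r cp[20];
+;   for (int k = 0; k < 4; k++) {
+;     cp[k].x = p[k].x * 2.0;  // scalar store tagged as Vec2r's first field
+;     cp[k].y = p[k].y * 3.0;  // scalar store tagged as Vec2r's second field
+;   }
+;   g(cp);
+; }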
+
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+target triple = "arm64-apple-ios5.0.0"
+
+%struct.Vec4r = type { double, double, double, double }
+%struct.Vec2r = type { double, double }
+
+define void @foobar(%struct.Vec4r* nocapture readonly %p, i32 %i)
+{
+entry:
+  %cp = alloca [20 x %struct.Vec2r], align 8
+  %0 = bitcast [20 x %struct.Vec2r]* %cp to i8*
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body
+  %arraydecay = getelementptr inbounds [20 x %struct.Vec2r], [20 x %struct.Vec2r]* %cp, i64 0, i64 0
+  call void @g(%struct.Vec2r* nonnull %arraydecay) #4
+  ret void
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %x = getelementptr inbounds %struct.Vec4r, %struct.Vec4r* %p, i64 %indvars.iv, i32 0
+  %1 = load double, double* %x, align 8, !tbaa !3
+  %mul = fmul double %1, 2.000000e+00
+  %x4 = getelementptr inbounds [20 x %struct.Vec2r], [20 x %struct.Vec2r]* %cp, i64 0, i64 %indvars.iv, i32 0
+
+; The new store should alias any double rather than one of the fields of Vec2r.
+; CHECK: store <4 x double> {{.*}} !tbaa ![[STORE_TBAA:[0-9]+]]
+; CHECK-DAG: ![[DOUBLE_TBAA:[0-9]+]] = !{!"double", !{{[0-9]+}}, i64 0}
+; CHECK-DAG: ![[STORE_TBAA]] = !{![[DOUBLE_TBAA]], ![[DOUBLE_TBAA]], i64 0}
+  store double %mul, double* %x4, align 8, !tbaa !8
+  %y = getelementptr inbounds %struct.Vec4r, %struct.Vec4r* %p, i64 %indvars.iv, i32 1
+  %2 = load double, double* %y, align 8, !tbaa !10
+  %mul7 = fmul double %2, 3.000000e+00
+  %y10 = getelementptr inbounds [20 x %struct.Vec2r], [20 x %struct.Vec2r]* %cp, i64 0, i64 %indvars.iv, i32 1
+  store double %mul7, double* %y10, align 8, !tbaa !11
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 4
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+declare void @g(%struct.Vec2r*)
+
+!llvm.module.flags = !{!0, !1}
+!llvm.ident = !{!2}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 7, !"PIC Level", i32 2}
+!2 = !{!"clang version 6.0.0 (trunk 319007) (llvm/trunk 319324)"}
+!3 = !{!4, !5, i64 0}
+!4 = !{!"Vec4r", !5, i64 0, !5, i64 8, !5, i64 16, !5, i64 24}
+!5 = !{!"double", !6, i64 0}
+!6 = !{!"omnipotent char", !7, i64 0}
+!7 = !{!"Simple C/C++ TBAA"}
+!8 = !{!9, !5, i64 0}
+!9 = !{!"Vec2r", !5, i64 0, !5, i64 8}
+!10 = !{!4, !5, i64 8}
+!11 = !{!9, !5, i64 8}

Added: llvm/trunk/test/Transforms/LoopVectorize/interleaved-accesses-masked-group.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/interleaved-accesses-masked-group.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/interleaved-accesses-masked-group.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/interleaved-accesses-masked-group.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,222 @@
+; REQUIRES: asserts
+; RUN: opt -S -loop-vectorize -force-vector-width=8 -force-vector-interleave=1 -enable-interleaved-mem-accesses -debug-only=loop-vectorize,vectorutils -disable-output < %s 2>&1 | FileCheck %s -check-prefix=STRIDED_UNMASKED
+; RUN: opt -S -loop-vectorize -force-vector-width=8 -force-vector-interleave=1 -enable-interleaved-mem-accesses -enable-masked-interleaved-mem-accesses -debug-only=loop-vectorize,vectorutils -disable-output < %s 2>&1 | FileCheck %s -check-prefix=STRIDED_MASKED
+
+target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
+
+; We test here that the loop-vectorizer forms an interleave-group from
+; predicated memory accesses only if they are both in the same (predicated)
+; block (first scenario below).
+; If the accesses are not in the same predicated block, an interleave-group
+; is not formed (scenarios 2,3 below).
+
+; Scenario 1: Check the case where it is legal to create masked interleave-
+; groups. Altogether two groups are created (one for loads and one for stores)
+; when masked-interleaved-accesses are enabled. When masked-interleaved-accesses
+; are disabled we do not create any interleave-group.
+;
+; void masked_strided1(const unsigned char* restrict p,
+;                     unsigned char* restrict q,
+;                     unsigned char guard) {
+; for(ix=0; ix < 1024; ++ix) {
+;     if (ix > guard) {
+;         char left = p[2*ix];
+;         char right = p[2*ix + 1];
+;         char max = max(left, right);
+;         q[2*ix] = max;
+;         q[2*ix+1] = 0 - max;
+;     }
+; }
+;}
+
+
+; STRIDED_UNMASKED: LV: Checking a loop in "masked_strided1" 
+; STRIDED_UNMASKED: LV: Analyzing interleaved accesses...
+; STRIDED_UNMASKED-NOT: LV: Creating an interleave group 
+
+; STRIDED_MASKED: LV: Checking a loop in "masked_strided1" 
+; STRIDED_MASKED: LV: Analyzing interleaved accesses...
+; STRIDED_MASKED-NEXT: LV: Creating an interleave group with:  store i8 %{{.*}}, i8* %{{.*}}, align 1
+; STRIDED_MASKED-NEXT: LV: Inserted:  store i8  %{{.*}}, i8* %{{.*}}, align 1
+; STRIDED_MASKED-NEXT:     into the interleave group with  store i8 %{{.*}}, i8* %{{.*}}, align 1
+; STRIDED_MASKED-NEXT: LV: Creating an interleave group with:   %{{.*}} = load i8, i8* %{{.*}}, align 1
+; STRIDED_MASKED-NEXT: LV: Inserted:  %{{.*}} = load i8, i8* %{{.*}}, align 1
+; STRIDED_MASKED-NEXT:     into the interleave group with   %{{.*}} = load i8, i8* %{{.*}}, align 1
+
+; Scenario 2: Check the case where it is illegal to create a masked interleave-
+; group because the first access is predicated, and the second isn't.
+; We therefore create a separate interleave-group with gaps for each of the
+; stores (if masked-interleaved-accesses are enabled) and these are later
+; invalidated because interleave-groups of stores with gaps are not supported. 
+; If masked-interleaved-accesses are not enabled, we create only one interleave
+; group of stores (for the non-predicated store) and it is later invalidated
+; due to gaps.
+;
+; void masked_strided2(const unsigned char* restrict p,
+;                     unsigned char* restrict q,
+;                     unsigned char guard1,
+;                     unsigned char guard2) {
+; for(ix=0; ix < 1024; ++ix) {
+;     if (ix > guard1) {
+;         q[2*ix] = 1;
+;     }
+;     q[2*ix+1] = 2;
+; }
+;}
+
+; STRIDED_UNMASKED: LV: Checking a loop in "masked_strided2" 
+; STRIDED_UNMASKED: LV: Analyzing interleaved accesses...
+; STRIDED_UNMASKED-NEXT: LV: Creating an interleave group with:  store i8 1, i8* %{{.*}}, align 1
+; STRIDED_UNMASKED-NEXT: LV: Invalidate candidate interleaved store group due to gaps.
+; STRIDED_UNMASKED-NOT: LV: Creating an interleave group 
+
+; STRIDED_MASKED: LV: Checking a loop in "masked_strided2" 
+; STRIDED_MASKED: LV: Analyzing interleaved accesses...
+; STRIDED_MASKED-NEXT: LV: Creating an interleave group with:  store i8 2, i8* %{{.*}}, align 1
+; STRIDED_MASKED-NEXT: LV: Creating an interleave group with:  store i8 1, i8* %{{.*}}, align 1
+; STRIDED_MASKED-NEXT: LV: Invalidate candidate interleaved store group due to gaps.
+; STRIDED_MASKED-NEXT: LV: Invalidate candidate interleaved store group due to gaps.
+
+
+; Scenario 3: Check the case where it is illegal to create a masked interleave-
+; group because the two accesses are in separate predicated blocks.
+; We therefore create a separate interleave-group with gaps for each of the
+; accesses (which are later invalidated because interleave-groups of stores
+; with gaps are not supported).
+; If masked-interleaved-accesses are not enabled, we don't create any interleave
+; group because all accesses are predicated.
+;
+; void masked_strided3(const unsigned char* restrict p,
+;                     unsigned char* restrict q,
+;                     unsigned char guard1,
+;                     unsigned char guard2) {
+; for(ix=0; ix < 1024; ++ix) {
+;     if (ix > guard1) {
+;         q[2*ix] = 1;
+;     }
+;     if (ix > guard2) {
+;         q[2*ix+1] = 2;
+;     }
+; }
+;}
+
+
+; STRIDED_UNMASKED: LV: Checking a loop in "masked_strided3" 
+; STRIDED_UNMASKED: LV: Analyzing interleaved accesses...
+; STRIDED_UNMASKED-NOT: LV: Creating an interleave group 
+
+; STRIDED_MASKED: LV: Checking a loop in "masked_strided3" 
+; STRIDED_MASKED: LV: Analyzing interleaved accesses...
+; STRIDED_MASKED-NEXT: LV: Creating an interleave group with:  store i8 2, i8* %{{.*}}, align 1
+; STRIDED_MASKED-NEXT: LV: Creating an interleave group with:  store i8 1, i8* %{{.*}}, align 1
+; STRIDED_MASKED-NEXT: LV: Invalidate candidate interleaved store group due to gaps.
+; STRIDED_MASKED-NEXT: LV: Invalidate candidate interleaved store group due to gaps.
+
+
+; ModuleID = 'test.c'
+source_filename = "test.c"
+target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
+target triple = "i386-unknown-linux-gnu"
+
+define dso_local void @masked_strided1(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 zeroext %guard) local_unnamed_addr #0 {
+entry:
+  %conv = zext i8 %guard to i32
+  br label %for.body
+
+for.body:
+  %ix.024 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+  %cmp1 = icmp ugt i32 %ix.024, %conv
+  br i1 %cmp1, label %if.then, label %for.inc
+
+if.then:
+  %mul = shl nuw nsw i32 %ix.024, 1
+  %arrayidx = getelementptr inbounds i8, i8* %p, i32 %mul
+  %0 = load i8, i8* %arrayidx, align 1
+  %add = or i32 %mul, 1
+  %arrayidx4 = getelementptr inbounds i8, i8* %p, i32 %add
+  %1 = load i8, i8* %arrayidx4, align 1
+  %cmp.i = icmp slt i8 %0, %1
+  %spec.select.i = select i1 %cmp.i, i8 %1, i8 %0
+  %arrayidx6 = getelementptr inbounds i8, i8* %q, i32 %mul
+  store i8 %spec.select.i, i8* %arrayidx6, align 1
+  %sub = sub i8 0, %spec.select.i
+  %arrayidx11 = getelementptr inbounds i8, i8* %q, i32 %add
+  store i8 %sub, i8* %arrayidx11, align 1
+  br label %for.inc
+
+for.inc:
+  %inc = add nuw nsw i32 %ix.024, 1
+  %exitcond = icmp eq i32 %inc, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+
+define dso_local void @masked_strided2(i8* noalias nocapture readnone %p, i8* noalias nocapture %q, i8 zeroext %guard) local_unnamed_addr #0 {
+entry:
+  %conv = zext i8 %guard to i32
+  br label %for.body
+
+for.body:
+  %ix.012 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+  %mul = shl nuw nsw i32 %ix.012, 1
+  %arrayidx = getelementptr inbounds i8, i8* %q, i32 %mul
+  store i8 1, i8* %arrayidx, align 1
+  %cmp1 = icmp ugt i32 %ix.012, %conv
+  br i1 %cmp1, label %if.then, label %for.inc
+
+if.then:
+  %add = or i32 %mul, 1
+  %arrayidx3 = getelementptr inbounds i8, i8* %q, i32 %add
+  store i8 2, i8* %arrayidx3, align 1
+  br label %for.inc
+
+for.inc:
+  %inc = add nuw nsw i32 %ix.012, 1
+  %exitcond = icmp eq i32 %inc, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+
+define dso_local void @masked_strided3(i8* noalias nocapture readnone %p, i8* noalias nocapture %q, i8 zeroext %guard1, i8 zeroext %guard2) local_unnamed_addr #0 {
+entry:
+  %conv = zext i8 %guard1 to i32
+  %conv3 = zext i8 %guard2 to i32
+  br label %for.body
+
+for.body:
+  %ix.018 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+  %mul = shl nuw nsw i32 %ix.018, 1
+  %cmp1 = icmp ugt i32 %ix.018, %conv
+  br i1 %cmp1, label %if.then, label %if.end
+
+if.then:
+  %arrayidx = getelementptr inbounds i8, i8* %q, i32 %mul
+  store i8 1, i8* %arrayidx, align 1
+  br label %if.end
+
+if.end:
+  %cmp4 = icmp ugt i32 %ix.018, %conv3
+  br i1 %cmp4, label %if.then6, label %for.inc
+
+if.then6:
+  %add = or i32 %mul, 1
+  %arrayidx7 = getelementptr inbounds i8, i8* %q, i32 %add
+  store i8 2, i8* %arrayidx7, align 1
+  br label %for.inc
+
+for.inc:
+  %inc = add nuw nsw i32 %ix.018, 1
+  %exitcond = icmp eq i32 %inc, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+attributes #0 = {  "target-features"="+fxsr,+mmx,+sse,+sse2,+x87"  }

Added: llvm/trunk/test/Transforms/LoopVectorize/interleaved-accesses-pred-stores.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/interleaved-accesses-pred-stores.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/interleaved-accesses-pred-stores.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/interleaved-accesses-pred-stores.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,165 @@
+; RUN: opt -S -loop-vectorize -instcombine -force-vector-width=2 -force-vector-interleave=1 -enable-interleaved-mem-accesses < %s | FileCheck %s
+; RUN: opt -S -loop-vectorize -instcombine -force-vector-width=2 -force-vector-interleave=1 -enable-interleaved-mem-accesses -enable-masked-interleaved-mem-accesses < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+%pair = type { i64, i64 }
+
+; Ensure that we vectorize the interleaved load group even though the loop
+; contains a conditional store. The store group contains gaps and is not
+; vectorized.
+;
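+; A rough C equivalent of the loop, reconstructed from the IR (note that the
+; conditional store writes back the value it just loaded):
+;
+; struct pair { long x, y; };
+; void interleaved_with_cond_store_0(struct pair *p, long x, long n) {
+;   for (long i = 0; i < n; i++)
+;     if (p[i].y == x)
+;       p[i].y = p[i].y;
+; }
+;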
+; CHECK-LABEL: @interleaved_with_cond_store_0(
+;
+; CHECK: vector.ph
+; CHECK:   %n.mod.vf = and i64 %[[N:.+]], 1
+; CHECK:   %[[IsZero:[a-zA-Z0-9]+]] = icmp eq i64 %n.mod.vf, 0
+; CHECK:   %[[R:.+]] = select i1 %[[IsZero]], i64 2, i64 %n.mod.vf
+; CHECK:   %n.vec = sub nsw i64 %[[N]], %[[R]]
+;
+; CHECK: vector.body:
+; CHECK:   %wide.vec = load <4 x i64>, <4 x i64>* %{{.*}}
+; CHECK:   %strided.vec = shufflevector <4 x i64> %wide.vec, <4 x i64> undef, <2 x i32> <i32 0, i32 2>
+;
+; CHECK: pred.store.if
+; CHECK:   %[[X1:.+]] = extractelement <4 x i64> %wide.vec, i32 0
+; CHECK:   store i64 %[[X1]], {{.*}}
+;
+; CHECK: pred.store.if
+; CHECK:   %[[X2:.+]] = extractelement <4 x i64> %wide.vec, i32 2
+; CHECK:   store i64 %[[X2]], {{.*}}
+
+define void @interleaved_with_cond_store_0(%pair *%p, i64 %x, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i  = phi i64 [ %i.next, %if.merge ], [ 0, %entry ]
+  %p.1 = getelementptr inbounds %pair, %pair* %p, i64 %i, i32 1
+  %0 = load i64, i64* %p.1, align 8
+  %1 = icmp eq i64 %0, %x
+  br i1 %1, label %if.then, label %if.merge
+
+if.then:
+  store i64 %0, i64* %p.1, align 8
+  br label %if.merge
+
+if.merge:
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
+
+; Ensure that we don't form a single interleaved group for the two loads. The
+; conditional store prevents the second load from being hoisted. The two load
+; groups are separately vectorized. The store group contains gaps and is not
+; vectorized.
+;
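+; A rough C equivalent of the loop, reconstructed from the IR:
+;
+; void interleaved_with_cond_store_1(struct pair *p, long x, long n) {
+;   for (long i = 0; i < n; i++) {
+;     if (p[i].y == x)
+;       p[i].x = p[i].y;  // conditional store to the x field
+;     p[i].y = p[i].x;    // unconditional copy back to the y field
+;   }
+; }
+;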
+; CHECK-LABEL: @interleaved_with_cond_store_1(
+;
+; CHECK: vector.ph
+; CHECK:   %n.mod.vf = and i64 %[[N:.+]], 1
+; CHECK:   %[[IsZero:[a-zA-Z0-9]+]] = icmp eq i64 %n.mod.vf, 0
+; CHECK:   %[[R:.+]] = select i1 %[[IsZero]], i64 2, i64 %n.mod.vf
+; CHECK:   %n.vec = sub nsw i64 %[[N]], %[[R]]
+;
+; CHECK: vector.body:
+; CHECK:   %[[L1:.+]] = load <4 x i64>, <4 x i64>* %{{.*}}
+; CHECK:   %strided.vec = shufflevector <4 x i64> %[[L1]], <4 x i64> undef, <2 x i32> <i32 0, i32 2>
+;
+; CHECK: pred.store.if
+; CHECK:   %[[X1:.+]] = extractelement <4 x i64> %wide.vec, i32 0
+; CHECK:   store i64 %[[X1]], {{.*}}
+;
+; CHECK: pred.store.if
+; CHECK:   %[[X2:.+]] = extractelement <4 x i64> %wide.vec, i32 2
+; CHECK:   store i64 %[[X2]], {{.*}}
+;
+; CHECK: pred.store.continue
+; CHECK:   %[[L2:.+]] = load <4 x i64>, <4 x i64>* {{.*}}
+; CHECK:   %[[X3:.+]] = extractelement <4 x i64> %[[L2]], i32 0
+; CHECK:   store i64 %[[X3]], {{.*}}
+; CHECK:   %[[X4:.+]] = extractelement <4 x i64> %[[L2]], i32 2
+; CHECK:   store i64 %[[X4]], {{.*}}
+
+define void @interleaved_with_cond_store_1(%pair *%p, i64 %x, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i  = phi i64 [ %i.next, %if.merge ], [ 0, %entry ]
+  %p.0 = getelementptr inbounds %pair, %pair* %p, i64 %i, i32 0
+  %p.1 = getelementptr inbounds %pair, %pair* %p, i64 %i, i32 1
+  %0 = load i64, i64* %p.1, align 8
+  %1 = icmp eq i64 %0, %x
+  br i1 %1, label %if.then, label %if.merge
+
+if.then:
+  store i64 %0, i64* %p.0, align 8
+  br label %if.merge
+
+if.merge:
+  %2 = load i64, i64* %p.0, align 8
+  store i64 %2, i64 *%p.1, align 8
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
+
+; Ensure that we don't create a single interleaved group for the two stores.
+; The second store is conditional and we can't sink the first store inside the
+; predicated block. The load group is vectorized, and the store groups contain
+; gaps and are not vectorized.
+;
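+; A rough C equivalent of the loop, reconstructed from the IR:
+;
+; void interleaved_with_cond_store_2(struct pair *p, long x, long n) {
+;   for (long i = 0; i < n; i++) {
+;     long t = p[i].y;
+;     p[i].x = x;      // unconditional store to the x field
+;     if (t == x)
+;       p[i].y = t;    // conditional store back to the y field
+;   }
+; }
+;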
+; CHECK-LABEL: @interleaved_with_cond_store_2(
+;
+; CHECK: vector.ph
+; CHECK:   %n.mod.vf = and i64 %[[N:.+]], 1
+; CHECK:   %[[IsZero:[a-zA-Z0-9]+]] = icmp eq i64 %n.mod.vf, 0
+; CHECK:   %[[R:.+]] = select i1 %[[IsZero]], i64 2, i64 %n.mod.vf
+; CHECK:   %n.vec = sub nsw i64 %[[N]], %[[R]]
+;
+; CHECK: vector.body:
+; CHECK:   %[[L1:.+]] = load <4 x i64>, <4 x i64>* %{{.*}}
+; CHECK:   %strided.vec = shufflevector <4 x i64> %[[L1]], <4 x i64> undef, <2 x i32> <i32 0, i32 2>
+; CHECK:   store i64 %x, {{.*}}
+; CHECK:   store i64 %x, {{.*}}
+;
+; CHECK: pred.store.if
+; CHECK:   %[[X1:.+]] = extractelement <4 x i64> %wide.vec, i32 0
+; CHECK:   store i64 %[[X1]], {{.*}}
+;
+; CHECK: pred.store.if
+; CHECK:   %[[X2:.+]] = extractelement <4 x i64> %wide.vec, i32 2
+; CHECK:   store i64 %[[X2]], {{.*}}
+
+define void @interleaved_with_cond_store_2(%pair *%p, i64 %x, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i  = phi i64 [ %i.next, %if.merge ], [ 0, %entry ]
+  %p.0 = getelementptr inbounds %pair, %pair* %p, i64 %i, i32 0
+  %p.1 = getelementptr inbounds %pair, %pair* %p, i64 %i, i32 1
+  %0 = load i64, i64* %p.1, align 8
+  store i64 %x, i64* %p.0, align 8
+  %1 = icmp eq i64 %0, %x
+  br i1 %1, label %if.then, label %if.merge
+
+if.then:
+  store i64 %0, i64* %p.1, align 8
+  br label %if.merge
+
+if.merge:
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}

Added: llvm/trunk/test/Transforms/LoopVectorize/interleaved-accesses.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/interleaved-accesses.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/interleaved-accesses.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/interleaved-accesses.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,921 @@
+; RUN: opt -S -loop-vectorize -instcombine -force-vector-width=4 -force-vector-interleave=1 -enable-interleaved-mem-accesses=true -runtime-memory-check-threshold=24 < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+
+; Check vectorization on an interleaved load group of factor 2 and an interleaved
+; store group of factor 2.
+
+; int AB[1024];
+; int CD[1024];
+;  void test_array_load2_store2(int C, int D) {
+;   for (int i = 0; i < 1024; i+=2) {
+;     int A = AB[i];
+;     int B = AB[i+1];
+;     CD[i] = A + C;
+;     CD[i+1] = B * D;
+;   }
+; }
+
+; CHECK-LABEL: @test_array_load2_store2(
+; CHECK: %wide.vec = load <8 x i32>, <8 x i32>* %{{.*}}, align 4
+; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK: add nsw <4 x i32>
+; CHECK: mul nsw <4 x i32>
+; CHECK: %interleaved.vec = shufflevector <4 x i32> {{.*}}, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; CHECK: store <8 x i32> %interleaved.vec, <8 x i32>* %{{.*}}, align 4
+
+ at AB = common global [1024 x i32] zeroinitializer, align 4
+ at CD = common global [1024 x i32] zeroinitializer, align 4
+
+define void @test_array_load2_store2(i32 %C, i32 %D) {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx0 = getelementptr inbounds [1024 x i32], [1024 x i32]* @AB, i64 0, i64 %indvars.iv
+  %tmp = load i32, i32* %arrayidx0, align 4
+  %tmp1 = or i64 %indvars.iv, 1
+  %arrayidx1 = getelementptr inbounds [1024 x i32], [1024 x i32]* @AB, i64 0, i64 %tmp1
+  %tmp2 = load i32, i32* %arrayidx1, align 4
+  %add = add nsw i32 %tmp, %C
+  %mul = mul nsw i32 %tmp2, %D
+  %arrayidx2 = getelementptr inbounds [1024 x i32], [1024 x i32]* @CD, i64 0, i64 %indvars.iv
+  store i32 %add, i32* %arrayidx2, align 4
+  %arrayidx3 = getelementptr inbounds [1024 x i32], [1024 x i32]* @CD, i64 0, i64 %tmp1
+  store i32 %mul, i32* %arrayidx3, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2
+  %cmp = icmp slt i64 %indvars.iv.next, 1024
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+; int A[3072];
+; struct ST3 S[1024];
+; void test_struct_array_load3_store3() {
+;   int *ptr = A;
+;   for (int i = 0; i < 1024; i++) {
+;     int X1 = *ptr++;
+;     int X2 = *ptr++;
+;     int X3 = *ptr++;
+;     S[i].x = X1 + 1;
+;     S[i].y = X2 + 2;
+;     S[i].z = X3 + 3;
+;   }
+; }
+
+; CHECK-LABEL: @test_struct_array_load3_store3(
+; CHECK: %wide.vec = load <12 x i32>, <12 x i32>* {{.*}}, align 4
+; CHECK: shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
+; CHECK: shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
+; CHECK: shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
+; CHECK: add nsw <4 x i32> {{.*}}, <i32 1, i32 1, i32 1, i32 1>
+; CHECK: add nsw <4 x i32> {{.*}}, <i32 2, i32 2, i32 2, i32 2>
+; CHECK: add nsw <4 x i32> {{.*}}, <i32 3, i32 3, i32 3, i32 3>
+; CHECK: shufflevector <4 x i32> {{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK: shufflevector <4 x i32> {{.*}}, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK: %interleaved.vec = shufflevector <8 x i32> {{.*}}, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
+; CHECK: store <12 x i32> %interleaved.vec, <12 x i32>* {{.*}}, align 4
+
+%struct.ST3 = type { i32, i32, i32 }
+ at A = common global [3072 x i32] zeroinitializer, align 4
+ at S = common global [1024 x %struct.ST3] zeroinitializer, align 4
+
+define void @test_struct_array_load3_store3() {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %ptr.016 = phi i32* [ getelementptr inbounds ([3072 x i32], [3072 x i32]* @A, i64 0, i64 0), %entry ], [ %incdec.ptr2, %for.body ]
+  %incdec.ptr = getelementptr inbounds i32, i32* %ptr.016, i64 1
+  %tmp = load i32, i32* %ptr.016, align 4
+  %incdec.ptr1 = getelementptr inbounds i32, i32* %ptr.016, i64 2
+  %tmp1 = load i32, i32* %incdec.ptr, align 4
+  %incdec.ptr2 = getelementptr inbounds i32, i32* %ptr.016, i64 3
+  %tmp2 = load i32, i32* %incdec.ptr1, align 4
+  %add = add nsw i32 %tmp, 1
+  %x = getelementptr inbounds [1024 x %struct.ST3], [1024 x %struct.ST3]* @S, i64 0, i64 %indvars.iv, i32 0
+  store i32 %add, i32* %x, align 4
+  %add3 = add nsw i32 %tmp1, 2
+  %y = getelementptr inbounds [1024 x %struct.ST3], [1024 x %struct.ST3]* @S, i64 0, i64 %indvars.iv, i32 1
+  store i32 %add3, i32* %y, align 4
+  %add6 = add nsw i32 %tmp2, 3
+  %z = getelementptr inbounds [1024 x %struct.ST3], [1024 x %struct.ST3]* @S, i64 0, i64 %indvars.iv, i32 2
+  store i32 %add6, i32* %z, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+; Check vectorization on an interleaved load group of factor 4.
+
+; struct ST4{
+;   int x;
+;   int y;
+;   int z;
+;   int w;
+; };
+; int test_struct_load4(struct ST4 *S) {
+;   int r = 0;
+;   for (int i = 0; i < 1024; i++) {
+;      r += S[i].x;
+;      r -= S[i].y;
+;      r += S[i].z;
+;      r -= S[i].w;
+;   }
+;   return r;
+; }
+
+; CHECK-LABEL: @test_struct_load4(
+; CHECK: %wide.vec = load <16 x i32>, <16 x i32>* {{.*}}, align 4
+; CHECK: shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
+; CHECK: shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
+; CHECK: shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
+; CHECK: shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
+; CHECK: add nsw <4 x i32>
+; CHECK: sub <4 x i32>
+; CHECK: add nsw <4 x i32>
+; CHECK: sub <4 x i32>
+
+%struct.ST4 = type { i32, i32, i32, i32 }
+
+define i32 @test_struct_load4(%struct.ST4* nocapture readonly %S) {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %r.022 = phi i32 [ 0, %entry ], [ %sub8, %for.body ]
+  %x = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 0
+  %tmp = load i32, i32* %x, align 4
+  %add = add nsw i32 %tmp, %r.022
+  %y = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 1
+  %tmp1 = load i32, i32* %y, align 4
+  %sub = sub i32 %add, %tmp1
+  %z = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 2
+  %tmp2 = load i32, i32* %z, align 4
+  %add5 = add nsw i32 %sub, %tmp2
+  %w = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 3
+  %tmp3 = load i32, i32* %w, align 4
+  %sub8 = sub i32 %add5, %tmp3
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret i32 %sub8
+}
+
+; Check vectorization on an interleaved store group of factor 4.
+
+; void test_struct_store4(int *A, struct ST4 *B) {
+;   int *ptr = A;
+;   for (int i = 0; i < 1024; i++) {
+;     int X = *ptr++;
+;     B[i].x = X + 1;
+;     B[i].y = X * 2;
+;     B[i].z = X + 3;
+;     B[i].w = X + 4;
+;   }
+; }
+
+; CHECK-LABEL: @test_struct_store4(
+; CHECK: %[[LD:.*]] = load <4 x i32>, <4 x i32>* 
+; CHECK: add nsw <4 x i32> %[[LD]], <i32 1, i32 1, i32 1, i32 1>
+; CHECK: shl nsw <4 x i32> %[[LD]], <i32 1, i32 1, i32 1, i32 1>
+; CHECK: add nsw <4 x i32> %[[LD]], <i32 3, i32 3, i32 3, i32 3>
+; CHECK: add nsw <4 x i32> %[[LD]], <i32 4, i32 4, i32 4, i32 4>
+; CHECK: shufflevector <4 x i32> {{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK: shufflevector <4 x i32> {{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK: %interleaved.vec = shufflevector <8 x i32> {{.*}}, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
+; CHECK: store <16 x i32> %interleaved.vec, <16 x i32>* {{.*}}, align 4
+
+define void @test_struct_store4(i32* noalias nocapture readonly %A, %struct.ST4* noalias nocapture %B) {
+entry:
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body
+  ret void
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %ptr.024 = phi i32* [ %A, %entry ], [ %incdec.ptr, %for.body ]
+  %incdec.ptr = getelementptr inbounds i32, i32* %ptr.024, i64 1
+  %tmp = load i32, i32* %ptr.024, align 4
+  %add = add nsw i32 %tmp, 1
+  %x = getelementptr inbounds %struct.ST4, %struct.ST4* %B, i64 %indvars.iv, i32 0
+  store i32 %add, i32* %x, align 4
+  %mul = shl nsw i32 %tmp, 1
+  %y = getelementptr inbounds %struct.ST4, %struct.ST4* %B, i64 %indvars.iv, i32 1
+  store i32 %mul, i32* %y, align 4
+  %add3 = add nsw i32 %tmp, 3
+  %z = getelementptr inbounds %struct.ST4, %struct.ST4* %B, i64 %indvars.iv, i32 2
+  store i32 %add3, i32* %z, align 4
+  %add6 = add nsw i32 %tmp, 4
+  %w = getelementptr inbounds %struct.ST4, %struct.ST4* %B, i64 %indvars.iv, i32 3
+  store i32 %add6, i32* %w, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1024
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+; Check vectorization on a reverse interleaved load group of factor 2 and
+; a reverse interleaved store group of factor 2.
+
+; struct ST2 {
+;  int x;
+;  int y;
+; };
+;
+; void test_reversed_load2_store2(struct ST2 *A, struct ST2 *B) {
+;   for (int i = 1023; i >= 0; i--) {
+;     int a = A[i].x + i;  // interleaved load of index 0
+;     int b = A[i].y - i;  // interleaved load of index 1
+;     B[i].x = a;          // interleaved store of index 0
+;     B[i].y = b;          // interleaved store of index 1
+;   }
+; }
+
+; CHECK-LABEL: @test_reversed_load2_store2(
+; CHECK: %[[G0:.+]] = getelementptr inbounds %struct.ST2, %struct.ST2* %A, i64 %offset.idx, i32 0
+; CHECK: %[[G1:.+]] = getelementptr inbounds i32, i32* %[[G0]], i64 -6
+; CHECK: %[[B0:.+]] = bitcast i32* %[[G1]] to <8 x i32>*
+; CHECK: %wide.vec = load <8 x i32>, <8 x i32>* %[[B0]], align 4
+; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK: shufflevector <4 x i32> {{.*}}, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK: shufflevector <4 x i32> {{.*}}, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK: add nsw <4 x i32>
+; CHECK: sub nsw <4 x i32>
+; CHECK: %[[G2:.+]] = getelementptr inbounds %struct.ST2, %struct.ST2* %B, i64 %offset.idx, i32 1
+; CHECK: %[[G3:.+]] = getelementptr inbounds i32, i32* %[[G2]], i64 -7
+; CHECK: %[[B1:.+]] = bitcast i32* %[[G3]] to <8 x i32>*
+; CHECK: shufflevector <4 x i32> {{.*}}, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK: shufflevector <4 x i32> {{.*}}, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK: %interleaved.vec = shufflevector <4 x i32> {{.*}}, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; CHECK: store <8 x i32> %interleaved.vec, <8 x i32>* %[[B1]], align 4
+
+%struct.ST2 = type { i32, i32 }
+
+define void @test_reversed_load2_store2(%struct.ST2* noalias nocapture readonly %A, %struct.ST2* noalias nocapture %B) {
+entry:
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body
+  ret void
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 1023, %entry ], [ %indvars.iv.next, %for.body ]
+  %x = getelementptr inbounds %struct.ST2, %struct.ST2* %A, i64 %indvars.iv, i32 0
+  %tmp = load i32, i32* %x, align 4
+  %tmp1 = trunc i64 %indvars.iv to i32
+  %add = add nsw i32 %tmp, %tmp1
+  %y = getelementptr inbounds %struct.ST2, %struct.ST2* %A, i64 %indvars.iv, i32 1
+  %tmp2 = load i32, i32* %y, align 4
+  %sub = sub nsw i32 %tmp2, %tmp1
+  %x5 = getelementptr inbounds %struct.ST2, %struct.ST2* %B, i64 %indvars.iv, i32 0
+  store i32 %add, i32* %x5, align 4
+  %y8 = getelementptr inbounds %struct.ST2, %struct.ST2* %B, i64 %indvars.iv, i32 1
+  store i32 %sub, i32* %y8, align 4
+  %indvars.iv.next = add nsw i64 %indvars.iv, -1
+  %cmp = icmp sgt i64 %indvars.iv, 0
+  br i1 %cmp, label %for.body, label %for.cond.cleanup
+}
+
+; Check vectorization on an interleaved load group of factor 2 with 1 gap
+; (missing the load of odd elements). Because the vectorized loop would
+; speculatively access memory out-of-bounds, we must execute at least one
+; iteration of the scalar loop. (With a static trip count of 512 and VF=4,
+; the vector loop below therefore covers only 508 iterations; the final 4 run
+; in the scalar epilogue, hence the 508 in the exit check.)
+
+; void even_load_static_tc(int *A, int *B) {
+;  for (unsigned i = 0; i < 1024; i+=2)
+;     B[i/2] = A[i] * 2;
+; }
+
+; CHECK-LABEL: @even_load_static_tc(
+; CHECK: vector.body:
+; CHECK:   %wide.vec = load <8 x i32>, <8 x i32>* %{{.*}}, align 4
+; CHECK:   %strided.vec = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK:   icmp eq i64 %index.next, 508
+; CHECK: middle.block:
+; CHECK:   br i1 false, label %for.cond.cleanup, label %scalar.ph
+
+define void @even_load_static_tc(i32* noalias nocapture readonly %A, i32* noalias nocapture %B) {
+entry:
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body
+  ret void
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+  %tmp = load i32, i32* %arrayidx, align 4
+  %mul = shl nsw i32 %tmp, 1
+  %tmp1 = lshr exact i64 %indvars.iv, 1
+  %arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %tmp1
+  store i32 %mul, i32* %arrayidx2, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2
+  %cmp = icmp ult i64 %indvars.iv.next, 1024
+  br i1 %cmp, label %for.body, label %for.cond.cleanup
+}
+
+; Check vectorization on an interleaved load group of factor 2 with 1 gap
+; (missing the load of odd elements). Because the vectorized loop would
+; speculatively access memory out-of-bounds, we must execute at least one
+; iteration of the scalar loop.
+
+; void even_load_dynamic_tc(int *A, int *B, unsigned N) {
+;  for (unsigned i = 0; i < N; i+=2)
+;     B[i/2] = A[i] * 2;
+; }
+
+; CHECK-LABEL: @even_load_dynamic_tc(
+; CHECK: vector.ph:
+; CHECK:   %n.mod.vf = and i64 %[[N:[a-zA-Z0-9]+]], 3
+; CHECK:   %[[IsZero:[a-zA-Z0-9]+]] = icmp eq i64 %n.mod.vf, 0
+; CHECK:   %[[R:[a-zA-Z0-9]+]] = select i1 %[[IsZero]], i64 4, i64 %n.mod.vf
+; CHECK:   %n.vec = sub i64 %[[N]], %[[R]]
+; CHECK: vector.body:
+; CHECK:   %wide.vec = load <8 x i32>, <8 x i32>* %{{.*}}, align 4
+; CHECK:   %strided.vec = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK:   icmp eq i64 %index.next, %n.vec
+; CHECK: middle.block:
+; CHECK:   br i1 false, label %for.cond.cleanup, label %scalar.ph
+
+define void @even_load_dynamic_tc(i32* noalias nocapture readonly %A, i32* noalias nocapture %B, i64 %N) {
+entry:
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body
+  ret void
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+  %tmp = load i32, i32* %arrayidx, align 4
+  %mul = shl nsw i32 %tmp, 1
+  %tmp1 = lshr exact i64 %indvars.iv, 1
+  %arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %tmp1
+  store i32 %mul, i32* %arrayidx2, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2
+  %cmp = icmp ult i64 %indvars.iv.next, %N
+  br i1 %cmp, label %for.body, label %for.cond.cleanup
+}
+
+; Check vectorization on a reverse interleaved load group of factor 2 with 1
+; gap and a reverse interleaved store group of factor 2. The interleaved load
+; group should be removed since it has a gap and is reverse.
+
+; struct pair {
+;  long x;
+;  long y;
+; };
+;
+; void load_gap_reverse(struct pair *P1, struct pair *P2, long X) {
+;   for (long i = 1023; i >= 0; i--) {
+;     long a = X + i;
+;     long b = P2[i].y - i;
+;     P1[i].x = a;
+;     P2[i].y = b;
+;   }
+; }
+
+; CHECK-LABEL: @load_gap_reverse(
+; CHECK-NOT: %wide.vec = load <8 x i64>, <8 x i64>* %{{.*}}, align 8
+; CHECK-NOT: %strided.vec = shufflevector <8 x i64> %wide.vec, <8 x i64> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+
+%pair = type { i64, i64 }
+define void @load_gap_reverse(%pair* noalias nocapture readonly %P1, %pair* noalias nocapture readonly %P2, i64 %X) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 1023, %entry ], [ %i.next, %for.body ]
+  %0 = add nsw i64 %X, %i
+  %1 = getelementptr inbounds %pair, %pair* %P1, i64 %i, i32 0
+  %2 = getelementptr inbounds %pair, %pair* %P2, i64 %i, i32 1
+  %3 = load i64, i64* %2, align 8
+  %4 = sub nsw i64 %3, %i
+  store i64 %0, i64* %1, align 8
+  store i64 %4, i64* %2, align 8
+  %i.next = add nsw i64 %i, -1
+  %cond = icmp sgt i64 %i, 0
+  br i1 %cond, label %for.body, label %for.exit
+
+for.exit:
+  ret void
+}
+
+; Check vectorization on interleaved access groups identified from mixed
+; loads/stores.
+; void mixed_load2_store2(int *A, int *B) {
+;   for (unsigned i = 0; i < 1024; i+=2)  {
+;     B[i] = A[i] * A[i+1];
+;     B[i+1] = A[i] + A[i+1];
+;   }
+; }
+
+; CHECK-LABEL: @mixed_load2_store2(
+; CHECK: %wide.vec = load <8 x i32>, <8 x i32>* {{.*}}, align 4
+; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK: %interleaved.vec = shufflevector <4 x i32> %{{.*}}, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; CHECK: store <8 x i32> %interleaved.vec
+
+define void @mixed_load2_store2(i32* noalias nocapture readonly %A, i32* noalias nocapture %B) {
+entry:
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body
+  ret void
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+  %tmp = load i32, i32* %arrayidx, align 4
+  %tmp1 = or i64 %indvars.iv, 1
+  %arrayidx2 = getelementptr inbounds i32, i32* %A, i64 %tmp1
+  %tmp2 = load i32, i32* %arrayidx2, align 4
+  %mul = mul nsw i32 %tmp2, %tmp
+  %arrayidx4 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
+  store i32 %mul, i32* %arrayidx4, align 4
+  %tmp3 = load i32, i32* %arrayidx, align 4
+  %tmp4 = load i32, i32* %arrayidx2, align 4
+  %add10 = add nsw i32 %tmp4, %tmp3
+  %arrayidx13 = getelementptr inbounds i32, i32* %B, i64 %tmp1
+  store i32 %add10, i32* %arrayidx13, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2
+  %cmp = icmp ult i64 %indvars.iv.next, 1024
+  br i1 %cmp, label %for.body, label %for.cond.cleanup
+}
+
+; Check vectorization on interleaved access groups identified from mixed
+; loads/stores.
+; void mixed_load3_store3(int *A) {
+;   for (unsigned i = 0; i < 1024; i++)  {
+;     *A++ += i;
+;     *A++ += i;
+;     *A++ += i;
+;   }
+; }
+
+; CHECK-LABEL: @mixed_load3_store3(
+; CHECK: %wide.vec = load <12 x i32>, <12 x i32>* {{.*}}, align 4
+; CHECK: shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
+; CHECK: shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
+; CHECK: shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
+; CHECK: %interleaved.vec = shufflevector <8 x i32> %{{.*}}, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
+; CHECK: store <12 x i32> %interleaved.vec, <12 x i32>* %{{.*}}, align 4
+
+define void @mixed_load3_store3(i32* nocapture %A) {
+entry:
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body
+  ret void
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.013 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %A.addr.012 = phi i32* [ %A, %entry ], [ %incdec.ptr3, %for.body ]
+  %incdec.ptr = getelementptr inbounds i32, i32* %A.addr.012, i64 1
+  %tmp = load i32, i32* %A.addr.012, align 4
+  %add = add i32 %tmp, %i.013
+  store i32 %add, i32* %A.addr.012, align 4
+  %incdec.ptr1 = getelementptr inbounds i32, i32* %A.addr.012, i64 2
+  %tmp1 = load i32, i32* %incdec.ptr, align 4
+  %add2 = add i32 %tmp1, %i.013
+  store i32 %add2, i32* %incdec.ptr, align 4
+  %incdec.ptr3 = getelementptr inbounds i32, i32* %A.addr.012, i64 3
+  %tmp2 = load i32, i32* %incdec.ptr1, align 4
+  %add4 = add i32 %tmp2, %i.013
+  store i32 %add4, i32* %incdec.ptr1, align 4
+  %inc = add nuw nsw i32 %i.013, 1
+  %exitcond = icmp eq i32 %inc, 1024
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+; Check vectorization on interleaved access groups whose members have
+; different types.
+
+; struct IntFloat {
+;   int a;
+;   float b;
+; };
+; 
+; int SA;
+; float SB;
+;
+; void int_float_struct(struct IntFloat *A) {
+;   int SumA;
+;   float SumB;
+;   for (unsigned i = 0; i < 1024; i++)  {
+;     SumA += A[i].a;
+;     SumB += A[i].b;
+;   }
+;   SA = SumA;
+;   SB = SumB;
+; }
+
+; CHECK-LABEL: @int_float_struct(
+; CHECK: %wide.vec = load <8 x i32>, <8 x i32>* %{{.*}}, align 4
+; CHECK: %[[V0:.*]] = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK: %[[V1:.*]] = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK: bitcast <4 x i32> %[[V1]] to <4 x float>
+; CHECK: add nsw <4 x i32>
+; CHECK: fadd fast <4 x float>
+
+%struct.IntFloat = type { i32, float }
+
+ at SA = common global i32 0, align 4
+ at SB = common global float 0.000000e+00, align 4
+
+define void @int_float_struct(%struct.IntFloat* nocapture readonly %A) #0 {
+entry:
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body
+  store i32 %add, i32* @SA, align 4
+  store float %add3, float* @SB, align 4
+  ret void
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %SumB.014 = phi float [ undef, %entry ], [ %add3, %for.body ]
+  %SumA.013 = phi i32 [ undef, %entry ], [ %add, %for.body ]
+  %a = getelementptr inbounds %struct.IntFloat, %struct.IntFloat* %A, i64 %indvars.iv, i32 0
+  %tmp = load i32, i32* %a, align 4
+  %add = add nsw i32 %tmp, %SumA.013
+  %b = getelementptr inbounds %struct.IntFloat, %struct.IntFloat* %A, i64 %indvars.iv, i32 1
+  %tmp1 = load float, float* %b, align 4
+  %add3 = fadd fast float %SumB.014, %tmp1
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1024
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+; Check vectorization of interleaved access groups in the presence of
+; dependences (PR27626). The following tests check that we don't reorder
+; dependent loads and stores when generating code for interleaved access
+; groups. Stores should be scalarized because the required code motion would
+; break dependences, and the remaining interleaved load groups should have
+; gaps.
+
+; PR27626_0: Ensure a strided store is not moved after a dependent (zero
+;            distance) strided load.
+
+; void PR27626_0(struct pair *p, int z, int n) {
+;   for (int i = 0; i < n; i++) {
+;     p[i].x = z;
+;     p[i].y = p[i].x;
+;   }
+; }
+
+; CHECK-LABEL: @PR27626_0(
+; CHECK: vector.ph:
+; CHECK:   %n.mod.vf = and i64 %[[N:.+]], 3
+; CHECK:   %[[IsZero:[a-zA-Z0-9]+]] = icmp eq i64 %n.mod.vf, 0
+; CHECK:   %[[R:[a-zA-Z0-9]+]] = select i1 %[[IsZero]], i64 4, i64 %n.mod.vf
+; CHECK:   %n.vec = sub nsw i64 %[[N]], %[[R]]
+; CHECK: vector.body:
+; CHECK:   %[[L1:.+]] = load <8 x i32>, <8 x i32>* {{.*}}
+; CHECK:   %[[X1:.+]] = extractelement <8 x i32> %[[L1]], i32 0
+; CHECK:   store i32 %[[X1]], {{.*}}
+; CHECK:   %[[X2:.+]] = extractelement <8 x i32> %[[L1]], i32 2
+; CHECK:   store i32 %[[X2]], {{.*}}
+; CHECK:   %[[X3:.+]] = extractelement <8 x i32> %[[L1]], i32 4
+; CHECK:   store i32 %[[X3]], {{.*}}
+; CHECK:   %[[X4:.+]] = extractelement <8 x i32> %[[L1]], i32 6
+; CHECK:   store i32 %[[X4]], {{.*}}
+
+%pair.i32 = type { i32, i32 }
+define void @PR27626_0(%pair.i32 *%p, i32 %z, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+  %p_i.x = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 0
+  %p_i.y = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 1
+  store i32 %z, i32* %p_i.x, align 4
+  %0 = load i32, i32* %p_i.x, align 4
+  store i32 %0, i32 *%p_i.y, align 4
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
+
+; PR27626_1: Ensure a strided load is not moved before a dependent (zero
+;            distance) strided store.
+
+; void PR27626_1(struct pair *p, int n) {
+;   int s = 0;
+;   for (int i = 0; i < n; i++) {
+;     p[i].y = p[i].x;
+;     s += p[i].y;
+;   }
+; }
+
+; CHECK-LABEL: @PR27626_1(
+; CHECK: vector.ph:
+; CHECK:   %n.mod.vf = and i64 %[[N:.+]], 3
+; CHECK:   %[[IsZero:[a-zA-Z0-9]+]] = icmp eq i64 %n.mod.vf, 0
+; CHECK:   %[[R:[a-zA-Z0-9]+]] = select i1 %[[IsZero]], i64 4, i64 %n.mod.vf
+; CHECK:   %n.vec = sub nsw i64 %[[N]], %[[R]]
+; CHECK: vector.body:
+; CHECK:   %[[Phi:.+]] = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ {{.*}}, %vector.body ]
+; CHECK:   %[[L1:.+]] = load <8 x i32>, <8 x i32>* {{.*}}
+; CHECK:   %[[X1:.+]] = extractelement <8 x i32> %[[L1]], i32 0
+; CHECK:   store i32 %[[X1]], {{.*}}
+; CHECK:   %[[X2:.+]] = extractelement <8 x i32> %[[L1]], i32 2
+; CHECK:   store i32 %[[X2]], {{.*}}
+; CHECK:   %[[X3:.+]] = extractelement <8 x i32> %[[L1]], i32 4
+; CHECK:   store i32 %[[X3]], {{.*}}
+; CHECK:   %[[X4:.+]] = extractelement <8 x i32> %[[L1]], i32 6
+; CHECK:   store i32 %[[X4]], {{.*}}
+; CHECK:   %[[L2:.+]] = load <8 x i32>, <8 x i32>* {{.*}}
+; CHECK:   %[[S1:.+]] = shufflevector <8 x i32> %[[L2]], <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK:   add nsw <4 x i32> %[[S1]], %[[Phi]]
+
+define i32 @PR27626_1(%pair.i32 *%p, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+  %s = phi i32 [ %2, %for.body ], [ 0, %entry ]
+  %p_i.x = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 0
+  %p_i.y = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 1
+  %0 = load i32, i32* %p_i.x, align 4
+  store i32 %0, i32* %p_i.y, align 4
+  %1 = load i32, i32* %p_i.y, align 4
+  %2 = add nsw i32 %1, %s
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  %3 = phi i32 [ %2, %for.body ]
+  ret i32 %3
+}
+
+; PR27626_2: Ensure a strided store is not moved after a dependent (negative
+;            distance) strided load.
+
+; void PR27626_2(struct pair *p, int z, int n) {
+;   for (int i = 0; i < n; i++) {
+;     p[i].x = z;
+;     p[i].y = p[i - 1].x;
+;   }
+; }
+
+; CHECK-LABEL: @PR27626_2(
+; CHECK: vector.ph:
+; CHECK:   %n.mod.vf = and i64 %[[N:.+]], 3
+; CHECK:   %[[IsZero:[a-zA-Z0-9]+]] = icmp eq i64 %n.mod.vf, 0
+; CHECK:   %[[R:[a-zA-Z0-9]+]] = select i1 %[[IsZero]], i64 4, i64 %n.mod.vf
+; CHECK:   %n.vec = sub nsw i64 %[[N]], %[[R]]
+; CHECK: vector.body:
+; CHECK:   %[[L1:.+]] = load <8 x i32>, <8 x i32>* {{.*}}
+; CHECK:   %[[X1:.+]] = extractelement <8 x i32> %[[L1]], i32 0
+; CHECK:   store i32 %[[X1]], {{.*}}
+; CHECK:   %[[X2:.+]] = extractelement <8 x i32> %[[L1]], i32 2
+; CHECK:   store i32 %[[X2]], {{.*}}
+; CHECK:   %[[X3:.+]] = extractelement <8 x i32> %[[L1]], i32 4
+; CHECK:   store i32 %[[X3]], {{.*}}
+; CHECK:   %[[X4:.+]] = extractelement <8 x i32> %[[L1]], i32 6
+; CHECK:   store i32 %[[X4]], {{.*}}
+
+define void @PR27626_2(%pair.i32 *%p, i64 %n, i32 %z) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+  %i_minus_1 = add nuw nsw i64 %i, -1
+  %p_i.x = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 0
+  %p_i_minus_1.x = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i_minus_1, i32 0
+  %p_i.y = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 1
+  store i32 %z, i32* %p_i.x, align 4
+  %0 = load i32, i32* %p_i_minus_1.x, align 4
+  store i32 %0, i32 *%p_i.y, align 4
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
+
+; PR27626_3: Ensure a strided load is not moved before a dependent (negative
+;            distance) strided store.
+
+; void PR27626_3(struct pair *p, int z, int n) {
+;   int s = 0;
+;   for (int i = 0; i < n; i++) {
+;     p[i + 1].y = p[i].x;
+;     s += p[i].y;
+;   }
+; }
+
+; CHECK-LABEL: @PR27626_3(
+; CHECK: vector.ph:
+; CHECK:   %n.mod.vf = and i64 %[[N:.+]], 3
+; CHECK:   %[[IsZero:[a-zA-Z0-9]+]] = icmp eq i64 %n.mod.vf, 0
+; CHECK:   %[[R:[a-zA-Z0-9]+]] = select i1 %[[IsZero]], i64 4, i64 %n.mod.vf
+; CHECK:   %n.vec = sub nsw i64 %[[N]], %[[R]]
+; CHECK: vector.body:
+; CHECK:   %[[Phi:.+]] = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ {{.*}}, %vector.body ]
+; CHECK:   %[[L1:.+]] = load <8 x i32>, <8 x i32>* {{.*}}
+; CHECK:   %[[X1:.+]] = extractelement <8 x i32> %[[L1]], i32 0
+; CHECK:   store i32 %[[X1]], {{.*}}
+; CHECK:   %[[X2:.+]] = extractelement <8 x i32> %[[L1]], i32 2
+; CHECK:   store i32 %[[X2]], {{.*}}
+; CHECK:   %[[X3:.+]] = extractelement <8 x i32> %[[L1]], i32 4
+; CHECK:   store i32 %[[X3]], {{.*}}
+; CHECK:   %[[X4:.+]] = extractelement <8 x i32> %[[L1]], i32 6
+; CHECK:   store i32 %[[X4]], {{.*}}
+; CHECK:   %[[L2:.+]] = load <8 x i32>, <8 x i32>* {{.*}}
+; CHECK:   %[[S1:.+]] = shufflevector <8 x i32> %[[L2]], <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK:   add nsw <4 x i32> %[[S1]], %[[Phi]]
+
+define i32 @PR27626_3(%pair.i32 *%p, i64 %n, i32 %z) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+  %s = phi i32 [ %2, %for.body ], [ 0, %entry ]
+  %i_plus_1 = add nuw nsw i64 %i, 1
+  %p_i.x = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 0
+  %p_i.y = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 1
+  %p_i_plus_1.y = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i_plus_1, i32 1
+  %0 = load i32, i32* %p_i.x, align 4
+  store i32 %0, i32* %p_i_plus_1.y, align 4
+  %1 = load i32, i32* %p_i.y, align 4
+  %2 = add nsw i32 %1, %s
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  %3 = phi i32 [ %2, %for.body ]
+  ret i32 %3
+}
+
+; PR27626_4: Ensure we form an interleaved group for strided stores in the
+;            presence of a write-after-write dependence. We create a group for
+;            (2) and (3) while excluding (1).
+
+; void PR27626_4(int *a, int x, int y, int z, int n) {
+;   for (int i = 0; i < n; i += 2) {
+;     a[i] = x;      // (1)
+;     a[i] = y;      // (2)
+;     a[i + 1] = z;  // (3)
+;   }
+; }
+
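+; An illustrative sketch (not from the original test) of the expected
+; codegen: (1) cannot join a group with (3) because (2) writes the same
+; address in between, so (1) is scalarized into four stores of %x, while (2)
+; and (3) form a stride-2 store group emitted as one wide interleaved store;
+; for VF=4 the stored value is
+;
+;   <y, z, y, z, y, z, y, z>   ; a single <8 x i32>
+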
+; CHECK-LABEL: @PR27626_4(
+; CHECK: vector.ph:
+; CHECK:   %[[INS_Y:.+]] = insertelement <4 x i32> undef, i32 %y, i32 0
+; CHECK:   %[[SPLAT_Y:.+]] = shufflevector <4 x i32> %[[INS_Y]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK:   %[[INS_Z:.+]] = insertelement <4 x i32> undef, i32 %z, i32 0
+; CHECK:   %[[SPLAT_Z:.+]] = shufflevector <4 x i32> %[[INS_Z]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK: vector.body:
+; CHECK:   store i32 %x, {{.*}}
+; CHECK:   store i32 %x, {{.*}}
+; CHECK:   store i32 %x, {{.*}}
+; CHECK:   store i32 %x, {{.*}}
+; CHECK:   %[[VEC:.+]] = shufflevector <4 x i32> %[[SPLAT_Y]], <4 x i32> %[[SPLAT_Z]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; CHECK:   store <8 x i32> %[[VEC]], {{.*}}
+
+define void @PR27626_4(i32 *%a, i32 %x, i32 %y, i32 %z, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+  %i_plus_1 = add i64 %i, 1
+  %a_i = getelementptr inbounds i32, i32* %a, i64 %i
+  %a_i_plus_1 = getelementptr inbounds i32, i32* %a, i64 %i_plus_1
+  store i32 %x, i32* %a_i, align 4
+  store i32 %y, i32* %a_i, align 4
+  store i32 %z, i32* %a_i_plus_1, align 4
+  %i.next = add nuw nsw i64 %i, 2
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
+
+; PR27626_5: Ensure we do not form an interleaved group for strided stores in
+;            the presence of a write-after-write dependence.
+
+; void PR27626_5(int *a, int x, int y, int z, int n) {
+;   for (int i = 3; i < n; i += 2) {
+;     a[i - 1] = x;
+;     a[i - 3] = y;
+;     a[i] = z;
+;   }
+; }
+
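+; Illustrative reasoning (not part of the original test): with the stride-2
+; induction, the store to a[i - 3] overwrites the a[i - 1] store made two
+; iterations earlier (a[(i - 2) - 1] == a[i - 3]), a cross-iteration
+; write-after-write dependence. Grouping the strided stores would move them
+; across that dependence, so all three are scalarized; the twelve scalar
+; stores below (four each of %x, %y and %z at VF=4) check exactly that.
+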
+; CHECK-LABEL: @PR27626_5(
+; CHECK: vector.body:
+; CHECK:   store i32 %x, {{.*}}
+; CHECK:   store i32 %x, {{.*}}
+; CHECK:   store i32 %x, {{.*}}
+; CHECK:   store i32 %x, {{.*}}
+; CHECK:   store i32 %y, {{.*}}
+; CHECK:   store i32 %y, {{.*}}
+; CHECK:   store i32 %y, {{.*}}
+; CHECK:   store i32 %y, {{.*}}
+; CHECK:   store i32 %z, {{.*}}
+; CHECK:   store i32 %z, {{.*}}
+; CHECK:   store i32 %z, {{.*}}
+; CHECK:   store i32 %z, {{.*}}
+
+define void @PR27626_5(i32 *%a, i32 %x, i32 %y, i32 %z, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ %i.next, %for.body ], [ 3, %entry ]
+  %i_minus_1 = sub i64 %i, 1
+  %i_minus_3 = sub i64 %i_minus_1, 2
+  %a_i = getelementptr inbounds i32, i32* %a, i64 %i
+  %a_i_minus_1 = getelementptr inbounds i32, i32* %a, i64 %i_minus_1
+  %a_i_minus_3 = getelementptr inbounds i32, i32* %a, i64 %i_minus_3
+  store i32 %x, i32* %a_i_minus_1, align 4
+  store i32 %y, i32* %a_i_minus_3, align 4
+  store i32 %z, i32* %a_i, align 4
+  %i.next = add nuw nsw i64 %i, 2
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
+
+; PR34743: Ensure that a cast that needs to sink after a load belonging to an
+; interleaved group indeed gets sunk.
+
+; void PR34743(short *a, int *b, int n) {
+;   for (int i = 0, iv = 0; iv < n; i++, iv += 2) {
+;     b[i] = a[iv] * a[iv+1] * a[iv+2];
+;   }
+; }
+
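+; Illustrative note (added for exposition): the a[iv] read in one iteration
+; is the a[iv + 2] loaded by the previous iteration, so %0 below is a
+; first-order recurrence fed by %load2. Its sext is initially placed before
+; the loads and must be sunk below the interleaved wide load, so that the
+; recurrence shuffle of %vector.recur against the odd lanes can feed the
+; widened sext; the CHECK lines verify exactly this ordering.
+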
+; CHECK-LABEL: @PR34743(
+; CHECK: vector.body:
+; CHECK:   %vector.recur = phi <4 x i16> [ %vector.recur.init, %vector.ph ], [ %[[VSHUF1:.+]], %vector.body ]
+; CHECK:   %wide.vec = load <8 x i16>
+; CHECK:   %[[VSHUF0:.+]] = shufflevector <8 x i16> %wide.vec, <8 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK:   %[[VSHUF1:.+]] = shufflevector <8 x i16> %wide.vec, <8 x i16> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK:   %[[VSHUF:.+]] = shufflevector <4 x i16> %vector.recur, <4 x i16> %[[VSHUF1]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; CHECK:   sext <4 x i16> %[[VSHUF0]] to <4 x i32>
+; CHECK:   sext <4 x i16> %[[VSHUF]] to <4 x i32>
+; CHECK:   sext <4 x i16> %[[VSHUF1]] to <4 x i32>
+; CHECK:   mul nsw <4 x i32>
+; CHECK:   mul nsw <4 x i32>
+
+define void @PR34743(i16* %a, i32* %b, i64 %n) {
+entry:
+  %.pre = load i16, i16* %a
+  br label %loop
+
+loop:
+  %0 = phi i16 [ %.pre, %entry ], [ %load2, %loop ]
+  %iv = phi i64 [ 0, %entry ], [ %iv2, %loop ]
+  %i = phi i64 [ 0, %entry ], [ %i1, %loop ]
+  %conv = sext i16 %0 to i32
+  %i1 = add nuw nsw i64 %i, 1
+  %iv1 = add nuw nsw i64 %iv, 1
+  %iv2 = add nuw nsw i64 %iv, 2
+  %gep1 = getelementptr inbounds i16, i16* %a, i64 %iv1
+  %load1 = load i16, i16* %gep1, align 4
+  %conv1 = sext i16 %load1 to i32
+  %gep2 = getelementptr inbounds i16, i16* %a, i64 %iv2
+  %load2 = load i16, i16* %gep2, align 4
+  %conv2 = sext i16 %load2 to i32
+  %mul01 = mul nsw i32 %conv, %conv1
+  %mul012 = mul nsw i32 %mul01, %conv2
+  %arrayidx5 = getelementptr inbounds i32, i32* %b, i64 %i
+  store i32 %mul012, i32* %arrayidx5
+  %exitcond = icmp eq i64 %iv, %n
+  br i1 %exitcond, label %end, label %loop
+
+end:
+  ret void
+}
+
+attributes #0 = { "unsafe-fp-math"="true" }

Added: llvm/trunk/test/Transforms/LoopVectorize/interleaved-acess-with-remarks.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/interleaved-acess-with-remarks.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/interleaved-acess-with-remarks.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/interleaved-acess-with-remarks.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,43 @@
+; RUN: opt -S -loop-vectorize -instcombine -force-vector-width=4 -force-vector-interleave=1 -enable-interleaved-mem-accesses=true -runtime-memory-check-threshold=24 --pass-remarks=loop-vectorize < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+
+
+; This test only checks that asking for remarks does not cause the compiler
+; to crash (or time out), so we simply check for output. As a sanity check,
+; we also verify that the loop was not vectorized.
+; CHECK-LABEL: @atomicLoadsBothWriteAndReadMem
+; CHECK-NOT: <{{[0-9]+}} x i8>
+
+%"struct.std::__atomic_base" = type { i32 }
+%"struct.std::atomic" = type { %"struct.std::__atomic_base" }
+%union.anon = type { i64 }
+%MyStruct = type { i32, %"struct.std::atomic", %union.anon }
+
+define void @atomicLoadsBothWriteAndReadMem(%MyStruct *%a, %MyStruct *%b, %MyStruct *%lim) {
+entry:
+  br label %loop
+
+loop:
+  %0 = phi %MyStruct* [ %a, %entry ], [ %ainc, %loop ]
+  %1 = phi %MyStruct* [ %b, %entry ], [ %binc, %loop ]
+  %2 = getelementptr %MyStruct, %MyStruct* %1, i64 0, i32 0
+  %3 = load i32, i32* %2, align 8
+  %4 = getelementptr inbounds %MyStruct, %MyStruct* %0, i64 0, i32 0
+  store i32 %3, i32* %4, align 8
+  %5 = getelementptr inbounds %MyStruct, %MyStruct* %1, i64 0, i32 1, i32 0, i32 0
+  %6 = load atomic i32, i32* %5 monotonic, align 4
+  %7 = getelementptr inbounds %MyStruct, %MyStruct* %0, i64 0, i32 1, i32 0, i32 0
+  store atomic i32 %6, i32* %7 monotonic, align 4
+  %8 = getelementptr inbounds %MyStruct, %MyStruct* %1, i64 0, i32 2, i32 0
+  %9 = getelementptr inbounds %MyStruct, %MyStruct* %0, i64 0, i32 2, i32 0
+  %10 = load i64, i64* %8, align 8
+  store i64 %10, i64* %9, align 8
+  %binc = getelementptr inbounds %MyStruct, %MyStruct* %1, i64 1
+  %ainc = getelementptr inbounds %MyStruct, %MyStruct* %0, i64 1
+  %cond = icmp eq %MyStruct* %binc, %lim
+  br i1 %cond, label %exit, label %loop
+
+exit:
+  ret void
+}

Added: llvm/trunk/test/Transforms/LoopVectorize/intrinsic.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/intrinsic.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/intrinsic.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/intrinsic.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,1357 @@
+; RUN: opt < %s  -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+;CHECK-LABEL: @sqrt_f32(
+;CHECK: llvm.sqrt.v4f32
+;CHECK: ret void
+define void @sqrt_f32(i32 %n, float* noalias %y, float* noalias %x) nounwind uwtable {
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds float, float* %y, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %call = tail call float @llvm.sqrt.f32(float %0) nounwind readnone
+  %arrayidx2 = getelementptr inbounds float, float* %x, i64 %indvars.iv
+  store float %call, float* %arrayidx2, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+declare float @llvm.sqrt.f32(float) nounwind readnone
+
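+; All tests in this file follow the same pattern; in C terms (an illustrative
+; reconstruction, not the original source) the f32 variants correspond to:
+;
+;   void sqrt_f32(int n, float * restrict y, float * restrict x) {
+;     for (int i = 0; i < n; i++)
+;       x[i] = sqrtf(y[i]);
+;   }
+;
+; and each CHECK verifies that the scalar intrinsic call is widened to its
+; 4-lane vector form (e.g. llvm.sqrt.f32 -> llvm.sqrt.v4f32) at VF=4.
+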
+;CHECK-LABEL: @sqrt_f64(
+;CHECK: llvm.sqrt.v4f64
+;CHECK: ret void
+define void @sqrt_f64(i32 %n, double* noalias %y, double* noalias %x) nounwind uwtable {
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds double, double* %y, i64 %indvars.iv
+  %0 = load double, double* %arrayidx, align 8
+  %call = tail call double @llvm.sqrt.f64(double %0) nounwind readnone
+  %arrayidx2 = getelementptr inbounds double, double* %x, i64 %indvars.iv
+  store double %call, double* %arrayidx2, align 8
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+declare double @llvm.sqrt.f64(double) nounwind readnone
+
+;CHECK-LABEL: @sin_f32(
+;CHECK: llvm.sin.v4f32
+;CHECK: ret void
+define void @sin_f32(i32 %n, float* noalias %y, float* noalias %x) nounwind uwtable {
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds float, float* %y, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %call = tail call float @llvm.sin.f32(float %0) nounwind readnone
+  %arrayidx2 = getelementptr inbounds float, float* %x, i64 %indvars.iv
+  store float %call, float* %arrayidx2, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+declare float @llvm.sin.f32(float) nounwind readnone
+
+;CHECK-LABEL: @sin_f64(
+;CHECK: llvm.sin.v4f64
+;CHECK: ret void
+define void @sin_f64(i32 %n, double* noalias %y, double* noalias %x) nounwind uwtable {
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds double, double* %y, i64 %indvars.iv
+  %0 = load double, double* %arrayidx, align 8
+  %call = tail call double @llvm.sin.f64(double %0) nounwind readnone
+  %arrayidx2 = getelementptr inbounds double, double* %x, i64 %indvars.iv
+  store double %call, double* %arrayidx2, align 8
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+declare double @llvm.sin.f64(double) nounwind readnone
+
+;CHECK-LABEL: @cos_f32(
+;CHECK: llvm.cos.v4f32
+;CHECK: ret void
+define void @cos_f32(i32 %n, float* noalias %y, float* noalias %x) nounwind uwtable {
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds float, float* %y, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %call = tail call float @llvm.cos.f32(float %0) nounwind readnone
+  %arrayidx2 = getelementptr inbounds float, float* %x, i64 %indvars.iv
+  store float %call, float* %arrayidx2, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+declare float @llvm.cos.f32(float) nounwind readnone
+
+;CHECK-LABEL: @cos_f64(
+;CHECK: llvm.cos.v4f64
+;CHECK: ret void
+define void @cos_f64(i32 %n, double* noalias %y, double* noalias %x) nounwind uwtable {
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds double, double* %y, i64 %indvars.iv
+  %0 = load double, double* %arrayidx, align 8
+  %call = tail call double @llvm.cos.f64(double %0) nounwind readnone
+  %arrayidx2 = getelementptr inbounds double, double* %x, i64 %indvars.iv
+  store double %call, double* %arrayidx2, align 8
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+declare double @llvm.cos.f64(double) nounwind readnone
+
+;CHECK-LABEL: @exp_f32(
+;CHECK: llvm.exp.v4f32
+;CHECK: ret void
+define void @exp_f32(i32 %n, float* noalias %y, float* noalias %x) nounwind uwtable {
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds float, float* %y, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %call = tail call float @llvm.exp.f32(float %0) nounwind readnone
+  %arrayidx2 = getelementptr inbounds float, float* %x, i64 %indvars.iv
+  store float %call, float* %arrayidx2, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+declare float @llvm.exp.f32(float) nounwind readnone
+
+;CHECK-LABEL: @exp_f64(
+;CHECK: llvm.exp.v4f64
+;CHECK: ret void
+define void @exp_f64(i32 %n, double* noalias %y, double* noalias %x) nounwind uwtable {
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds double, double* %y, i64 %indvars.iv
+  %0 = load double, double* %arrayidx, align 8
+  %call = tail call double @llvm.exp.f64(double %0) nounwind readnone
+  %arrayidx2 = getelementptr inbounds double, double* %x, i64 %indvars.iv
+  store double %call, double* %arrayidx2, align 8
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+declare double @llvm.exp.f64(double) nounwind readnone
+
+;CHECK-LABEL: @exp2_f32(
+;CHECK: llvm.exp2.v4f32
+;CHECK: ret void
+define void @exp2_f32(i32 %n, float* noalias %y, float* noalias %x) nounwind uwtable {
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds float, float* %y, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %call = tail call float @llvm.exp2.f32(float %0) nounwind readnone
+  %arrayidx2 = getelementptr inbounds float, float* %x, i64 %indvars.iv
+  store float %call, float* %arrayidx2, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+declare float @llvm.exp2.f32(float) nounwind readnone
+
+;CHECK-LABEL: @exp2_f64(
+;CHECK: llvm.exp2.v4f64
+;CHECK: ret void
+define void @exp2_f64(i32 %n, double* noalias %y, double* noalias %x) nounwind uwtable {
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds double, double* %y, i64 %indvars.iv
+  %0 = load double, double* %arrayidx, align 8
+  %call = tail call double @llvm.exp2.f64(double %0) nounwind readnone
+  %arrayidx2 = getelementptr inbounds double, double* %x, i64 %indvars.iv
+  store double %call, double* %arrayidx2, align 8
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+declare double @llvm.exp2.f64(double) nounwind readnone
+
+;CHECK-LABEL: @log_f32(
+;CHECK: llvm.log.v4f32
+;CHECK: ret void
+define void @log_f32(i32 %n, float* noalias %y, float* noalias %x) nounwind uwtable {
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds float, float* %y, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %call = tail call float @llvm.log.f32(float %0) nounwind readnone
+  %arrayidx2 = getelementptr inbounds float, float* %x, i64 %indvars.iv
+  store float %call, float* %arrayidx2, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+declare float @llvm.log.f32(float) nounwind readnone
+
+;CHECK-LABEL: @log_f64(
+;CHECK: llvm.log.v4f64
+;CHECK: ret void
+define void @log_f64(i32 %n, double* noalias %y, double* noalias %x) nounwind uwtable {
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds double, double* %y, i64 %indvars.iv
+  %0 = load double, double* %arrayidx, align 8
+  %call = tail call double @llvm.log.f64(double %0) nounwind readnone
+  %arrayidx2 = getelementptr inbounds double, double* %x, i64 %indvars.iv
+  store double %call, double* %arrayidx2, align 8
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+declare double @llvm.log.f64(double) nounwind readnone
+
+;CHECK-LABEL: @log10_f32(
+;CHECK: llvm.log10.v4f32
+;CHECK: ret void
+define void @log10_f32(i32 %n, float* noalias %y, float* noalias %x) nounwind uwtable {
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds float, float* %y, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %call = tail call float @llvm.log10.f32(float %0) nounwind readnone
+  %arrayidx2 = getelementptr inbounds float, float* %x, i64 %indvars.iv
+  store float %call, float* %arrayidx2, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+declare float @llvm.log10.f32(float) nounwind readnone
+
+;CHECK-LABEL: @log10_f64(
+;CHECK: llvm.log10.v4f64
+;CHECK: ret void
+define void @log10_f64(i32 %n, double* noalias %y, double* noalias %x) nounwind uwtable {
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds double, double* %y, i64 %indvars.iv
+  %0 = load double, double* %arrayidx, align 8
+  %call = tail call double @llvm.log10.f64(double %0) nounwind readnone
+  %arrayidx2 = getelementptr inbounds double, double* %x, i64 %indvars.iv
+  store double %call, double* %arrayidx2, align 8
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+declare double @llvm.log10.f64(double) nounwind readnone
+
+;CHECK-LABEL: @log2_f32(
+;CHECK: llvm.log2.v4f32
+;CHECK: ret void
+define void @log2_f32(i32 %n, float* noalias %y, float* noalias %x) nounwind uwtable {
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds float, float* %y, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %call = tail call float @llvm.log2.f32(float %0) nounwind readnone
+  %arrayidx2 = getelementptr inbounds float, float* %x, i64 %indvars.iv
+  store float %call, float* %arrayidx2, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+declare float @llvm.log2.f32(float) nounwind readnone
+
+;CHECK-LABEL: @log2_f64(
+;CHECK: llvm.log2.v4f64
+;CHECK: ret void
+define void @log2_f64(i32 %n, double* noalias %y, double* noalias %x) nounwind uwtable {
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds double, double* %y, i64 %indvars.iv
+  %0 = load double, double* %arrayidx, align 8
+  %call = tail call double @llvm.log2.f64(double %0) nounwind readnone
+  %arrayidx2 = getelementptr inbounds double, double* %x, i64 %indvars.iv
+  store double %call, double* %arrayidx2, align 8
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+declare double @llvm.log2.f64(double) nounwind readnone
+
+;CHECK-LABEL: @fabs_f32(
+;CHECK: llvm.fabs.v4f32
+;CHECK: ret void
+define void @fabs_f32(i32 %n, float* noalias %y, float* noalias %x) nounwind uwtable {
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds float, float* %y, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %call = tail call float @llvm.fabs.f32(float %0) nounwind readnone
+  %arrayidx2 = getelementptr inbounds float, float* %x, i64 %indvars.iv
+  store float %call, float* %arrayidx2, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+declare float @llvm.fabs.f32(float) nounwind readnone
+
+define void @fabs_f64(i32 %n, double* noalias %y, double* noalias %x) nounwind uwtable {
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds double, double* %y, i64 %indvars.iv
+  %0 = load double, double* %arrayidx, align 8
+  %call = tail call double @llvm.fabs(double %0) nounwind readnone
+  %arrayidx2 = getelementptr inbounds double, double* %x, i64 %indvars.iv
+  store double %call, double* %arrayidx2, align 8
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+declare double @llvm.fabs(double) nounwind readnone
+
+;CHECK-LABEL: @copysign_f32(
+;CHECK: llvm.copysign.v4f32
+;CHECK: ret void
+define void @copysign_f32(i32 %n, float* noalias %y, float* noalias %x, float* noalias %z) nounwind uwtable {
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds float, float* %y, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds float, float* %z, i64 %indvars.iv
+  %1 = load float, float* %arrayidx1, align 4
+  %call = tail call float @llvm.copysign.f32(float %0, float %1) nounwind readnone
+  %arrayidx2 = getelementptr inbounds float, float* %x, i64 %indvars.iv
+  store float %call, float* %arrayidx2, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+declare float @llvm.copysign.f32(float, float) nounwind readnone
+
+define void @copysign_f64(i32 %n, double* noalias %y, double* noalias %x, double* noalias %z) nounwind uwtable {
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds double, double* %y, i64 %indvars.iv
+  %0 = load double, double* %arrayidx, align 8
+  %arrayidx1 = getelementptr inbounds double, double* %z, i64 %indvars.iv
+  %1 = load double, double* %arrayidx1, align 8
+  %call = tail call double @llvm.copysign(double %0, double %1) nounwind readnone
+  %arrayidx2 = getelementptr inbounds double, double* %x, i64 %indvars.iv
+  store double %call, double* %arrayidx2, align 8
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+declare double @llvm.copysign(double, double) nounwind readnone
+
+;CHECK-LABEL: @floor_f32(
+;CHECK: llvm.floor.v4f32
+;CHECK: ret void
+define void @floor_f32(i32 %n, float* noalias %y, float* noalias %x) nounwind uwtable {
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds float, float* %y, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %call = tail call float @llvm.floor.f32(float %0) nounwind readnone
+  %arrayidx2 = getelementptr inbounds float, float* %x, i64 %indvars.iv
+  store float %call, float* %arrayidx2, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+declare float @llvm.floor.f32(float) nounwind readnone
+
+;CHECK-LABEL: @floor_f64(
+;CHECK: llvm.floor.v4f64
+;CHECK: ret void
+define void @floor_f64(i32 %n, double* noalias %y, double* noalias %x) nounwind uwtable {
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds double, double* %y, i64 %indvars.iv
+  %0 = load double, double* %arrayidx, align 8
+  %call = tail call double @llvm.floor.f64(double %0) nounwind readnone
+  %arrayidx2 = getelementptr inbounds double, double* %x, i64 %indvars.iv
+  store double %call, double* %arrayidx2, align 8
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+declare double @llvm.floor.f64(double) nounwind readnone
+
+;CHECK-LABEL: @ceil_f32(
+;CHECK: llvm.ceil.v4f32
+;CHECK: ret void
+define void @ceil_f32(i32 %n, float* noalias %y, float* noalias %x) nounwind uwtable {
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds float, float* %y, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %call = tail call float @llvm.ceil.f32(float %0) nounwind readnone
+  %arrayidx2 = getelementptr inbounds float, float* %x, i64 %indvars.iv
+  store float %call, float* %arrayidx2, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+declare float @llvm.ceil.f32(float) nounwind readnone
+
+;CHECK-LABEL: @ceil_f64(
+;CHECK: llvm.ceil.v4f64
+;CHECK: ret void
+define void @ceil_f64(i32 %n, double* noalias %y, double* noalias %x) nounwind uwtable {
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds double, double* %y, i64 %indvars.iv
+  %0 = load double, double* %arrayidx, align 8
+  %call = tail call double @llvm.ceil.f64(double %0) nounwind readnone
+  %arrayidx2 = getelementptr inbounds double, double* %x, i64 %indvars.iv
+  store double %call, double* %arrayidx2, align 8
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+declare double @llvm.ceil.f64(double) nounwind readnone
+
+;CHECK-LABEL: @trunc_f32(
+;CHECK: llvm.trunc.v4f32
+;CHECK: ret void
+define void @trunc_f32(i32 %n, float* noalias %y, float* noalias %x) nounwind uwtable {
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds float, float* %y, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %call = tail call float @llvm.trunc.f32(float %0) nounwind readnone
+  %arrayidx2 = getelementptr inbounds float, float* %x, i64 %indvars.iv
+  store float %call, float* %arrayidx2, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+declare float @llvm.trunc.f32(float) nounwind readnone
+
+;CHECK-LABEL: @trunc_f64(
+;CHECK: llvm.trunc.v4f64
+;CHECK: ret void
+define void @trunc_f64(i32 %n, double* noalias %y, double* noalias %x) nounwind uwtable {
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds double, double* %y, i64 %indvars.iv
+  %0 = load double, double* %arrayidx, align 8
+  %call = tail call double @llvm.trunc.f64(double %0) nounwind readnone
+  %arrayidx2 = getelementptr inbounds double, double* %x, i64 %indvars.iv
+  store double %call, double* %arrayidx2, align 8
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+declare double @llvm.trunc.f64(double) nounwind readnone
+
+;CHECK-LABEL: @rint_f32(
+;CHECK: llvm.rint.v4f32
+;CHECK: ret void
+define void @rint_f32(i32 %n, float* noalias %y, float* noalias %x) nounwind uwtable {
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds float, float* %y, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %call = tail call float @llvm.rint.f32(float %0) nounwind readnone
+  %arrayidx2 = getelementptr inbounds float, float* %x, i64 %indvars.iv
+  store float %call, float* %arrayidx2, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+declare float @llvm.rint.f32(float) nounwind readnone
+
+;CHECK-LABEL: @rint_f64(
+;CHECK: llvm.rint.v4f64
+;CHECK: ret void
+define void @rint_f64(i32 %n, double* noalias %y, double* noalias %x) nounwind uwtable {
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds double, double* %y, i64 %indvars.iv
+  %0 = load double, double* %arrayidx, align 8
+  %call = tail call double @llvm.rint.f64(double %0) nounwind readnone
+  %arrayidx2 = getelementptr inbounds double, double* %x, i64 %indvars.iv
+  store double %call, double* %arrayidx2, align 8
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+declare double @llvm.rint.f64(double) nounwind readnone
+
+;CHECK-LABEL: @nearbyint_f32(
+;CHECK: llvm.nearbyint.v4f32
+;CHECK: ret void
+define void @nearbyint_f32(i32 %n, float* noalias %y, float* noalias %x) nounwind uwtable {
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds float, float* %y, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %call = tail call float @llvm.nearbyint.f32(float %0) nounwind readnone
+  %arrayidx2 = getelementptr inbounds float, float* %x, i64 %indvars.iv
+  store float %call, float* %arrayidx2, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+declare float @llvm.nearbyint.f32(float) nounwind readnone
+
+;CHECK-LABEL: @nearbyint_f64(
+;CHECK: llvm.nearbyint.v4f64
+;CHECK: ret void
+define void @nearbyint_f64(i32 %n, double* noalias %y, double* noalias %x) nounwind uwtable {
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds double, double* %y, i64 %indvars.iv
+  %0 = load double, double* %arrayidx, align 8
+  %call = tail call double @llvm.nearbyint.f64(double %0) nounwind readnone
+  %arrayidx2 = getelementptr inbounds double, double* %x, i64 %indvars.iv
+  store double %call, double* %arrayidx2, align 8
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+declare double @llvm.nearbyint.f64(double) nounwind readnone
+
+;CHECK-LABEL: @round_f32(
+;CHECK: llvm.round.v4f32
+;CHECK: ret void
+define void @round_f32(i32 %n, float* noalias %y, float* noalias %x) nounwind uwtable {
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds float, float* %y, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %call = tail call float @llvm.round.f32(float %0) nounwind readnone
+  %arrayidx2 = getelementptr inbounds float, float* %x, i64 %indvars.iv
+  store float %call, float* %arrayidx2, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+declare float @llvm.round.f32(float) nounwind readnone
+
+;CHECK-LABEL: @round_f64(
+;CHECK: llvm.round.v4f64
+;CHECK: ret void
+define void @round_f64(i32 %n, double* noalias %y, double* noalias %x) nounwind uwtable {
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds double, double* %y, i64 %indvars.iv
+  %0 = load double, double* %arrayidx, align 8
+  %call = tail call double @llvm.round.f64(double %0) nounwind readnone
+  %arrayidx2 = getelementptr inbounds double, double* %x, i64 %indvars.iv
+  store double %call, double* %arrayidx2, align 8
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+declare double @llvm.round.f64(double) nounwind readnone
+
+;CHECK-LABEL: @fma_f32(
+;CHECK: llvm.fma.v4f32
+;CHECK: ret void
+define void @fma_f32(i32 %n, float* noalias %y, float* noalias %x, float* noalias %z, float* noalias %w) nounwind uwtable {
+entry:
+  %cmp12 = icmp sgt i32 %n, 0
+  br i1 %cmp12, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds float, float* %y, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds float, float* %w, i64 %indvars.iv
+  %1 = load float, float* %arrayidx2, align 4
+  %arrayidx4 = getelementptr inbounds float, float* %z, i64 %indvars.iv
+  %2 = load float, float* %arrayidx4, align 4
+  %3 = tail call float @llvm.fma.f32(float %0, float %2, float %1)
+  %arrayidx6 = getelementptr inbounds float, float* %x, i64 %indvars.iv
+  store float %3, float* %arrayidx6, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+declare float @llvm.fma.f32(float, float, float) nounwind readnone
+
+;CHECK-LABEL: @fma_f64(
+;CHECK: llvm.fma.v4f64
+;CHECK: ret void
+define void @fma_f64(i32 %n, double* noalias %y, double* noalias %x, double* noalias %z, double* noalias %w) nounwind uwtable {
+entry:
+  %cmp12 = icmp sgt i32 %n, 0
+  br i1 %cmp12, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds double, double* %y, i64 %indvars.iv
+  %0 = load double, double* %arrayidx, align 8
+  %arrayidx2 = getelementptr inbounds double, double* %w, i64 %indvars.iv
+  %1 = load double, double* %arrayidx2, align 8
+  %arrayidx4 = getelementptr inbounds double, double* %z, i64 %indvars.iv
+  %2 = load double, double* %arrayidx4, align 8
+  %3 = tail call double @llvm.fma.f64(double %0, double %2, double %1)
+  %arrayidx6 = getelementptr inbounds double, double* %x, i64 %indvars.iv
+  store double %3, double* %arrayidx6, align 8
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+declare double @llvm.fma.f64(double, double, double) nounwind readnone
+
+;CHECK-LABEL: @fmuladd_f32(
+;CHECK: llvm.fmuladd.v4f32
+;CHECK: ret void
+define void @fmuladd_f32(i32 %n, float* noalias %y, float* noalias %x, float* noalias %z, float* noalias %w) nounwind uwtable {
+entry:
+  %cmp12 = icmp sgt i32 %n, 0
+  br i1 %cmp12, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds float, float* %y, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds float, float* %w, i64 %indvars.iv
+  %1 = load float, float* %arrayidx2, align 4
+  %arrayidx4 = getelementptr inbounds float, float* %z, i64 %indvars.iv
+  %2 = load float, float* %arrayidx4, align 4
+  %3 = tail call float @llvm.fmuladd.f32(float %0, float %2, float %1)
+  %arrayidx6 = getelementptr inbounds float, float* %x, i64 %indvars.iv
+  store float %3, float* %arrayidx6, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+declare float @llvm.fmuladd.f32(float, float, float) nounwind readnone
+
+;CHECK-LABEL: @fmuladd_f64(
+;CHECK: llvm.fmuladd.v4f64
+;CHECK: ret void
+define void @fmuladd_f64(i32 %n, double* noalias %y, double* noalias %x, double* noalias %z, double* noalias %w) nounwind uwtable {
+entry:
+  %cmp12 = icmp sgt i32 %n, 0
+  br i1 %cmp12, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds double, double* %y, i64 %indvars.iv
+  %0 = load double, double* %arrayidx, align 8
+  %arrayidx2 = getelementptr inbounds double, double* %w, i64 %indvars.iv
+  %1 = load double, double* %arrayidx2, align 8
+  %arrayidx4 = getelementptr inbounds double, double* %z, i64 %indvars.iv
+  %2 = load double, double* %arrayidx4, align 8
+  %3 = tail call double @llvm.fmuladd.f64(double %0, double %2, double %1)
+  %arrayidx6 = getelementptr inbounds double, double* %x, i64 %indvars.iv
+  store double %3, double* %arrayidx6, align 8
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+declare double @llvm.fmuladd.f64(double, double, double) nounwind readnone
+
+;CHECK-LABEL: @pow_f32(
+;CHECK: llvm.pow.v4f32
+;CHECK: ret void
+define void @pow_f32(i32 %n, float* noalias %y, float* noalias %x, float* noalias %z) nounwind uwtable {
+entry:
+  %cmp9 = icmp sgt i32 %n, 0
+  br i1 %cmp9, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds float, float* %y, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds float, float* %z, i64 %indvars.iv
+  %1 = load float, float* %arrayidx2, align 4
+  %call = tail call float @llvm.pow.f32(float %0, float %1) nounwind readnone
+  %arrayidx4 = getelementptr inbounds float, float* %x, i64 %indvars.iv
+  store float %call, float* %arrayidx4, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+declare float @llvm.pow.f32(float, float) nounwind readnone
+
+;CHECK-LABEL: @pow_f64(
+;CHECK: llvm.pow.v4f64
+;CHECK: ret void
+define void @pow_f64(i32 %n, double* noalias %y, double* noalias %x, double* noalias %z) nounwind uwtable {
+entry:
+  %cmp9 = icmp sgt i32 %n, 0
+  br i1 %cmp9, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds double, double* %y, i64 %indvars.iv
+  %0 = load double, double* %arrayidx, align 8
+  %arrayidx2 = getelementptr inbounds double, double* %z, i64 %indvars.iv
+  %1 = load double, double* %arrayidx2, align 8
+  %call = tail call double @llvm.pow.f64(double %0, double %1) nounwind readnone
+  %arrayidx4 = getelementptr inbounds double, double* %x, i64 %indvars.iv
+  store double %call, double* %arrayidx4, align 8
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+;CHECK-LABEL: @fabs_libm(
+;CHECK: call <4 x float> @llvm.fabs.v4f32
+;CHECK: ret void
+define void @fabs_libm(float* nocapture %x) nounwind {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %x, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %call = tail call float @fabsf(float %0) nounwind readnone
+  store float %call, float* %arrayidx, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+declare float @fabsf(float) nounwind readnone
+
+declare double @llvm.pow.f64(double, double) nounwind readnone
+
+
+
+; Make sure we don't replace calls to functions that have standard library
+; signatures but are defined with internal linkage.
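+;
+; In rough C terms (an illustrative sketch, not taken from the original
+; source), the definition below corresponds to:
+;
+;   static float roundf(float x) { return 0.0f; }  /* internal linkage */
+;
+; so calls to it must not be treated as calls to the libm roundf.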
+
+define internal float @roundf(float %x) nounwind readnone {
+  ret float 0.00000000
+}
+; CHECK-LABEL: internal_round
+; CHECK-NOT:  load <4 x float>
+
+define void @internal_round(float* nocapture %x) nounwind {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %x, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %call = tail call float @roundf(float %0) nounwind readnone
+  store float %call, float* %arrayidx, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+; Make sure we don't replace calls to functions with standard library names but
+; different signatures.
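+;
+; In rough C terms (illustrative), the declaration below is
+;
+;   void round(double);
+;
+; whereas libm declares "double round(double)", so the call must be left
+; scalar.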
+
+declare void @round(double %f)
+
+; CHECK-LABEL: wrong_signature
+; CHECK-NOT:  load <4 x double>
+
+define void @wrong_signature(double* nocapture %x) nounwind {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds double, double* %x, i64 %indvars.iv
+  %0 = load double, double* %arrayidx, align 4
+  store double %0, double* %arrayidx, align 4
+  tail call void @round(double %0) nounwind readnone
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+declare double @llvm.powi.f64(double %Val, i32 %power) nounwind readnone
+
+;CHECK-LABEL: @powi_f64(
+;CHECK: llvm.powi.v4f64
+;CHECK: ret void
+define void @powi_f64(i32 %n, double* noalias %y, double* noalias %x, i32 %P) nounwind uwtable {
+entry:
+  %cmp9 = icmp sgt i32 %n, 0
+  br i1 %cmp9, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds double, double* %y, i64 %indvars.iv
+  %0 = load double, double* %arrayidx, align 8
+  %call = tail call double @llvm.powi.f64(double %0, i32 %P) nounwind readnone
+  %arrayidx4 = getelementptr inbounds double, double* %x, i64 %indvars.iv
+  store double %call, double* %arrayidx4, align 8
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+;CHECK-LABEL: @powi_f64_neg(
+;CHECK-NOT: llvm.powi.v4f64
+;CHECK: ret void
+define void @powi_f64_neg(i32 %n, double* noalias %y, double* noalias %x) nounwind uwtable {
+entry:
+  %cmp9 = icmp sgt i32 %n, 0
+  br i1 %cmp9, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds double, double* %y, i64 %indvars.iv
+  %0 = load double, double* %arrayidx, align 8
+  %1 = trunc i64 %indvars.iv to i32
+  %call = tail call double @llvm.powi.f64(double %0, i32 %1) nounwind readnone
+  %arrayidx4 = getelementptr inbounds double, double* %x, i64 %indvars.iv
+  store double %call, double* %arrayidx4, align 8
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+declare i64 @llvm.cttz.i64(i64, i1) nounwind readnone
+
+;CHECK-LABEL: @cttz_f64(
+;CHECK: llvm.cttz.v4i64
+;CHECK: ret void
+define void @cttz_f64(i32 %n, i64* noalias %y, i64* noalias %x) nounwind uwtable {
+entry:
+  %cmp9 = icmp sgt i32 %n, 0
+  br i1 %cmp9, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i64, i64* %y, i64 %indvars.iv
+  %0 = load i64, i64* %arrayidx, align 8
+  %call = tail call i64 @llvm.cttz.i64(i64 %0, i1 true) nounwind readnone
+  %arrayidx4 = getelementptr inbounds i64, i64* %x, i64 %indvars.iv
+  store i64 %call, i64* %arrayidx4, align 8
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+declare i64 @llvm.ctlz.i64(i64, i1) nounwind readnone
+
+;CHECK-LABEL: @ctlz_f64(
+;CHECK: llvm.ctlz.v4i64
+;CHECK: ret void
+define void @ctlz_f64(i32 %n, i64* noalias %y, i64* noalias %x) nounwind uwtable {
+entry:
+  %cmp9 = icmp sgt i32 %n, 0
+  br i1 %cmp9, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i64, i64* %y, i64 %indvars.iv
+  %0 = load i64, i64* %arrayidx, align 8
+  %call = tail call i64 @llvm.ctlz.i64(i64 %0, i1 true) nounwind readnone
+  %arrayidx4 = getelementptr inbounds i64, i64* %x, i64 %indvars.iv
+  store i64 %call, i64* %arrayidx4, align 8
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+declare i32 @llvm.fshl.i32(i32, i32, i32)
+
+define void @fshl_i32(i32 %n, i32* noalias %x, i32* noalias %y, i32 %shAmt) {
+; CHECK-LABEL: @fshl_i32(
+; CHECK:         call <4 x i32> @llvm.fshl.v4i32(<4 x i32> [[WIDE_LOADX:%.*]], <4 x i32> [[WIDE_LOADY:%.*]], <4 x i32> [[SPLAT:%.*]])
+; CHECK:         ret void
+entry:
+  %cmp = icmp sgt i32 %n, 0
+  br i1 %cmp, label %loop, label %end
+
+loop:
+  %iv = phi i32 [ %iv.next, %loop ], [ 0, %entry ]
+  %xi = getelementptr inbounds i32, i32* %x, i32 %iv
+  %yi = getelementptr inbounds i32, i32* %y, i32 %iv
+  %xld = load i32, i32* %xi, align 4
+  %yld = load i32, i32* %yi, align 4
+  %call = tail call i32 @llvm.fshl.i32(i32 %xld, i32 %yld, i32 %shAmt)
+  store i32 %call, i32* %xi, align 4
+  %iv.next = add i32 %iv, 1
+  %exitcond = icmp eq i32 %iv.next, %n
+  br i1 %exitcond, label %end, label %loop
+
+end:
+  ret void
+}
+
+declare i32 @llvm.fshr.i32(i32, i32, i32)
+
+define void @fshr_i32(i32 %n, i32* noalias %x, i32* noalias %y, i32 %shAmt) {
+; CHECK-LABEL: @fshr_i32(
+; CHECK:         call <4 x i32> @llvm.fshr.v4i32(<4 x i32> [[WIDE_LOADX:%.*]], <4 x i32> [[WIDE_LOADY:%.*]], <4 x i32> [[SPLAT:%.*]])
+; CHECK:         ret void
+entry:
+  %cmp = icmp sgt i32 %n, 0
+  br i1 %cmp, label %loop, label %end
+
+loop:
+  %iv = phi i32 [ %iv.next, %loop ], [ 0, %entry ]
+  %xi = getelementptr inbounds i32, i32* %x, i32 %iv
+  %yi = getelementptr inbounds i32, i32* %y, i32 %iv
+  %xld = load i32, i32* %xi, align 4
+  %yld = load i32, i32* %yi, align 4
+  %call = tail call i32 @llvm.fshr.i32(i32 %xld, i32 %yld, i32 %shAmt)
+  store i32 %call, i32* %xi, align 4
+  %iv.next = add i32 %iv, 1
+  %exitcond = icmp eq i32 %iv.next, %n
+  br i1 %exitcond, label %end, label %loop
+
+end:
+  ret void
+}
+
+declare float @llvm.minnum.f32(float, float) nounwind readnone
+
+;CHECK-LABEL: @minnum_f32(
+;CHECK: llvm.minnum.v4f32
+;CHECK: ret void
+define void @minnum_f32(i32 %n, float* noalias %y, float* noalias %x, float* noalias %z) nounwind uwtable {
+entry:
+  %cmp9 = icmp sgt i32 %n, 0
+  br i1 %cmp9, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds float, float* %y, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds float, float* %z, i64 %indvars.iv
+  %1 = load float, float* %arrayidx2, align 4
+  %call = tail call float @llvm.minnum.f32(float %0, float %1) nounwind readnone
+  %arrayidx4 = getelementptr inbounds float, float* %x, i64 %indvars.iv
+  store float %call, float* %arrayidx4, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+declare float @llvm.maxnum.f32(float, float) nounwind readnone
+
+;CHECK-LABEL: @maxnum_f32(
+;CHECK: llvm.maxnum.v4f32
+;CHECK: ret void
+define void @maxnum_f32(i32 %n, float* noalias %y, float* noalias %x, float* noalias %z) nounwind uwtable {
+entry:
+  %cmp9 = icmp sgt i32 %n, 0
+  br i1 %cmp9, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds float, float* %y, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds float, float* %z, i64 %indvars.iv
+  %1 = load float, float* %arrayidx2, align 4
+  %call = tail call float @llvm.maxnum.f32(float %0, float %1) nounwind readnone
+  %arrayidx4 = getelementptr inbounds float, float* %x, i64 %indvars.iv
+  store float %call, float* %arrayidx4, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+declare float @llvm.minimum.f32(float, float) nounwind readnone
+
+;CHECK-LABEL: @minimum_f32(
+;CHECK: llvm.minimum.v4f32
+;CHECK: ret void
+define void @minimum_f32(i32 %n, float* noalias %y, float* noalias %x, float* noalias %z) nounwind uwtable {
+entry:
+  %cmp9 = icmp sgt i32 %n, 0
+  br i1 %cmp9, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds float, float* %y, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds float, float* %z, i64 %indvars.iv
+  %1 = load float, float* %arrayidx2, align 4
+  %call = tail call float @llvm.minimum.f32(float %0, float %1) nounwind readnone
+  %arrayidx4 = getelementptr inbounds float, float* %x, i64 %indvars.iv
+  store float %call, float* %arrayidx4, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+declare float @llvm.maximum.f32(float, float) nounwind readnone
+
+;CHECK-LABEL: @maximum_f32(
+;CHECK: llvm.maximum.v4f32
+;CHECK: ret void
+define void @maximum_f32(i32 %n, float* noalias %y, float* noalias %x, float* noalias %z) nounwind uwtable {
+entry:
+  %cmp9 = icmp sgt i32 %n, 0
+  br i1 %cmp9, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds float, float* %y, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds float, float* %z, i64 %indvars.iv
+  %1 = load float, float* %arrayidx2, align 4
+  %call = tail call float @llvm.maximum.f32(float %0, float %1) nounwind readnone
+  %arrayidx4 = getelementptr inbounds float, float* %x, i64 %indvars.iv
+  store float %call, float* %arrayidx4, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}

Added: llvm/trunk/test/Transforms/LoopVectorize/invariant-store-vectorization.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/invariant-store-vectorization.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/invariant-store-vectorization.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/invariant-store-vectorization.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,593 @@
+; RUN: opt < %s -licm -loop-vectorize -force-vector-width=4 -dce -instcombine -licm -S | FileCheck %s
+
+; The first licm pass is there to hoist/sink invariant stores if possible. Today
+; LICM does not hoist/sink invariant stores. Even if that changes, we should
+; still vectorize this loop in case licm is not run.
+
+; The licm pass that runs after vectorization hoists/sinks loop-invariant
+; instructions.
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+; All tests check that it is legal to vectorize the stores to an invariant
+; address.
+
+
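+; The loop below in rough C (an illustrative sketch, not from the source):
+;
+;   int sum = 0;
+;   for (long i = 0; i < n; i++) { sum += b[i]; *a = (int)n; }
+;   return sum;
+;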
+; CHECK-LABEL: inv_val_store_to_inv_address_with_reduction(
+; memory check is found.conflict = b[max(n-1,1)] > a && (i8* a)+1 > (i8* b)
+; CHECK: vector.memcheck:
+; CHECK:    found.conflict
+
+; CHECK-LABEL: vector.body:
+; CHECK:         %vec.phi = phi <4 x i32>  [ zeroinitializer, %vector.ph ], [ [[ADD:%[a-zA-Z0-9.]+]], %vector.body ]
+; CHECK:         %wide.load = load <4 x i32>
+; CHECK:         [[ADD]] = add <4 x i32> %vec.phi, %wide.load
+; CHECK-NEXT:    store i32 %ntrunc, i32* %a
+; CHECK-NEXT:    %index.next = add i64 %index, 4
+; CHECK-NEXT:    icmp eq i64 %index.next, %n.vec
+; CHECK-NEXT:    br i1
+
+; CHECK-LABEL: middle.block:
+; CHECK:         %rdx.shuf = shufflevector <4 x i32>
+define i32 @inv_val_store_to_inv_address_with_reduction(i32* %a, i64 %n, i32* %b) {
+entry:
+  %ntrunc = trunc i64 %n to i32
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+  %tmp0 = phi i32 [ %tmp3, %for.body ], [ 0, %entry ]
+  %tmp1 = getelementptr inbounds i32, i32* %b, i64 %i
+  %tmp2 = load i32, i32* %tmp1, align 8
+  %tmp3 = add i32 %tmp0, %tmp2
+  store i32 %ntrunc, i32* %a
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  %tmp4 = phi i32 [ %tmp3, %for.body ]
+  ret i32 %tmp4
+}
+
+; CHECK-LABEL: inv_val_store_to_inv_address(
+; CHECK-LABEL: vector.body:
+; CHECK:         store i32 %ntrunc, i32* %a
+; CHECK:         store <4 x i32>
+; CHECK-NEXT:    %index.next = add i64 %index, 4
+; CHECK-NEXT:    icmp eq i64 %index.next, %n.vec
+; CHECK-NEXT:    br i1
+define void @inv_val_store_to_inv_address(i32* %a, i64 %n, i32* %b) {
+entry:
+  %ntrunc = trunc i64 %n to i32
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+  %tmp1 = getelementptr inbounds i32, i32* %b, i64 %i
+  %tmp2 = load i32, i32* %tmp1, align 8
+  store i32 %ntrunc, i32* %a
+  store i32 %ntrunc, i32* %tmp1
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+
+; Both of the tests below are handled as predicated stores.
+
+; Conditional store:
+; if (b[i] == k) a = ntrunc
+; TODO: We could generate better code for the first test: a single scalar store
+; suffices when vector.or.reduce(vector_cmp(b[i] == k)) is 1.
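+;
+; The whole loop in rough C (illustrative sketch):
+;
+;   for (long i = 0; i < n; i++) {
+;     int old = b[i];
+;     b[i] = ntrunc;
+;     if (old == k) *a = ntrunc;
+;   }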
+
+; CHECK-LABEL: inv_val_store_to_inv_address_conditional(
+; CHECK-LABEL: vector.body:
+; CHECK:           %wide.load = load <4 x i32>, <4 x i32>*
+; CHECK:           [[CMP:%[a-zA-Z0-9.]+]] = icmp eq <4 x i32> %wide.load, %{{.*}}
+; CHECK:           store <4 x i32>
+; CHECK-NEXT:      [[EE:%[a-zA-Z0-9.]+]] =  extractelement <4 x i1> [[CMP]], i32 0
+; CHECK-NEXT:      br i1 [[EE]], label %pred.store.if, label %pred.store.continue
+
+; CHECK-LABEL: pred.store.if:
+; CHECK-NEXT:      store i32 %ntrunc, i32* %a
+; CHECK-NEXT:      br label %pred.store.continue
+
+; CHECK-LABEL: pred.store.continue:
+; CHECK-NEXT:      [[EE1:%[a-zA-Z0-9.]+]] =  extractelement <4 x i1> [[CMP]], i32 1
+define void @inv_val_store_to_inv_address_conditional(i32* %a, i64 %n, i32* %b, i32 %k) {
+entry:
+  %ntrunc = trunc i64 %n to i32
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i = phi i64 [ %i.next, %latch ], [ 0, %entry ]
+  %tmp1 = getelementptr inbounds i32, i32* %b, i64 %i
+  %tmp2 = load i32, i32* %tmp1, align 8
+  %cmp = icmp eq i32 %tmp2, %k
+  store i32 %ntrunc, i32* %tmp1
+  br i1 %cmp, label %cond_store, label %latch
+
+cond_store:
+  store i32 %ntrunc, i32* %a
+  br label %latch
+
+latch:
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+; if (b[i] == k)
+;    a = ntrunc
+; else a = k;
+; TODO: We could vectorize this once we support multiple uniform stores to the
+; same address.
+; CHECK-LABEL: inv_val_store_to_inv_address_conditional_diff_values(
+; CHECK-NOT:           load <4 x i32>
+define void @inv_val_store_to_inv_address_conditional_diff_values(i32* %a, i64 %n, i32* %b, i32 %k) {
+entry:
+  %ntrunc = trunc i64 %n to i32
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i = phi i64 [ %i.next, %latch ], [ 0, %entry ]
+  %tmp1 = getelementptr inbounds i32, i32* %b, i64 %i
+  %tmp2 = load i32, i32* %tmp1, align 8
+  %cmp = icmp eq i32 %tmp2, %k
+  store i32 %ntrunc, i32* %tmp1
+  br i1 %cmp, label %cond_store, label %cond_store_k
+
+cond_store:
+  store i32 %ntrunc, i32* %a
+  br label %latch
+
+cond_store_k:
+  store i32 %k, i32* %a
+  br label %latch
+
+latch:
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+; Instcombine'd version of the test above. The store is no longer of an
+; invariant value; we scalar-store the value extracted from the last element of
+; the vector value.
+; CHECK-LABEL: inv_val_store_to_inv_address_conditional_diff_values_ic
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[NTRUNC:%.*]] = trunc i64 [[N:%.*]] to i32
+; CHECK-NEXT:    [[TMP0:%.*]] = icmp sgt i64 [[N]], 1
+; CHECK-NEXT:    [[SMAX:%.*]] = select i1 [[TMP0]], i64 [[N]], i64 1
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
+; CHECK:       vector.memcheck:
+; CHECK-NEXT:    [[A4:%.*]] = bitcast i32* [[A:%.*]] to i8*
+; CHECK-NEXT:    [[B1:%.*]] = bitcast i32* [[B:%.*]] to i8*
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp sgt i64 [[N]], 1
+; CHECK-NEXT:    [[SMAX2:%.*]] = select i1 [[TMP1]], i64 [[N]], i64 1
+; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i32, i32* [[B]], i64 [[SMAX2]]
+; CHECK-NEXT:    [[UGLYGEP:%.*]] = getelementptr i8, i8* [[A4]], i64 1
+; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ugt i8* [[UGLYGEP]], [[B1]]
+; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ugt i32* [[SCEVGEP]], [[A]]
+; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[SMAX]], 9223372036854775804
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <4 x i32> undef, i32 [[K:%.*]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT6:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT5]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <4 x i32> undef, i32 [[NTRUNC]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT8:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT7]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT6]]
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[BROADCAST_SPLAT8]], <4 x i32>* [[TMP5]], align 4
+; CHECK-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[TMP4]], <4 x i32> [[BROADCAST_SPLAT8]], <4 x i32> [[BROADCAST_SPLAT6]]
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i32> [[PREDPHI]], i32 3
+; CHECK-NEXT:    store i32 [[TMP6]], i32* [[A]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[SMAX]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[LATCH:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[I]]
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 8
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP2]], [[K]]
+; CHECK-NEXT:    store i32 [[NTRUNC]], i32* [[TMP1]], align 4
+; CHECK-NEXT:    br i1 [[CMP]], label [[COND_STORE:%.*]], label [[COND_STORE_K:%.*]]
+; CHECK:       cond_store:
+; CHECK-NEXT:    br label [[LATCH]]
+; CHECK:       cond_store_k:
+; CHECK-NEXT:    br label [[LATCH]]
+; CHECK:       latch:
+; CHECK-NEXT:    [[STOREVAL:%.*]] = phi i32 [ [[NTRUNC]], [[COND_STORE]] ], [ [[K]], [[COND_STORE_K]] ]
+; CHECK-NEXT:    store i32 [[STOREVAL]], i32* [[A]], align 4
+; CHECK-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 1
+; CHECK-NEXT:    [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT:%.*]]
+; CHECK:       for.end.loopexit:
+; CHECK-NEXT:    br label [[FOR_END]]
+; CHECK:       for.end:
+; CHECK-NEXT:    ret void
+;
+define void @inv_val_store_to_inv_address_conditional_diff_values_ic(i32* %a, i64 %n, i32* %b, i32 %k) {
+entry:
+  %ntrunc = trunc i64 %n to i32
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i = phi i64 [ %i.next, %latch ], [ 0, %entry ]
+  %tmp1 = getelementptr inbounds i32, i32* %b, i64 %i
+  %tmp2 = load i32, i32* %tmp1, align 8
+  %cmp = icmp eq i32 %tmp2, %k
+  store i32 %ntrunc, i32* %tmp1
+  br i1 %cmp, label %cond_store, label %cond_store_k
+
+cond_store:
+  br label %latch
+
+cond_store_k:
+  br label %latch
+
+latch:
+  %storeval = phi i32 [ %ntrunc, %cond_store ], [ %k, %cond_store_k ]
+  store i32 %storeval, i32* %a
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+; Invariant value stored to an invariant address, predicated on an invariant
+; condition. This is not treated as a predicated store, since the block the
+; store belongs to is the latch block (which doesn't need to be predicated).
+; Variant/invariant values are stored to the invariant address; the test checks
+; that the last element of the phi is extracted and scalar-stored into the
+; uniform address within the loop.
+; Since the condition and the phi are loop invariant, they are LICM'ed after
+; vectorization.
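+;
+; Rough C sketch (illustrative):
+;
+;   int v = ((int)n == k) ? (int)n : k;  /* loop-invariant select */
+;   for (long i = 0; i < n; i++) { b[i] = (int)n; *a = v; }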
+; CHECK-LABEL: inv_val_store_to_inv_address_conditional_inv
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[NTRUNC:%.*]] = trunc i64 [[N:%.*]] to i32
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[NTRUNC]], [[K:%.*]]
+; CHECK-NEXT:    [[TMP0:%.*]] = icmp sgt i64 [[N]], 1
+; CHECK-NEXT:    [[SMAX:%.*]] = select i1 [[TMP0]], i64 [[N]], i64 1
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
+; CHECK:       vector.memcheck:
+; CHECK-NEXT:    [[A4:%.*]] = bitcast i32* [[A:%.*]] to i8*
+; CHECK-NEXT:    [[B1:%.*]] = bitcast i32* [[B:%.*]] to i8*
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp sgt i64 [[N]], 1
+; CHECK-NEXT:    [[SMAX2:%.*]] = select i1 [[TMP1]], i64 [[N]], i64 1
+; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i32, i32* [[B]], i64 [[SMAX2]]
+; CHECK-NEXT:    [[UGLYGEP:%.*]] = getelementptr i8, i8* [[A4]], i64 1
+; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ugt i8* [[UGLYGEP]], [[B1]]
+; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ugt i32* [[SCEVGEP]], [[A]]
+; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[SMAX]], 9223372036854775804
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <4 x i32> undef, i32 [[NTRUNC]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT6:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT5]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i1> undef, i1 [[CMP]], i32 3
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32> undef, i32 [[K]], i32 3
+; CHECK-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[TMP2]], <4 x i32> [[BROADCAST_SPLAT6]], <4 x i32> [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i32> [[PREDPHI]], i32 3
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[BROADCAST_SPLAT6]], <4 x i32>* [[TMP7]], align 4
+; CHECK-NEXT:    store i32 [[TMP5]], i32* [[A]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[SMAX]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[LATCH:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[I]]
+; CHECK-NEXT:    store i32 [[NTRUNC]], i32* [[TMP1]], align 4
+; CHECK-NEXT:    br i1 [[CMP]], label [[COND_STORE:%.*]], label [[COND_STORE_K:%.*]]
+; CHECK:       cond_store:
+; CHECK-NEXT:    br label [[LATCH]]
+; CHECK:       cond_store_k:
+; CHECK-NEXT:    br label [[LATCH]]
+; CHECK:       latch:
+; CHECK-NEXT:    [[STOREVAL:%.*]] = phi i32 [ [[NTRUNC]], [[COND_STORE]] ], [ [[K]], [[COND_STORE_K]] ]
+; CHECK-NEXT:    store i32 [[STOREVAL]], i32* [[A]], align 4
+; CHECK-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 1
+; CHECK-NEXT:    [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT:%.*]]
+; CHECK:       for.end.loopexit:
+; CHECK-NEXT:    br label [[FOR_END]]
+; CHECK:       for.end:
+; CHECK-NEXT:    ret void
+;
+define void @inv_val_store_to_inv_address_conditional_inv(i32* %a, i64 %n, i32* %b, i32 %k) {
+entry:
+  %ntrunc = trunc i64 %n to i32
+  %cmp = icmp eq i32 %ntrunc, %k
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i = phi i64 [ %i.next, %latch ], [ 0, %entry ]
+  %tmp1 = getelementptr inbounds i32, i32* %b, i64 %i
+  %tmp2 = load i32, i32* %tmp1, align 8
+  store i32 %ntrunc, i32* %tmp1
+  br i1 %cmp, label %cond_store, label %cond_store_k
+
+cond_store:
+  br label %latch
+
+cond_store_k:
+  br label %latch
+
+latch:
+  %storeval = phi i32 [ %ntrunc, %cond_store ], [ %k, %cond_store_k ]
+  store i32 %storeval, i32* %a
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+; Variant value stored to a uniform address. This tests that the code gen
+; extracts the last element from the variant vector and scalar-stores it into
+; the uniform address.
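+;
+; Rough C sketch (illustrative):
+;
+;   int sum = 0;
+;   for (long i = 0; i < n; i++) { *a = b[i]; sum += b[i]; }
+;   return sum;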
+; CHECK-LABEL: variant_val_store_to_inv_address
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = icmp sgt i64 [[N:%.*]], 1
+; CHECK-NEXT:    [[SMAX:%.*]] = select i1 [[TMP0]], i64 [[N]], i64 1
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
+; CHECK:       vector.memcheck:
+; CHECK-NEXT:    [[B2:%.*]] = bitcast i32* [[B:%.*]] to i8*
+; CHECK-NEXT:    [[A1:%.*]] = bitcast i32* [[A:%.*]] to i8*
+; CHECK-NEXT:    [[UGLYGEP:%.*]] = getelementptr i8, i8* [[A1]], i64 1
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp sgt i64 [[N]], 1
+; CHECK-NEXT:    [[SMAX3:%.*]] = select i1 [[TMP1]], i64 [[N]], i64 1
+; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i32, i32* [[B]], i64 [[SMAX3]]
+; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ugt i32* [[SCEVGEP]], [[A]]
+; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ugt i8* [[UGLYGEP]], [[B2]]
+; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[SMAX]], 9223372036854775804
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i32 3
+; CHECK-NEXT:    store i32 [[TMP4]], i32* [[A]], align 4
+; CHECK-NEXT:    [[TMP5]] = add <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[DOTLCSSA:%.*]] = phi <4 x i32> [ [[TMP5]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[DOTLCSSA]], <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = add <4 x i32> [[DOTLCSSA]], [[RDX_SHUF]]
+; CHECK-NEXT:    [[RDX_SHUF5:%.*]] = shufflevector <4 x i32> [[BIN_RDX]], <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX6:%.*]] = add <4 x i32> [[BIN_RDX]], [[RDX_SHUF5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x i32> [[BIN_RDX6]], i32 0
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[SMAX]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP7]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ], [ 0, [[VECTOR_MEMCHECK]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = phi i32 [ [[TMP3:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[I]]
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 8
+; CHECK-NEXT:    store i32 [[TMP2]], i32* [[A]], align 4
+; CHECK-NEXT:    [[TMP3]] = add i32 [[TMP0]], [[TMP2]]
+; CHECK-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 1
+; CHECK-NEXT:    [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT:%.*]]
+; CHECK:       for.end.loopexit:
+; CHECK-NEXT:    [[TMP3_LCSSA:%.*]] = phi i32 [ [[TMP3]], [[FOR_BODY]] ]
+; CHECK-NEXT:    br label [[FOR_END]]
+define i32 @variant_val_store_to_inv_address(i32* %a, i64 %n, i32* %b, i32 %k) {
+entry:
+  %ntrunc = trunc i64 %n to i32
+  %cmp = icmp eq i32 %ntrunc, %k
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+  %tmp0 = phi i32 [ %tmp3, %for.body ], [ 0, %entry ]
+  %tmp1 = getelementptr inbounds i32, i32* %b, i64 %i
+  %tmp2 = load i32, i32* %tmp1, align 8
+  store i32 %tmp2, i32* %a
+  %tmp3 = add i32 %tmp0, %tmp2
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  %rdx.lcssa = phi i32 [ %tmp3, %for.body ]
+  ret i32 %rdx.lcssa
+}
+
+; Multiple variant stores to the same uniform address
+; We do not vectorize such loops currently.
+;  for(; i < itr; i++) {
+;    for(; j < itr; j++) {
+;      var1[i] = var2[j] + var1[i];
+;      var1[i]++;
+;    }
+;  }
+
+; CHECK-LABEL: multiple_uniform_stores
+; CHECK-NOT:     <4 x i32>
+define i32 @multiple_uniform_stores(i32* nocapture %var1, i32* nocapture readonly %var2, i32 %itr) #0 {
+entry:
+  %cmp20 = icmp eq i32 %itr, 0
+  br i1 %cmp20, label %for.end10, label %for.cond1.preheader
+
+for.cond1.preheader:                              ; preds = %entry, %for.inc8
+  %indvars.iv23 = phi i64 [ %indvars.iv.next24, %for.inc8 ], [ 0, %entry ]
+  %j.022 = phi i32 [ %j.1.lcssa, %for.inc8 ], [ 0, %entry ]
+  %cmp218 = icmp ult i32 %j.022, %itr
+  br i1 %cmp218, label %for.body3.lr.ph, label %for.inc8
+
+for.body3.lr.ph:                                  ; preds = %for.cond1.preheader
+  %arrayidx5 = getelementptr inbounds i32, i32* %var1, i64 %indvars.iv23
+  %0 = zext i32 %j.022 to i64
+  br label %for.body3
+
+for.body3:                                        ; preds = %for.body3, %for.body3.lr.ph
+  %indvars.iv = phi i64 [ %0, %for.body3.lr.ph ], [ %indvars.iv.next, %for.body3 ]
+  %arrayidx = getelementptr inbounds i32, i32* %var2, i64 %indvars.iv
+  %1 = load i32, i32* %arrayidx, align 4
+  %2 = load i32, i32* %arrayidx5, align 4
+  %add = add nsw i32 %2, %1
+  store i32 %add, i32* %arrayidx5, align 4
+  %3 = load i32, i32* %arrayidx5, align 4
+  %4 = add nsw i32 %3, 1
+  store i32 %4, i32* %arrayidx5, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %itr
+  br i1 %exitcond, label %for.inc8, label %for.body3
+
+for.inc8:                                         ; preds = %for.body3, %for.cond1.preheader
+  %j.1.lcssa = phi i32 [ %j.022, %for.cond1.preheader ], [ %itr, %for.body3 ]
+  %indvars.iv.next24 = add nuw nsw i64 %indvars.iv23, 1
+  %lftr.wideiv25 = trunc i64 %indvars.iv.next24 to i32
+  %exitcond26 = icmp eq i32 %lftr.wideiv25, %itr
+  br i1 %exitcond26, label %for.end10, label %for.cond1.preheader
+
+for.end10:                                        ; preds = %for.inc8, %entry
+  ret i32 undef
+}
+
+; The second uniform store to the same address is conditional; we do not
+; vectorize this.
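+; In the style of the pseudocode above (an illustrative sketch):
+;  for(; i < itr; i++) {
+;    for(; j < itr; j++) {
+;      var1[i] = var2[j] + var1[i];
+;      if (var1[i] > 42) var1[i]++;
+;    }
+;  }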
+; CHECK-LABEL: multiple_uniform_stores_conditional
+; CHECK-NOT:    <4 x i32>
+define i32 @multiple_uniform_stores_conditional(i32* nocapture %var1, i32* nocapture readonly %var2, i32 %itr) #0 {
+entry:
+  %cmp20 = icmp eq i32 %itr, 0
+  br i1 %cmp20, label %for.end10, label %for.cond1.preheader
+
+for.cond1.preheader:                              ; preds = %entry, %for.inc8
+  %indvars.iv23 = phi i64 [ %indvars.iv.next24, %for.inc8 ], [ 0, %entry ]
+  %j.022 = phi i32 [ %j.1.lcssa, %for.inc8 ], [ 0, %entry ]
+  %cmp218 = icmp ult i32 %j.022, %itr
+  br i1 %cmp218, label %for.body3.lr.ph, label %for.inc8
+
+for.body3.lr.ph:                                  ; preds = %for.cond1.preheader
+  %arrayidx5 = getelementptr inbounds i32, i32* %var1, i64 %indvars.iv23
+  %0 = zext i32 %j.022 to i64
+  br label %for.body3
+
+for.body3:                                        ; preds = %for.body3, %for.body3.lr.ph
+  %indvars.iv = phi i64 [ %0, %for.body3.lr.ph ], [ %indvars.iv.next, %latch ]
+  %arrayidx = getelementptr inbounds i32, i32* %var2, i64 %indvars.iv
+  %1 = load i32, i32* %arrayidx, align 4
+  %2 = load i32, i32* %arrayidx5, align 4
+  %add = add nsw i32 %2, %1
+  store i32 %add, i32* %arrayidx5, align 4
+  %3 = load i32, i32* %arrayidx5, align 4
+  %4 = add nsw i32 %3, 1
+  %5 = icmp ugt i32 %3, 42
+  br i1 %5, label %cond_store, label %latch
+
+cond_store:
+  store i32 %4, i32* %arrayidx5, align 4
+  br label %latch
+
+latch:
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %itr
+  br i1 %exitcond, label %for.inc8, label %for.body3
+
+for.inc8:                                         ; preds = %for.body3, %for.cond1.preheader
+  %j.1.lcssa = phi i32 [ %j.022, %for.cond1.preheader ], [ %itr, %latch ]
+  %indvars.iv.next24 = add nuw nsw i64 %indvars.iv23, 1
+  %lftr.wideiv25 = trunc i64 %indvars.iv.next24 to i32
+  %exitcond26 = icmp eq i32 %lftr.wideiv25, %itr
+  br i1 %exitcond26, label %for.end10, label %for.cond1.preheader
+
+for.end10:                                        ; preds = %for.inc8, %entry
+  ret i32 undef
+}
+
+; Cannot vectorize a loop with an unsafe dependency between a uniform load
+; (%tmp10) and a store (%tmp12) to the same address.
+; PR39653
+; Note: %tmp10 could be replaced by phi(%arg4, %tmp12), a potentially
+; vectorizable first-order recurrence.
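+;
+; Rough C sketch of the recurrence (illustrative; names are invented):
+;
+;   int t = arg4;                          /* lives in a stack slot (%tmp) */
+;   for (long i = 0; i < arg2; i++) {
+;     int r = (counter * t) % 65536;       /* counter is the scalar %tmp9  */
+;     /* ... two i16 stores into arg3 ... */
+;     t = r;                               /* store back => unsafe dep     */
+;   }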
+define void @unsafe_dep_uniform_load_store(i32 %arg, i32 %arg1, i64 %arg2, i16* %arg3, i32 %arg4, i64 %arg5) {
+; CHECK-LABEL: unsafe_dep_uniform_load_store
+; CHECK-NOT: <4 x i32>
+bb:
+  %tmp = alloca i32
+  store i32 %arg4, i32* %tmp
+  %tmp6 = getelementptr inbounds i16, i16* %arg3, i64 %arg5
+  br label %bb7
+
+bb7:
+  %tmp8 = phi i64 [ 0, %bb ], [ %tmp24, %bb7 ]
+  %tmp9 = phi i32 [ %arg1, %bb ], [ %tmp23, %bb7 ]
+  %tmp10 = load i32, i32* %tmp
+  %tmp11 = mul nsw i32 %tmp9, %tmp10
+  %tmp12 = srem i32 %tmp11, 65536
+  %tmp13 = add nsw i32 %tmp12, %tmp9
+  %tmp14 = trunc i32 %tmp13 to i16
+  %tmp15 = trunc i64 %tmp8 to i32
+  %tmp16 = add i32 %arg, %tmp15
+  %tmp17 = zext i32 %tmp16 to i64
+  %tmp18 = getelementptr inbounds i16, i16* %tmp6, i64 %tmp17
+  store i16 %tmp14, i16* %tmp18, align 2
+  %tmp19 = add i32 %tmp13, %tmp9
+  %tmp20 = trunc i32 %tmp19 to i16
+  %tmp21 = and i16 %tmp20, 255
+  %tmp22 = getelementptr inbounds i16, i16* %arg3, i64 %tmp17
+  store i16 %tmp21, i16* %tmp22, align 2
+  %tmp23 = add nsw i32 %tmp9, 1
+  %tmp24 = add nuw nsw i64 %tmp8, 1
+  %tmp25 = icmp eq i64 %tmp24, %arg2
+  store i32 %tmp12, i32* %tmp
+  br i1 %tmp25, label %bb26, label %bb7
+
+bb26:
+  ret void
+}

Added: llvm/trunk/test/Transforms/LoopVectorize/iv_outside_user.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/iv_outside_user.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/iv_outside_user.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/iv_outside_user.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,176 @@
+; RUN: opt -S -loop-vectorize -force-vector-interleave=1 -force-vector-width=2 < %s | FileCheck %s
+
+; CHECK-LABEL: @postinc
+; CHECK-LABEL: scalar.ph:
+; CHECK: %bc.resume.val = phi i32 [ %n.vec, %middle.block ], [ 0, %entry ]
+; CHECK-LABEL: for.end:
+; CHECK: %[[RET:.*]] = phi i32 [ {{.*}}, %for.body ], [ %n.vec, %middle.block ]
+; CHECK: ret i32 %[[RET]]
+define i32 @postinc(i32 %k)  {
+entry:
+  br label %for.body
+
+for.body:
+  %inc.phi = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %inc = add nsw i32 %inc.phi, 1
+  %cmp = icmp eq i32 %inc, %k
+  br i1 %cmp, label %for.end, label %for.body
+
+for.end:
+  ret i32 %inc
+}
+
+; CHECK-LABEL: @preinc
+; CHECK-LABEL: middle.block:
+; CHECK: %[[v3:.+]] = sub i32 %n.vec, 1
+; CHECK-LABEL: scalar.ph:
+; CHECK: %bc.resume.val = phi i32 [ %n.vec, %middle.block ], [ 0, %entry ]
+; CHECK-LABEL: for.end:
+; CHECK: %[[RET:.*]] = phi i32 [ {{.*}}, %for.body ], [ %[[v3]], %middle.block ]
+; CHECK: ret i32 %[[RET]]
+define i32 @preinc(i32 %k)  {
+entry:
+  br label %for.body
+
+for.body:
+  %inc.phi = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %inc = add nsw i32 %inc.phi, 1
+  %cmp = icmp eq i32 %inc, %k
+  br i1 %cmp, label %for.end, label %for.body
+
+for.end:
+  ret i32 %inc.phi
+}
+
+; CHECK-LABEL: @constpre
+; CHECK-LABEL: for.end:
+; CHECK: %[[RET:.*]] = phi i32 [ {{.*}}, %for.body ], [ 2, %middle.block ]
+; CHECK: ret i32 %[[RET]]
+define i32 @constpre()  {
+entry:
+  br label %for.body
+
+for.body:
+  %inc.phi = phi i32 [ 32, %entry ], [ %inc, %for.body ]
+  %inc = sub nsw i32 %inc.phi, 2
+  %cmp = icmp eq i32 %inc, 0
+  br i1 %cmp, label %for.end, label %for.body
+
+for.end:
+  ret i32 %inc.phi
+}
+
+; CHECK-LABEL: @geppre
+; CHECK-LABEL: middle.block:
+; CHECK: %ind.escape = getelementptr i32, i32* %ptr, i64 124
+; CHECK-LABEL: for.end:
+; CHECK: %[[RET:.*]] = phi i32* [ {{.*}}, %for.body ], [ %ind.escape, %middle.block ]
+; CHECK: ret i32* %[[RET]]
+define i32* @geppre(i32* %ptr) {
+entry:
+  br label %for.body
+
+for.body:
+  %inc.phi = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %ptr.phi = phi i32* [ %ptr, %entry ], [ %inc.ptr, %for.body ]
+  %inc = add nsw i32 %inc.phi, 1
+  %inc.ptr = getelementptr i32, i32* %ptr.phi, i32 4
+  %cmp = icmp eq i32 %inc, 32
+  br i1 %cmp, label %for.end, label %for.body
+
+for.end:
+  ret i32* %ptr.phi
+}
+
+; CHECK-LABEL: @both
+; CHECK-LABEL: middle.block:
+; CHECK: %[[END:.*]] = sub i64 %n.vec, 1
+; CHECK: %ind.escape = getelementptr i32, i32* %base, i64 %[[END]]
+; CHECK-LABEL: for.end:
+; CHECK: %[[RET:.*]] = phi i32* [ %inc.lag1, %for.body ], [ %ind.escape, %middle.block ]
+; CHECK: ret i32* %[[RET]]
+
+define i32* @both(i32 %k)  {
+entry:
+  %base = getelementptr inbounds i32, i32* undef, i64 1
+  br label %for.body
+
+for.body:
+  %inc.phi = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %inc.lag1 = phi i32* [ %base, %entry ], [ %tmp, %for.body ]
+  %inc.lag2 = phi i32* [ undef, %entry ], [ %inc.lag1, %for.body ]
+  %tmp = getelementptr inbounds i32, i32* %inc.lag1, i64 1
+  %inc = add nsw i32 %inc.phi, 1
+  %cmp = icmp eq i32 %inc, %k
+  br i1 %cmp, label %for.end, label %for.body
+
+for.end:
+  ret i32* %inc.lag1
+}
+
+; CHECK-LABEL: @multiphi
+; CHECK-LABEL: scalar.ph:
+; CHECK: %bc.resume.val = phi i32 [ %n.vec, %middle.block ], [ 0, %entry ]
+; CHECK-LABEL: for.end:
+; CHECK: %phi = phi i32 [ {{.*}}, %for.body ], [ %n.vec, %middle.block ]
+; CHECK: %phi2 = phi i32 [ {{.*}}, %for.body ], [ %n.vec, %middle.block ]
+; CHECK: store i32 %phi2, i32* %p
+; CHECK: ret i32 %phi
+define i32 @multiphi(i32 %k, i32* %p)  {
+entry:
+  br label %for.body
+
+for.body:
+  %inc.phi = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %inc = add nsw i32 %inc.phi, 1
+  %cmp = icmp eq i32 %inc, %k
+  br i1 %cmp, label %for.end, label %for.body
+
+for.end:
+  %phi = phi i32 [ %inc, %for.body ]
+  %phi2 = phi i32 [ %inc, %for.body ]
+  store i32 %phi2, i32* %p
+  ret i32 %phi
+}
+
+; CHECK-LABEL: @PR30742
+; CHECK:   %[[T15:.+]] = add nsw i32 %tmp03, -7
+; CHECK: vector.ph
+; CHECK:   %[[N_MOD_VF:.+]] = urem i32 %[[T5:.+]], 2
+; CHECK:   %[[N_VEC:.+]] = sub i32 %[[T5]], %[[N_MOD_VF]]
+; CHECK: middle.block
+; CHECK:   %[[CMP:.+]] = icmp eq i32 %[[T5]], %[[N_VEC]]
+; CHECK:   %ind.escape = add i32 %[[T15]],
+; CHECK:   br i1 %[[CMP]], label %BB3, label %scalar.ph
+define void @PR30742() {
+BB0:
+  br label %BB1
+
+BB1:
+  %tmp00 = load i32, i32* undef, align 16
+  %tmp01 = sub i32 %tmp00, undef
+  %tmp02 = icmp slt i32 %tmp01, 1
+  %tmp03 = select i1 %tmp02, i32 1, i32 %tmp01
+  %tmp04 = add nsw i32 %tmp03, -7
+  br label %BB2
+
+BB2:
+  %tmp05 = phi i32 [ %tmp04, %BB1 ], [ %tmp06, %BB2 ]
+  %tmp06 = add i32 %tmp05, -8
+  %tmp07 = icmp sgt i32 %tmp06, 0
+  br i1 %tmp07, label %BB2, label %BB3
+
+BB3:
+  %tmp08 = phi i32 [ %tmp05, %BB2 ]
+  %tmp09 = sub i32 %tmp00, undef
+  %tmp10 = icmp slt i32 %tmp09, 1
+  %tmp11 = select i1 %tmp10, i32 1, i32 %tmp09
+  %tmp12 = add nsw i32 %tmp11, -7
+  br label %BB4
+
+BB4:
+  %tmp13 = phi i32 [ %tmp12, %BB3 ], [ %tmp14, %BB4 ]
+  %tmp14 = add i32 %tmp13, -8
+  %tmp15 = icmp sgt i32 %tmp14, 0
+  br i1 %tmp15, label %BB4, label %BB1
+}

Added: llvm/trunk/test/Transforms/LoopVectorize/lcssa-crash.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/lcssa-crash.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/lcssa-crash.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/lcssa-crash.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,62 @@
+; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+%type1 = type { %type2 }
+%type2 = type { [0 x i8*], i8**, i32, i32, i32 }
+
+define void @test() nounwind uwtable align 2 {
+  br label %for.body.lr.ph.i.i.i
+
+for.body.lr.ph.i.i.i:
+  br label %for.body.i.i.i
+
+for.body.i.i.i:
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc.i.i.i ], [ 0, %for.body.lr.ph.i.i.i ]
+  br label %for.inc.i.i.i
+
+for.inc.i.i.i:
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp ne i32 %lftr.wideiv, undef
+  br i1 %exitcond, label %for.body.i.i.i, label %for.end.i.i.i
+
+for.end.i.i.i:
+  %lcssa = phi %type1* [ undef, %for.inc.i.i.i ]
+  unreachable
+}
+
+; PR16139
+define void @test2(i8* %x) {
+entry:
+  indirectbr i8* %x, [ label %L0, label %L1 ]
+
+L0:
+  br label %L0
+
+L1:
+  ret void
+}
+
+; This loop has different uniform instructions before and after LCSSA.
+define void @test3() {
+entry:
+  %add41 = add i32 undef, undef
+  %idxprom4736 = zext i32 %add41 to i64
+  br label %while.body
+
+while.body:
+  %idxprom4738 = phi i64 [ %idxprom47, %while.body ], [ %idxprom4736, %entry ]
+  %pos.337 = phi i32 [ %inc46, %while.body ], [ %add41, %entry ]
+  %inc46 = add i32 %pos.337, 1
+  %arrayidx48 = getelementptr inbounds [1024 x i8], [1024 x i8]* undef, i64 0, i64 %idxprom4738
+  store i8 0, i8* %arrayidx48, align 1
+  %and43 = and i32 %inc46, 3
+  %cmp44 = icmp eq i32 %and43, 0
+  %idxprom47 = zext i32 %inc46 to i64
+  br i1 %cmp44, label %while.end, label %while.body
+
+while.end:
+  %add58 = add i32 %inc46, 4
+  ret void
+}

Added: llvm/trunk/test/Transforms/LoopVectorize/legal_preheader_check.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/legal_preheader_check.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/legal_preheader_check.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/legal_preheader_check.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,27 @@
+; RUN: opt < %s -loop-vectorize -debug -S -o /dev/null 2>&1 | FileCheck %s
+; REQUIRES: asserts
+
+; D40973
+; Make sure LV's legality analysis bails out when the loop doesn't have a legal
+; pre-header.
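+; The loop header .lr.ph below has an indirectbr predecessor (BB1); indirectbr
+; edges cannot be split, so no dedicated preheader can be created.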
+
+; CHECK: LV: Loop doesn't have a legal pre-header.
+
+define void @inc(i32 %n, i8* %P) {
+  %1 = icmp sgt i32 %n, 0
+  br i1 %1, label %BB1, label %BB2
+
+BB1:
+  indirectbr i8* %P, [label %.lr.ph]
+
+BB2:
+  br label %.lr.ph
+
+.lr.ph:
+  %indvars.iv = phi i32 [ %indvars.iv.next, %.lr.ph ], [ 0, %BB1 ], [ 0, %BB2 ]
+  %indvars.iv.next = add i32 %indvars.iv, 1
+  %exitcond = icmp eq i32 %indvars.iv.next, %n
+  br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge:
+  ret void
+}

Added: llvm/trunk/test/Transforms/LoopVectorize/libcall-remark.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/libcall-remark.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/libcall-remark.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/libcall-remark.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,52 @@
+; RUN: opt -S -loop-vectorize < %s 2>&1 -pass-remarks-analysis=.* | FileCheck %s
+
+; Test the optimization remark emitter for recognition of a mathlib function
+; vs. an arbitrary function.
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.14.0"
+ at data = external local_unnamed_addr global [32768 x float], align 16
+
+; CHECK: loop not vectorized: library call cannot be vectorized
+
+define void @libcall_blocks_vectorization() {
+entry:
+  br label %for.body
+
+for.cond.cleanup:
+  ret void
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds [32768 x float], [32768 x float]* @data, i64 0, i64 %indvars.iv
+  %t0 = load float, float* %arrayidx, align 4
+  %sqrtf = tail call float @sqrtf(float %t0)
+  store float %sqrtf, float* %arrayidx, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 32768
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK: loop not vectorized: call instruction cannot be vectorized
+
+define void @arbitrary_call_blocks_vectorization() {
+entry:
+  br label %for.body
+
+for.cond.cleanup:
+  ret void
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds [32768 x float], [32768 x float]* @data, i64 0, i64 %indvars.iv
+  %t0 = load float, float* %arrayidx, align 4
+  %sqrtf = tail call float @arbitrary(float %t0)
+  store float %sqrtf, float* %arrayidx, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 32768
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+declare float @sqrtf(float)
+declare float @arbitrary(float)
+

Added: llvm/trunk/test/Transforms/LoopVectorize/lifetime.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/lifetime.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/lifetime.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/lifetime.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,96 @@
+; RUN: opt -S -loop-vectorize -force-vector-width=2 -force-vector-interleave=1 < %s | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+; Make sure we can vectorize loops which contain lifetime markers.
+
+; CHECK-LABEL: @test(
+; CHECK: call void @llvm.lifetime.end
+; CHECK: store <2 x i32>
+; CHECK: call void @llvm.lifetime.start
+
+define void @test(i32 *%d) {
+entry:
+  %arr = alloca [1024 x i32], align 16
+  %0 = bitcast [1024 x i32]* %arr to i8*
+  call void @llvm.lifetime.start.p0i8(i64 4096, i8* %0) #1
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  call void @llvm.lifetime.end.p0i8(i64 4096, i8* %0) #1
+  %arrayidx = getelementptr inbounds i32, i32* %d, i64 %indvars.iv
+  %1 = load i32, i32* %arrayidx, align 8
+  store i32 100, i32* %arrayidx, align 8
+  call void @llvm.lifetime.start.p0i8(i64 4096, i8* %0) #1
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp ne i32 %lftr.wideiv, 128
+  br i1 %exitcond, label %for.body, label %for.end
+
+for.end:
+  call void @llvm.lifetime.end.p0i8(i64 4096, i8* %0) #1
+  ret void
+}
+
+; CHECK-LABEL: @testbitcast(
+; CHECK: call void @llvm.lifetime.end
+; CHECK: store <2 x i32>
+; CHECK: call void @llvm.lifetime.start
+
+define void @testbitcast(i32 *%d) {
+entry:
+  %arr = alloca [1024 x i32], align 16
+  %0 = bitcast [1024 x i32]* %arr to i8*
+  call void @llvm.lifetime.start.p0i8(i64 4096, i8* %0) #1
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %1 = bitcast [1024 x i32]* %arr to i8*
+  call void @llvm.lifetime.end.p0i8(i64 4096, i8* %1) #1
+  %arrayidx = getelementptr inbounds i32, i32* %d, i64 %indvars.iv
+  %2 = load i32, i32* %arrayidx, align 8
+  store i32 100, i32* %arrayidx, align 8
+  call void @llvm.lifetime.start.p0i8(i64 4096, i8* %1) #1
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp ne i32 %lftr.wideiv, 128
+  br i1 %exitcond, label %for.body, label %for.end
+
+for.end:
+  call void @llvm.lifetime.end.p0i8(i64 4096, i8* %0) #1
+  ret void
+}
+
+; CHECK-LABEL: @testloopvariant(
+; CHECK: call void @llvm.lifetime.end
+; CHECK: store <2 x i32>
+; CHECK: call void @llvm.lifetime.start
+
+define void @testloopvariant(i32 *%d) {
+entry:
+  %arr = alloca [1024 x i32], align 16
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %0 = getelementptr [1024 x i32], [1024 x i32]* %arr, i32 0, i64 %indvars.iv
+  %1 = bitcast [1024 x i32]* %arr to i8*
+  call void @llvm.lifetime.end.p0i8(i64 4096, i8* %1) #1
+  %arrayidx = getelementptr inbounds i32, i32* %d, i64 %indvars.iv
+  %2 = load i32, i32* %arrayidx, align 8
+  store i32 100, i32* %arrayidx, align 8
+  call void @llvm.lifetime.start.p0i8(i64 4096, i8* %1) #1
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp ne i32 %lftr.wideiv, 128
+  br i1 %exitcond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
+
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #1
+
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #1

Added: llvm/trunk/test/Transforms/LoopVectorize/loop-form.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/loop-form.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/loop-form.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/loop-form.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,31 @@
+; RUN: opt -S -loop-vectorize < %s | FileCheck %s
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+; Check that we vectorize only bottom-tested loops.
+; This is a reduced testcase from PR21302.
+;
+; rdar://problem/18886083
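+;
+; In C terms (an illustrative reading, not taken from the PR): the loop below
+; is still in its top-tested form, with the exit test ahead of the body:
+;   for (i = 0; i < n; ++i)
+;     x[i].b = 0;
+; Without rotation into a bottom-tested shape there is no latch-guarded body
+; to widen, so no vector.body should be emitted.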
+
+%struct.X = type { i32, i16 }
+; CHECK-LABEL: @foo(
+; CHECK-NOT: vector.body
+
+define void @foo(i32 %n) {
+entry:
+  br label %for.cond
+
+for.cond:
+  %i = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %cmp = icmp slt i32 %i, %n
+  br i1 %cmp, label %for.body, label %if.end
+
+for.body:
+  %iprom = sext i32 %i to i64
+  %b = getelementptr inbounds %struct.X, %struct.X* undef, i64 %iprom, i32 1
+  store i16 0, i16* %b, align 4
+  %inc = add nsw i32 %i, 1
+  br label %for.cond
+
+if.end:
+  ret void
+}

Added: llvm/trunk/test/Transforms/LoopVectorize/loop-scalars.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/loop-scalars.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/loop-scalars.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/loop-scalars.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,143 @@
+; REQUIRES: asserts
+; RUN: opt < %s -loop-vectorize -force-vector-width=2 -force-vector-interleave=1 -instcombine -debug-only=loop-vectorize -disable-output -print-after=instcombine 2>&1 | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+
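+; These tests key off the vectorizer's debug output ("LV: Found scalar
+; instruction") to verify which instructions are scalarized: a GEP whose
+; result is itself stored as data (@vector_gep) must stay vector, while GEPs
+; and induction updates that only feed scalarized accesses (@scalar_store,
+; @expansion) may be scalarized.
+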
+; CHECK-LABEL: vector_gep
+; CHECK-NOT:   LV: Found scalar instruction: %tmp0 = getelementptr inbounds i32, i32* %b, i64 %i
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %vector.body ]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* %b, <2 x i64> [[VEC_IND]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32*, i32** %a, i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32** [[TMP2]] to <2 x i32*>*
+; CHECK-NEXT:    store <2 x i32*> [[TMP1]], <2 x i32*>* [[TMP3]], align 8
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 2
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], <i64 2, i64 2>
+; CHECK:         br i1 {{.*}}, label %middle.block, label %vector.body
+;
+define void @vector_gep(i32** %a, i32 *%b, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+  %tmp0 = getelementptr inbounds i32, i32* %b, i64 %i
+  %tmp1 = getelementptr inbounds i32*, i32** %a, i64 %i
+  store i32* %tmp0, i32** %tmp1, align 8
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
+
+; CHECK-LABEL: scalar_store
+; CHECK:       LV: Found scalar instruction: %tmp1 = getelementptr inbounds i32*, i32** %a, i64 %i
+; CHECK-NEXT:  LV: Found scalar instruction: %tmp0 = getelementptr inbounds i32, i32* %b, i64 %i
+; CHECK-NEXT:  LV: Found scalar instruction: %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+; CHECK-NEXT:  LV: Found scalar instruction: %i.next = add nuw nsw i64 %i, 2
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1
+; CHECK-NEXT:    [[TMP4:%.*]] = or i64 [[OFFSET_IDX]], 2
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* %b, i64 [[OFFSET_IDX]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* %b, i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32*, i32** %a, i64 [[OFFSET_IDX]]
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32*, i32** %a, i64 [[TMP4]]
+; CHECK-NEXT:    store i32* [[TMP5]], i32** [[TMP7]], align 8
+; CHECK-NEXT:    store i32* [[TMP6]], i32** [[TMP8]], align 8
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 2
+; CHECK:         br i1 {{.*}}, label %middle.block, label %vector.body
+;
+define void @scalar_store(i32** %a, i32 *%b, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+  %tmp0 = getelementptr inbounds i32, i32* %b, i64 %i
+  %tmp1 = getelementptr inbounds i32*, i32** %a, i64 %i
+  store i32* %tmp0, i32** %tmp1, align 8
+  %i.next = add nuw nsw i64 %i, 2
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
+
+; CHECK-LABEL: expansion
+; CHECK:       LV: Found scalar instruction: %tmp3 = getelementptr inbounds i32*, i32** %tmp2, i64 %i
+; CHECK-NEXT:  LV: Found scalar instruction: %tmp1 = bitcast i64* %tmp0 to i32*
+; CHECK-NEXT:  LV: Found scalar instruction: %tmp2 = getelementptr inbounds i32*, i32** %a, i64 0
+; CHECK-NEXT:  LV: Found scalar instruction: %tmp0 = getelementptr inbounds i64, i64* %b, i64 %i
+; CHECK-NEXT:  LV: Found scalar instruction: %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+; CHECK-NEXT:  LV: Found scalar instruction: %i.next = add nuw nsw i64 %i, 2
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1
+; CHECK-NEXT:    [[TMP4:%.*]] = or i64 [[OFFSET_IDX]], 2
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, i64* %b, i64 [[OFFSET_IDX]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, i64* %b, i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32*, i32** %a, i64 [[OFFSET_IDX]]
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32*, i32** %a, i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i32** [[TMP7]] to i64**
+; CHECK-NEXT:    store i64* [[TMP5]], i64** [[TMP9]], align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i32** [[TMP8]] to i64**
+; CHECK-NEXT:    store i64* [[TMP6]], i64** [[TMP10]], align 8
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 2
+; CHECK:         br i1 {{.*}}, label %middle.block, label %vector.body
+;
+define void @expansion(i32** %a, i64 *%b, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+  %tmp0 = getelementptr inbounds i64, i64* %b, i64 %i
+  %tmp1 = bitcast i64* %tmp0 to i32*
+  %tmp2 = getelementptr inbounds i32*, i32** %a, i64 0
+  %tmp3 = getelementptr inbounds i32*, i32** %tmp2, i64 %i
+  store i32* %tmp1, i32** %tmp3, align 8
+  %i.next = add nuw nsw i64 %i, 2
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
+
+; CHECK-LABEL: no_gep_or_bitcast
+; CHECK-NOT:   LV: Found scalar instruction: %tmp1 = load i32*, i32** %tmp0, align 8
+; CHECK:       LV: Found scalar instruction: %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+; CHECK-NEXT:  LV: Found scalar instruction: %i.next = add nuw nsw i64 %i, 1
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32*, i32** %a, i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i32** [[TMP1]] to <2 x i32*>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i32*>, <2 x i32*>* [[TMP2]], align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x i32*> [[WIDE_LOAD]], i32 0
+; CHECK-NEXT:    store i32 0, i32* [[TMP3]], align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32*> [[WIDE_LOAD]], i32 1
+; CHECK-NEXT:    store i32 0, i32* [[TMP4]], align 8
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 2
+; CHECK:         br i1 {{.*}}, label %middle.block, label %vector.body
+;
+define void @no_gep_or_bitcast(i32** noalias %a, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+  %tmp0 = getelementptr inbounds i32*, i32** %a, i64 %i
+  %tmp1 = load i32*, i32** %tmp0, align 8
+  store i32 0, i32* %tmp1, align 8
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}

Added: llvm/trunk/test/Transforms/LoopVectorize/loop-vect-memdep.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/loop-vect-memdep.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/loop-vect-memdep.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/loop-vect-memdep.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,26 @@
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+; RUN: opt < %s -S -loop-vectorize -debug-only=loop-vectorize 2>&1 | FileCheck %s
+; REQUIRES: asserts
+; CHECK: LV: Can't vectorize due to memory conflicts
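+;
+; Each iteration swaps a[i] and a[i+1]; the store to a[i+1] in iteration i is
+; reloaded in iteration i+1, a true dependence at distance 1 that the
+; dependence analysis rejects.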
+
+define void @test_loop_novect(double** %arr, i64 %n) {
+for.body.lr.ph:
+  %t = load double*, double** %arr, align 8
+  br label %for.body
+
+for.body:                                      ; preds = %for.body, %for.body.lr.ph
+  %i = phi i64 [ 0, %for.body.lr.ph ], [ %i.next, %for.body ]
+  %a = getelementptr inbounds double, double* %t, i64 %i
+  %i.next = add nuw nsw i64 %i, 1
+  %a.next = getelementptr inbounds double, double* %t, i64 %i.next
+  %t1 = load double, double* %a, align 8
+  %t2 = load double, double* %a.next, align 8
+  store double %t1, double* %a.next, align 8
+  store double %t2, double* %a, align 8
+  %c = icmp eq i64 %i, %n
+  br i1 %c, label %final, label %for.body
+
+final:                                   ; preds = %for.body
+  ret void
+}

Added: llvm/trunk/test/Transforms/LoopVectorize/memdep.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/memdep.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/memdep.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/memdep.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,273 @@
+; RUN: opt < %s -loop-vectorize -force-vector-width=2 -force-vector-interleave=1 -S | FileCheck %s
+; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -S | FileCheck %s -check-prefix=WIDTH
+; RUN: opt -S -loop-vectorize -force-vector-width=4 < %s | FileCheck %s -check-prefix=RIGHTVF
+; RUN: opt -S -loop-vectorize -force-vector-width=8 < %s | FileCheck %s -check-prefix=WRONGVF
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+; Vectorization with dependence checks.
+
+; No plausible dependence - can be vectorized.
+;  for (i = 0; i < 1024; ++i)
+;    A[i] = A[i + 1] + 1;
+
+; CHECK-LABEL: @f1_vec(
+; CHECK: <2 x i32>
+
+define void @f1_vec(i32* %A) {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i32 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %indvars.iv.next = add i32 %indvars.iv, 1
+  %arrayidx = getelementptr inbounds i32, i32* %A, i32 %indvars.iv.next
+  %0 = load i32, i32* %arrayidx, align 4
+  %add1 = add nsw i32 %0, 1
+  %arrayidx3 = getelementptr inbounds i32, i32* %A, i32 %indvars.iv
+  store i32 %add1, i32* %arrayidx3, align 4
+  %exitcond = icmp ne i32 %indvars.iv.next, 1024
+  br i1 %exitcond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
+
+; Plausible dependence of distance 1 - can't be vectorized.
+;  for (i = 0; i < 1024; ++i)
+;    A[i+1] = A[i] + 1;
+
+; CHECK-LABEL: @f2_novec(
+; CHECK-NOT: <2 x i32>
+
+define void @f2_novec(i32* %A) {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i32 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i32, i32* %A, i32 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %add = add nsw i32 %0, 1
+  %indvars.iv.next = add i32 %indvars.iv, 1
+  %arrayidx3 = getelementptr inbounds i32, i32* %A, i32 %indvars.iv.next
+  store i32 %add, i32* %arrayidx3, align 4
+  %exitcond = icmp ne i32 %indvars.iv.next, 1024
+  br i1 %exitcond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
+
+; Plausible dependence of distance 2 - can be vectorized with a width of 2.
+;  for (i = 0; i < 1024; ++i)
+;    A[i+2] = A[i] + 1;
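+;
+; With VF=2, the lanes load A[i] and A[i+1], both written at least two scalar
+; iterations earlier (in a previous vector iteration), so the distance-2
+; dependence is honored. With VF=4 the lanes would also load A[i+2] and
+; A[i+3] before the same vector iteration's stores, which is why WIDTH-NOT
+; requires that no <4 x i32> operations are formed.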
+
+; CHECK-LABEL: @f3_vec_len(
+; CHECK: <2 x i32>
+
+; WIDTH: f3_vec_len
+; WIDTH-NOT: <4 x i32>
+
+define void @f3_vec_len(i32* %A) {
+entry:
+  br label %for.body
+
+for.body:
+  %i.01 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %idxprom = sext i32 %i.01 to i64
+  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %idxprom
+  %0 = load i32, i32* %arrayidx, align 4
+  %add = add nsw i32 %0, 1
+  %add1 = add nsw i32 %i.01, 2
+  %idxprom2 = sext i32 %add1 to i64
+  %arrayidx3 = getelementptr inbounds i32, i32* %A, i64 %idxprom2
+  store i32 %add, i32* %arrayidx3, align 4
+  %inc = add nsw i32 %i.01, 1
+  %cmp = icmp slt i32 %inc, 1024
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
+
+; Plausible dependence of distance 1 - cannot be vectorized (without reordering
+; accesses).
+;   for (i = 0; i < 1024; ++i) {
+;     B[i] = A[i];
+;     A[i] = B[i + 1];
+;   }
+
+; CHECK-LABEL: @f5(
+; CHECK-NOT: <2 x i32>
+
+define void @f5(i32*  %A, i32* %B) {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
+  store i32 %0, i32* %arrayidx2, align 4
+  %indvars.iv.next = add nsw i64 %indvars.iv, 1
+  %arrayidx4 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv.next
+  %1 = load i32, i32* %arrayidx4, align 4
+  store i32 %1, i32* %arrayidx, align 4
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp ne i32 %lftr.wideiv, 1024
+  br i1 %exitcond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
+
+; Dependence through a phi node - must not vectorize.
+;   for (i = 0; i < 1024; ++i) {
+;     a[i+1] = tmp;
+;     tmp = a[i];
+;   }
+
+; CHECK-LABEL: @f6
+; CHECK-NOT: <2 x i32>
+
+define i32 @f6(i32* %a, i32 %tmp) {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %tmp.addr.08 = phi i32 [ %tmp, %entry ], [ %0, %for.body ]
+  %indvars.iv.next = add nsw i64 %indvars.iv, 1
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv.next
+  store i32 %tmp.addr.08, i32* %arrayidx, align 4
+  %arrayidx3 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx3, align 4
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp ne i32 %lftr.wideiv, 1024
+  br i1 %exitcond, label %for.body, label %for.end
+
+for.end:
+  ret i32 undef
+}
+
+; Don't vectorize a true loop-carried dependence whose distance is not a
+; multiple of the vector width.
+; Example:
+;   for (int i = ...; ++i) {
+;     a[i] = a[i-3] + ...;
+; It is a bad idea to vectorize this loop because store-load forwarding will not
+; happen.
+;
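+; For example, with VF=2 a vector load of <A[i-3], A[i-2]> straddles the two
+; earlier vector stores <A[i-4], A[i-3]> and <A[i-2], A[i-1]>, so the store
+; buffer cannot forward a single store to the load and the pipeline stalls.
+;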
+
+; CHECK-LABEL: @nostoreloadforward(
+; CHECK-NOT: <2 x i32>
+
+define void @nostoreloadforward(i32* %A) {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 16, %entry ], [ %indvars.iv.next, %for.body ]
+  %0 = add nsw i64 %indvars.iv, -3
+  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %0
+  %1 = load i32, i32* %arrayidx, align 4
+  %2 = add nsw i64 %indvars.iv, 4
+  %arrayidx2 = getelementptr inbounds i32, i32* %A, i64 %2
+  %3 = load i32, i32* %arrayidx2, align 4
+  %add3 = add nsw i32 %3, %1
+  %arrayidx5 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+  store i32 %add3, i32* %arrayidx5, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp ne i32 %lftr.wideiv, 128
+  br i1 %exitcond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
+
+; Example:
+;   for (int i = ...; ++i) {
+;     a[i] = b[i];
+;     c[i] = a[i-3] + ...;
+; It is a bad idea to vectorize this loop because store-load forwarding will not
+; happen.
+;
+
+; CHECK-LABEL: @nostoreloadforward2(
+; CHECK-NOT: <2 x i32>
+
+define void @nostoreloadforward2(i32* noalias %A, i32* noalias %B, i32* noalias %C) {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 16, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+  store i32 %0, i32* %arrayidx2, align 4
+  %1 = add nsw i64 %indvars.iv, -3
+  %arrayidx4 = getelementptr inbounds i32, i32* %A, i64 %1
+  %2 = load i32, i32* %arrayidx4, align 4
+  %arrayidx6 = getelementptr inbounds i32, i32* %C, i64 %indvars.iv
+  store i32 %2, i32* %arrayidx6, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp ne i32 %lftr.wideiv, 128
+  br i1 %exitcond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
+
+
+; Check the new calculation of the maximum safe dependence distance (in bits)
+; that can be vectorized. The previous behavior did not take into account that
+; the stride is 2, so the maxVF was computed as 8 instead of 4; the dependence
+; distance here is 6 iterations, given by |N-(N-12)|/2.
+
+; #define M 32
+; #define N 2 * M
+; unsigned int a[N];
+; void pr34283() {
+;   unsigned int j = 0;
+;   for (j = 0; j < M - 6; ++j) {
+;     a[N - 2 * j] = 69;
+;     a[N - 12 - 2 * j] = 7;
+;   }
+; }
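+;
+; Worked out for this test: N = 64, and within one iteration the two stores
+; hit a[64 - 2*j] and a[52 - 2*j], 12 elements apart with a stride of 2, so
+; the dependence distance is |N - (N - 12)|/2 = 6 iterations and the largest
+; safe power-of-two VF is 4, as the RIGHTVF/WRONGVF checks below verify.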
+
+; RIGHTVF-LABEL: @pr34283
+; RIGHTVF: <4 x i64>
+
+; WRONGVF-LABEL: @pr34283
+; WRONGVF-NOT: <8 x i64>
+
+ at a = common local_unnamed_addr global [64 x i32] zeroinitializer, align 16
+
+; Function Attrs: norecurse nounwind uwtable
+define void @pr34283() local_unnamed_addr {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %0 = shl i64 %indvars.iv, 1
+  %1 = sub nuw nsw i64 64, %0
+  %arrayidx = getelementptr inbounds [64 x i32], [64 x i32]* @a, i64 0, i64 %1
+  store i32 69, i32* %arrayidx, align 8
+  %2 = sub nuw nsw i64 52, %0
+  %arrayidx4 = getelementptr inbounds [64 x i32], [64 x i32]* @a, i64 0, i64 %2
+  store i32 7, i32* %arrayidx4, align 8
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 26
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+

Added: llvm/trunk/test/Transforms/LoopVectorize/metadata-unroll.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/metadata-unroll.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/metadata-unroll.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/metadata-unroll.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,40 @@
+; RUN: opt < %s  -loop-vectorize -force-vector-width=4 -dce -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+ at a = common global [2048 x i32] zeroinitializer, align 16
+
+; This is the loop.
+;  for (i=0; i<n; i++){
+;    a[i] += i;
+;  }
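+; The llvm.loop.interleave.count metadata (!1 at the bottom of this file)
+; requests an interleave factor of 2, which is why each widened load, add,
+; and store appears twice in the checks below.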
+;CHECK-LABEL: @inc(
+;CHECK: load <4 x i32>
+;CHECK: load <4 x i32>
+;CHECK: add nsw <4 x i32>
+;CHECK: add nsw <4 x i32>
+;CHECK: store <4 x i32>
+;CHECK: store <4 x i32>
+;CHECK: ret void
+define void @inc(i32 %n) nounwind uwtable noinline ssp {
+  %1 = icmp sgt i32 %n, 0
+  br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph:                                           ; preds = %0, %.lr.ph
+  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
+  %2 = getelementptr inbounds [2048 x i32], [2048 x i32]* @a, i64 0, i64 %indvars.iv
+  %3 = load i32, i32* %2, align 4
+  %4 = trunc i64 %indvars.iv to i32
+  %5 = add nsw i32 %3, %4
+  store i32 %5, i32* %2, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %._crit_edge, label %.lr.ph, !llvm.loop !0
+
+._crit_edge:                                      ; preds = %.lr.ph, %0
+  ret void
+}
+
+!0 = !{!0, !1}
+!1 = !{!"llvm.loop.interleave.count", i32 2}

Added: llvm/trunk/test/Transforms/LoopVectorize/metadata-width.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/metadata-width.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/metadata-width.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/metadata-width.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,30 @@
+; RUN: opt < %s  -loop-vectorize -force-vector-interleave=1 -dce -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
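+; The llvm.loop.vectorize.width metadata (!1 at the bottom of this file)
+; requests a vectorization width of 8, hence the <8 x i32> store below even
+; though no -force-vector-width flag is passed.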
+; CHECK-LABEL: @test1(
+; CHECK: store <8 x i32>
+; CHECK: ret void
+define void @test1(i32* nocapture %a, i32 %n) #0 {
+entry:
+  %cmp4 = icmp sgt i32 %n, 0
+  br i1 %cmp4, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+  %0 = trunc i64 %indvars.iv to i32
+  store i32 %0, i32* %arrayidx, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!0 = !{!0, !1}
+!1 = !{!"llvm.loop.vectorize.width", i32 8}

Added: llvm/trunk/test/Transforms/LoopVectorize/metadata.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/metadata.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/metadata.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/metadata.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,43 @@
+; RUN: opt < %s  -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -S | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
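+; Check that the !tbaa metadata on the scalar load and store survives
+; vectorization and is attached to the widened <4 x float> load and
+; <4 x i32> store (see the CHECK lines after the function).
+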
+; Function Attrs: nounwind uwtable
+define i32 @test1(i32* nocapture %a, float* nocapture readonly %b) #0 {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %b, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4, !tbaa !0
+  %conv = fptosi float %0 to i32
+  %arrayidx2 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+  store i32 %conv, i32* %arrayidx2, align 4, !tbaa !4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1600
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret i32 0
+}
+
+; CHECK-LABEL: @test1
+; CHECK: load <4 x float>, <4 x float>* %{{.*}}, align 4, !tbaa ![[TFLT:[0-9]+]]
+; CHECK: store <4 x i32> %{{.*}}, <4 x i32>* %{{.*}}, align 4, !tbaa ![[TINT:[0-9]+]]
+; CHECK: ret i32 0
+
+; CHECK-DAG: ![[TFLT]] = !{![[TFLT1:[0-9]+]]
+; CHECK-DAG: ![[TFLT1]] = !{!"float"
+
+; CHECK-DAG: ![[TINT]] = !{![[TINT1:[0-9]+]]
+; CHECK-DAG: ![[TINT1]] = !{!"int"
+
+attributes #0 = { nounwind uwtable }
+
+!0 = !{!1, !1, i64 0}
+!1 = !{!"float", !2, i64 0}
+!2 = !{!"omnipotent char", !3, i64 0}
+!3 = !{!"Simple C/C++ TBAA"}
+!4 = !{!5, !5, i64 0}
+!5 = !{!"int", !2, i64 0}
+

Added: llvm/trunk/test/Transforms/LoopVectorize/middle-block-dbg.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/middle-block-dbg.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/middle-block-dbg.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/middle-block-dbg.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,110 @@
+; RUN: opt -loop-vectorize -force-vector-width=2 -S < %s | FileCheck %s
+;
+; Confirm that the DebugLoc info for the instructions in the middle block of a
+; vectorized loop is correct. The Cmp and Br instructions should map to the
+; same source lines as the Cmp and Br of the scalar loop.
+
+; CHECK-LABEL: middle.block:
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq {{.*}}!dbg ![[DL:[0-9]+]]
+; CHECK-NEXT: br i1 [[CMP]]{{.*}} !dbg ![[DL]]
+; CHECK: ![[DL]] = !DILocation(line: 6,
+
+; This IR can be generated by running:
+; clang -g -O2 -emit-llvm -S -mllvm -opt-bisect-limit=68 vec.cpp -o - | opt -loop-vectorize -force-vector-width=2 -S -o vec.ll
+;
+; Where vec.cpp contains:
+;
+; extern int x;
+; extern int y;
+; void a() {
+;     const int len = x;
+;     int b[len];
+;     for(int i = 0; i< len; ++i)
+;         b[i] = x;
+;
+;     y = b[x] + b[x-5];
+; }
+
+target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128"
+
+@"?x@@3HA" = external dso_local local_unnamed_addr global i32, align 4
+@"?y@@3HA" = external dso_local local_unnamed_addr global i32, align 4
+
+define dso_local void @"?a@@YAXXZ"() local_unnamed_addr #0 !dbg !8 {
+entry:
+  %0 = load i32, i32* @"?x@@3HA", align 4, !dbg !23, !tbaa !24
+  %1 = zext i32 %0 to i64, !dbg !28
+  %vla = alloca i32, i64 %1, align 16, !dbg !28
+  %cmp10 = icmp sgt i32 %0, 0, !dbg !30
+  br i1 %cmp10, label %for.body.preheader, label %for.cond.cleanup, !dbg !30
+
+for.body.preheader:
+  br label %for.body, !dbg !31
+
+for.cond.cleanup.loopexit:
+  %idxprom1.phi.trans.insert = sext i32 %0 to i64
+  %arrayidx2.phi.trans.insert = getelementptr inbounds i32, i32* %vla, i64 %idxprom1.phi.trans.insert
+  %.pre = load i32, i32* %arrayidx2.phi.trans.insert, align 4, !dbg !33, !tbaa !24
+  br label %for.cond.cleanup, !dbg !33
+
+for.cond.cleanup:
+  %2 = phi i32 [ %.pre, %for.cond.cleanup.loopexit ], [ undef, %entry ], !dbg !33
+  %sub = add nsw i32 %0, -5, !dbg !33
+  %idxprom3 = sext i32 %sub to i64, !dbg !33
+  %arrayidx4 = getelementptr inbounds i32, i32* %vla, i64 %idxprom3, !dbg !33
+  %3 = load i32, i32* %arrayidx4, align 4, !dbg !33, !tbaa !24
+  %add = add nsw i32 %3, %2, !dbg !33
+  store i32 %add, i32* @"?y@@3HA", align 4, !dbg !33, !tbaa !24
+  ret void, !dbg !34
+
+for.body:
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i32, i32* %vla, i64 %indvars.iv, !dbg !31
+  store i32 %0, i32* %arrayidx, align 4, !dbg !31, !tbaa !24
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1, !dbg !35
+  %exitcond = icmp eq i64 %indvars.iv.next, %1, !dbg !30
+  br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body, !dbg !30, !llvm.loop !36
+}
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4, !5, !6}
+!llvm.ident = !{!7}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 9.0.0 (https://github.com/llvm/llvm-project.git 045b8544fd2c4e14f7e72e0df2bc681d823b0838)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, nameTableKind: None)
+!1 = !DIFile(filename: "vec.cpp", directory: "C:\5CUsers\5Cgbhyamso\5Cdev\5Cllvm\5Csamples", checksumkind: CSK_MD5, checksum: "fed997f50117f5514a69caf1c2fb2c49")
+!2 = !{}
+!3 = !{i32 2, !"CodeView", i32 1}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = !{i32 1, !"wchar_size", i32 2}
+!6 = !{i32 7, !"PIC Level", i32 2}
+!7 = !{!"clang version 9.0.0 (https://github.com/llvm/llvm-project.git 045b8544fd2c4e14f7e72e0df2bc681d823b0838)"}
+!8 = distinct !DISubprogram(name: "a", linkageName: "?a@@YAXXZ", scope: !1, file: !1, line: 3, type: !9, scopeLine: 3, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !11)
+!9 = !DISubroutineType(types: !10)
+!10 = !{null}
+!11 = !{!12, !15, !17, !21}
+!12 = !DILocalVariable(name: "len", scope: !8, file: !1, line: 4, type: !13)
+!13 = !DIDerivedType(tag: DW_TAG_const_type, baseType: !14)
+!14 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!15 = !DILocalVariable(name: "__vla_expr0", scope: !8, type: !16, flags: DIFlagArtificial)
+!16 = !DIBasicType(name: "long long unsigned int", size: 64, encoding: DW_ATE_unsigned)
+!17 = !DILocalVariable(name: "b", scope: !8, file: !1, line: 5, type: !18)
+!18 = !DICompositeType(tag: DW_TAG_array_type, baseType: !14, elements: !19)
+!19 = !{!20}
+!20 = !DISubrange(count: !15)
+!21 = !DILocalVariable(name: "i", scope: !22, file: !1, line: 6, type: !14)
+!22 = distinct !DILexicalBlock(scope: !8, file: !1, line: 6)
+!23 = !DILocation(line: 4, scope: !8)
+!24 = !{!25, !25, i64 0}
+!25 = !{!"int", !26, i64 0}
+!26 = !{!"omnipotent char", !27, i64 0}
+!27 = !{!"Simple C++ TBAA"}
+!28 = !DILocation(line: 5, scope: !8)
+!29 = !DILocation(line: 0, scope: !8)
+!30 = !DILocation(line: 6, scope: !22)
+!31 = !DILocation(line: 7, scope: !32)
+!32 = distinct !DILexicalBlock(scope: !22, file: !1, line: 6)
+!33 = !DILocation(line: 9, scope: !8)
+!34 = !DILocation(line: 10, scope: !8)
+!35 = !DILocation(line: 6, scope: !32)
+!36 = distinct !{!36, !30, !37}
+!37 = !DILocation(line: 7, scope: !22)

Added: llvm/trunk/test/Transforms/LoopVectorize/miniters.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/miniters.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/miniters.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/miniters.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,44 @@
+; RUN: opt %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -S | FileCheck %s
+; RUN: opt %s -loop-vectorize -force-vector-interleave=2 -force-vector-width=4 -S | FileCheck %s -check-prefix=UNROLL
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+ at b = common global [1000 x i32] zeroinitializer, align 16
+ at c = common global [1000 x i32] zeroinitializer, align 16
+ at a = common global [1000 x i32] zeroinitializer, align 16
+
+; Generate min.iters.check to skip the vector loop and jump directly to
+; scalar.ph when the loop iteration count is less than VF * UF.
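+; For example, with VF=4 and UF=1 the guard is "icmp ult i64 %N, 4"; with
+; VF=4 and UF=2 the threshold becomes 8, matching the CHECK and UNROLL lines
+; below.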
+; CHECK-LABEL: foo(
+; CHECK: %min.iters.check = icmp ult i64 %N, 4
+; CHECK: br i1 %min.iters.check, label %scalar.ph, label %vector.ph
+; UNROLL-LABEL: foo(
+; UNROLL: %min.iters.check = icmp ult i64 %N, 8
+; UNROLL: br i1 %min.iters.check, label %scalar.ph, label %vector.ph
+
+define void @foo(i64 %N) {
+entry:
+  %cmp.8 = icmp sgt i64 %N, 0
+  br i1 %cmp.8, label %for.body.preheader, label %for.end
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %for.body.preheader
+  %i.09 = phi i64 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds [1000 x i32], [1000 x i32]* @b, i64 0, i64 %i.09
+  %tmp = load i32, i32* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds [1000 x i32], [1000 x i32]* @c, i64 0, i64 %i.09
+  %tmp1 = load i32, i32* %arrayidx1, align 4
+  %add = add nsw i32 %tmp1, %tmp
+  %arrayidx2 = getelementptr inbounds [1000 x i32], [1000 x i32]* @a, i64 0, i64 %i.09
+  store i32 %add, i32* %arrayidx2, align 4
+  %inc = add nuw nsw i64 %i.09, 1
+  %exitcond = icmp eq i64 %inc, %N
+  br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:                                 ; preds = %for.body
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %entry
+  ret void
+}

Added: llvm/trunk/test/Transforms/LoopVectorize/minmax_reduction.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/minmax_reduction.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/minmax_reduction.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/minmax_reduction.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,885 @@
+; RUN: opt -S -loop-vectorize -dce -instcombine -force-vector-width=2 -force-vector-interleave=1  < %s | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+ at A = common global [1024 x i32] zeroinitializer, align 16
+ at fA = common global [1024 x float] zeroinitializer, align 16
+ at dA = common global [1024 x double] zeroinitializer, align 16
+
+; Signed tests.
+
+; Turn this into a max reduction. Make sure we use a splat to initialize the
+; vector for the reduction.
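+; In C terms, the loop is (illustrative only):
+;   for (i = 0; i < 1024; ++i)
+;     if (A[i] > max) max = A[i];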
+; CHECK-LABEL: @max_red(
+; CHECK: %[[VAR:.*]] = insertelement <2 x i32> undef, i32 %max, i32 0
+; CHECK: {{.*}} = shufflevector <2 x i32> %[[VAR]], <2 x i32> undef, <2 x i32> zeroinitializer
+; CHECK: icmp sgt <2 x i32>
+; CHECK: select <2 x i1>
+; CHECK: middle.block
+; CHECK: icmp sgt <2 x i32>
+; CHECK: select <2 x i1>
+
+define i32 @max_red(i32 %max) {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %max.red.08 = phi i32 [ %max, %entry ], [ %max.red.0, %for.body ]
+  %arrayidx = getelementptr inbounds [1024 x i32], [1024 x i32]* @A, i64 0, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %cmp3 = icmp sgt i32 %0, %max.red.08
+  %max.red.0 = select i1 %cmp3, i32 %0, i32 %max.red.08
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret i32 %max.red.0
+}
+
+; Turn this into a max reduction: the select has its inputs reversed, so this
+; is still a max reduction.
+; CHECK-LABEL: @max_red_inverse_select(
+; CHECK: icmp slt <2 x i32>
+; CHECK: select <2 x i1>
+; CHECK: middle.block
+; CHECK: icmp sgt <2 x i32>
+; CHECK: select <2 x i1>
+
+define i32 @max_red_inverse_select(i32 %max) {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %max.red.08 = phi i32 [ %max, %entry ], [ %max.red.0, %for.body ]
+  %arrayidx = getelementptr inbounds [1024 x i32], [1024 x i32]* @A, i64 0, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %cmp3 = icmp slt i32 %max.red.08, %0
+  %max.red.0 = select i1 %cmp3, i32 %0, i32 %max.red.08
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret i32 %max.red.0
+}
+
+; Turn this into a min reduction.
+; CHECK-LABEL: @min_red(
+; CHECK: icmp slt <2 x i32>
+; CHECK: select <2 x i1>
+; CHECK: middle.block
+; CHECK: icmp slt <2 x i32>
+; CHECK: select <2 x i1>
+
+define i32 @min_red(i32 %max) {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %max.red.08 = phi i32 [ %max, %entry ], [ %max.red.0, %for.body ]
+  %arrayidx = getelementptr inbounds [1024 x i32], [1024 x i32]* @A, i64 0, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %cmp3 = icmp slt i32 %0, %max.red.08
+  %max.red.0 = select i1 %cmp3, i32 %0, i32 %max.red.08
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret i32 %max.red.0
+}
+
+; Turn this into a min reduction: the select has its inputs reversed, so this
+; is still a min reduction.
+; CHECK-LABEL: @min_red_inverse_select(
+; CHECK: icmp sgt <2 x i32>
+; CHECK: select <2 x i1>
+; CHECK: middle.block
+; CHECK: icmp slt <2 x i32>
+; CHECK: select <2 x i1>
+
+define i32 @min_red_inverse_select(i32 %max) {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %max.red.08 = phi i32 [ %max, %entry ], [ %max.red.0, %for.body ]
+  %arrayidx = getelementptr inbounds [1024 x i32], [1024 x i32]* @A, i64 0, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %cmp3 = icmp sgt i32 %max.red.08, %0
+  %max.red.0 = select i1 %cmp3, i32 %0, i32 %max.red.08
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret i32 %max.red.0
+}
+
+; Unsigned tests.
+
+; Turn this into a max reduction.
+; CHECK-LABEL: @umax_red(
+; CHECK: icmp ugt <2 x i32>
+; CHECK: select <2 x i1>
+; CHECK: middle.block
+; CHECK: icmp ugt <2 x i32>
+; CHECK: select <2 x i1>
+
+define i32 @umax_red(i32 %max) {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %max.red.08 = phi i32 [ %max, %entry ], [ %max.red.0, %for.body ]
+  %arrayidx = getelementptr inbounds [1024 x i32], [1024 x i32]* @A, i64 0, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %cmp3 = icmp ugt i32 %0, %max.red.08
+  %max.red.0 = select i1 %cmp3, i32 %0, i32 %max.red.08
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret i32 %max.red.0
+}
+
+; Turn this into a max reduction: the select has its inputs reversed, so this
+; is still a max reduction.
+; CHECK-LABEL: @umax_red_inverse_select(
+; CHECK: icmp ult <2 x i32>
+; CHECK: select <2 x i1>
+; CHECK: middle.block
+; CHECK: icmp ugt <2 x i32>
+; CHECK: select <2 x i1>
+
+define i32 @umax_red_inverse_select(i32 %max) {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %max.red.08 = phi i32 [ %max, %entry ], [ %max.red.0, %for.body ]
+  %arrayidx = getelementptr inbounds [1024 x i32], [1024 x i32]* @A, i64 0, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %cmp3 = icmp ult i32 %max.red.08, %0
+  %max.red.0 = select i1 %cmp3, i32 %0, i32 %max.red.08
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret i32 %max.red.0
+}
+
+; Turn this into a min reduction.
+; CHECK-LABEL: @umin_red(
+; CHECK: icmp ult <2 x i32>
+; CHECK: select <2 x i1>
+; CHECK: middle.block
+; CHECK: icmp ult <2 x i32>
+; CHECK: select <2 x i1>
+
+define i32 @umin_red(i32 %max) {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %max.red.08 = phi i32 [ %max, %entry ], [ %max.red.0, %for.body ]
+  %arrayidx = getelementptr inbounds [1024 x i32], [1024 x i32]* @A, i64 0, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %cmp3 = icmp ult i32 %0, %max.red.08
+  %max.red.0 = select i1 %cmp3, i32 %0, i32 %max.red.08
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret i32 %max.red.0
+}
+
+; Turn this into a min reduction: the select has its inputs reversed, so this
+; is still a min reduction.
+; CHECK-LABEL: @umin_red_inverse_select(
+; CHECK: icmp ugt <2 x i32>
+; CHECK: select <2 x i1>
+; CHECK: middle.block
+; CHECK: icmp ult <2 x i32>
+; CHECK: select <2 x i1>
+
+define i32 @umin_red_inverse_select(i32 %max) {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %max.red.08 = phi i32 [ %max, %entry ], [ %max.red.0, %for.body ]
+  %arrayidx = getelementptr inbounds [1024 x i32], [1024 x i32]* @A, i64 0, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %cmp3 = icmp ugt i32 %max.red.08, %0
+  %max.red.0 = select i1 %cmp3, i32 %0, i32 %max.red.08
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret i32 %max.red.0
+}
+
+; SGE -> SLT
+; Turn this into a min reduction (select inputs are reversed).
+; CHECK-LABEL: @sge_min_red(
+; CHECK: icmp slt <2 x i32>
+; CHECK: select <2 x i1>
+; CHECK: middle.block
+; CHECK: icmp slt <2 x i32>
+; CHECK: select <2 x i1>
+
+define i32 @sge_min_red(i32 %max) {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %max.red.08 = phi i32 [ %max, %entry ], [ %max.red.0, %for.body ]
+  %arrayidx = getelementptr inbounds [1024 x i32], [1024 x i32]* @A, i64 0, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %cmp3 = icmp sge i32 %0, %max.red.08
+  %max.red.0 = select i1 %cmp3, i32 %max.red.08, i32 %0
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret i32 %max.red.0
+}
+
+; SLE -> SGT
+; Turn this into a max reduction (select inputs are reversed).
+; CHECK-LABEL: @sle_min_red(
+; CHECK: icmp sgt <2 x i32>
+; CHECK: select <2 x i1>
+; CHECK: middle.block
+; CHECK: icmp sgt <2 x i32>
+; CHECK: select <2 x i1>
+
+define i32 @sle_min_red(i32 %max) {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %max.red.08 = phi i32 [ %max, %entry ], [ %max.red.0, %for.body ]
+  %arrayidx = getelementptr inbounds [1024 x i32], [1024 x i32]* @A, i64 0, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %cmp3 = icmp sle i32 %0, %max.red.08
+  %max.red.0 = select i1 %cmp3, i32 %max.red.08, i32 %0
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret i32 %max.red.0
+}
+
+; UGE -> ULT
+; Turn this into a min reduction (select inputs are reversed).
+; CHECK-LABEL: @uge_min_red(
+; CHECK: icmp ult <2 x i32>
+; CHECK: select <2 x i1>
+; CHECK: middle.block
+; CHECK: icmp ult <2 x i32>
+; CHECK: select <2 x i1>
+
+define i32 @uge_min_red(i32 %max) {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %max.red.08 = phi i32 [ %max, %entry ], [ %max.red.0, %for.body ]
+  %arrayidx = getelementptr inbounds [1024 x i32], [1024 x i32]* @A, i64 0, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %cmp3 = icmp uge i32 %0, %max.red.08
+  %max.red.0 = select i1 %cmp3, i32 %max.red.08, i32 %0
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret i32 %max.red.0
+}
+
+; ULE -> UGT
+; Turn this into a max reduction (select inputs are reversed).
+; CHECK-LABEL: @ule_min_red(
+; CHECK: icmp ugt <2 x i32>
+; CHECK: select <2 x i1>
+; CHECK: middle.block
+; CHECK: icmp ugt <2 x i32>
+; CHECK: select <2 x i1>
+
+define i32 @ule_min_red(i32 %max) {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %max.red.08 = phi i32 [ %max, %entry ], [ %max.red.0, %for.body ]
+  %arrayidx = getelementptr inbounds [1024 x i32], [1024 x i32]* @A, i64 0, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %cmp3 = icmp ule i32 %0, %max.red.08
+  %max.red.0 = select i1 %cmp3, i32 %max.red.08, i32 %0
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret i32 %max.red.0
+}
+
+; No reduction.
+; CHECK-LABEL: @no_red_1(
+; CHECK-NOT: icmp <2 x i32>
+define i32 @no_red_1(i32 %max) {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %max.red.08 = phi i32 [ %max, %entry ], [ %max.red.0, %for.body ]
+  %arrayidx = getelementptr inbounds [1024 x i32], [1024 x i32]* @A, i64 0, i64 %indvars.iv
+  %arrayidx1 = getelementptr inbounds [1024 x i32], [1024 x i32]* @A, i64 1, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %1 = load i32, i32* %arrayidx1, align 4
+  %cmp3 = icmp sgt i32 %0, %1
+  %max.red.0 = select i1 %cmp3, i32 %0, i32 %max.red.08
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret i32 %max.red.0
+}
+
+; CHECK-LABEL: @no_red_2(
+; CHECK-NOT: icmp <2 x i32>
+define i32 @no_red_2(i32 %max) {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %max.red.08 = phi i32 [ %max, %entry ], [ %max.red.0, %for.body ]
+  %arrayidx = getelementptr inbounds [1024 x i32], [1024 x i32]* @A, i64 0, i64 %indvars.iv
+  %arrayidx1 = getelementptr inbounds [1024 x i32], [1024 x i32]* @A, i64 1, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %1 = load i32, i32* %arrayidx1, align 4
+  %cmp3 = icmp sgt i32 %0, %max.red.08
+  %max.red.0 = select i1 %cmp3, i32 %0, i32 %1
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret i32 %max.red.0
+}
+
+; Float tests.
+
+; Maximum.
+
+; Turn this into a max reduction in the presence of a no-nans-fp-math attribute.
+; CHECK-LABEL: @max_red_float(
+; CHECK: fcmp fast ogt <2 x float>
+; CHECK: select <2 x i1>
+; CHECK: middle.block
+; CHECK: fcmp fast ogt <2 x float>
+; CHECK: select <2 x i1>
+
+define float @max_red_float(float %max) #0 {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %max.red.08 = phi float [ %max, %entry ], [ %max.red.0, %for.body ]
+  %arrayidx = getelementptr inbounds [1024 x float], [1024 x float]* @fA, i64 0, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %cmp3 = fcmp fast ogt float %0, %max.red.08
+  %max.red.0 = select i1 %cmp3, float %0, float %max.red.08
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret float %max.red.0
+}
+
+; CHECK-LABEL: @max_red_float_ge(
+; CHECK: fcmp fast oge <2 x float>
+; CHECK: select <2 x i1>
+; CHECK: middle.block
+; CHECK: fcmp fast ogt <2 x float>
+; CHECK: select <2 x i1>
+
+define float @max_red_float_ge(float %max) #0 {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %max.red.08 = phi float [ %max, %entry ], [ %max.red.0, %for.body ]
+  %arrayidx = getelementptr inbounds [1024 x float], [1024 x float]* @fA, i64 0, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %cmp3 = fcmp fast oge float %0, %max.red.08
+  %max.red.0 = select i1 %cmp3, float %0, float %max.red.08
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret float %max.red.0
+}
+
+; CHECK-LABEL: @inverted_max_red_float(
+; CHECK: fcmp fast olt <2 x float>
+; CHECK: select <2 x i1>
+; CHECK: middle.block
+; CHECK: fcmp fast ogt <2 x float>
+; CHECK: select <2 x i1>
+
+define float @inverted_max_red_float(float %max) #0 {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %max.red.08 = phi float [ %max, %entry ], [ %max.red.0, %for.body ]
+  %arrayidx = getelementptr inbounds [1024 x float], [1024 x float]* @fA, i64 0, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %cmp3 = fcmp fast olt float %0, %max.red.08
+  %max.red.0 = select i1 %cmp3, float %max.red.08, float %0
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret float %max.red.0
+}
+
+; CHECK-LABEL: @inverted_max_red_float_le(
+; CHECK: fcmp fast ole <2 x float>
+; CHECK: select <2 x i1>
+; CHECK: middle.block
+; CHECK: fcmp fast ogt <2 x float>
+; CHECK: select <2 x i1>
+
+define float @inverted_max_red_float_le(float %max) #0 {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %max.red.08 = phi float [ %max, %entry ], [ %max.red.0, %for.body ]
+  %arrayidx = getelementptr inbounds [1024 x float], [1024 x float]* @fA, i64 0, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %cmp3 = fcmp fast ole float %0, %max.red.08
+  %max.red.0 = select i1 %cmp3, float %max.red.08, float %0
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret float %max.red.0
+}
+
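+; With fast (no-NaNs) math the unordered "ugt" compare below can be treated
+; as ordered; it is canonicalized by inverting the predicate and swapping the
+; select operands, so the vector loop is expected to compare "ole".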
+; CHECK-LABEL: @unordered_max_red_float(
+; CHECK: fcmp fast ole <2 x float>
+; CHECK: select <2 x i1>
+; CHECK: middle.block
+; CHECK: fcmp fast ogt <2 x float>
+; CHECK: select <2 x i1>
+
+define float @unordered_max_red_float(float %max) #0 {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %max.red.08 = phi float [ %max, %entry ], [ %max.red.0, %for.body ]
+  %arrayidx = getelementptr inbounds [1024 x float], [1024 x float]* @fA, i64 0, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %cmp3 = fcmp fast ugt float %0, %max.red.08
+  %max.red.0 = select i1 %cmp3, float %0, float %max.red.08
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret float %max.red.0
+}
+
+; CHECK-LABEL: @unordered_max_red_float_ge(
+; CHECK: fcmp fast olt <2 x float>
+; CHECK: select <2 x i1>
+; CHECK: middle.block
+; CHECK: fcmp fast ogt <2 x float>
+; CHECK: select <2 x i1>
+
+define float @unordered_max_red_float_ge(float %max) #0 {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %max.red.08 = phi float [ %max, %entry ], [ %max.red.0, %for.body ]
+  %arrayidx = getelementptr inbounds [1024 x float], [1024 x float]* @fA, i64 0, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %cmp3 = fcmp fast uge float %0, %max.red.08
+  %max.red.0 = select i1 %cmp3, float %0, float %max.red.08
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret float %max.red.0
+}
+
+; CHECK-LABEL: @inverted_unordered_max_red_float(
+; CHECK: fcmp fast oge <2 x float>
+; CHECK: select <2 x i1>
+; CHECK: middle.block
+; CHECK: fcmp fast ogt <2 x float>
+; CHECK: select <2 x i1>
+
+define float @inverted_unordered_max_red_float(float %max) #0 {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %max.red.08 = phi float [ %max, %entry ], [ %max.red.0, %for.body ]
+  %arrayidx = getelementptr inbounds [1024 x float], [1024 x float]* @fA, i64 0, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %cmp3 = fcmp fast ult float %0, %max.red.08
+  %max.red.0 = select i1 %cmp3, float %max.red.08, float %0
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret float %max.red.0
+}
+
+; CHECK-LABEL: @inverted_unordered_max_red_float_le(
+; CHECK: fcmp fast ogt <2 x float>
+; CHECK: select <2 x i1>
+; CHECK: middle.block
+; CHECK: fcmp fast ogt <2 x float>
+; CHECK: select <2 x i1>
+
+define float @inverted_unordered_max_red_float_le(float %max) #0 {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %max.red.08 = phi float [ %max, %entry ], [ %max.red.0, %for.body ]
+  %arrayidx = getelementptr inbounds [1024 x float], [1024 x float]* @fA, i64 0, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %cmp3 = fcmp fast ule float %0, %max.red.08
+  %max.red.0 = select i1 %cmp3, float %max.red.08, float %0
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret float %max.red.0
+}
+
+; Minimum.
+
+; Turn this into a min reduction in the presence of a no-nans-fp-math attribute.
+; CHECK-LABEL: @min_red_float(
+; CHECK: fcmp fast olt <2 x float>
+; CHECK: select <2 x i1>
+; CHECK: middle.block
+; CHECK: fcmp fast olt <2 x float>
+; CHECK: select <2 x i1>
+
+define float @min_red_float(float %min) #0 {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %min.red.08 = phi float [ %min, %entry ], [ %min.red.0, %for.body ]
+  %arrayidx = getelementptr inbounds [1024 x float], [1024 x float]* @fA, i64 0, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %cmp3 = fcmp fast olt float %0, %min.red.08
+  %min.red.0 = select i1 %cmp3, float %0, float %min.red.08
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret float %min.red.0
+}
+
+; CHECK-LABEL: @min_red_float_le(
+; CHECK: fcmp fast ole <2 x float>
+; CHECK: select <2 x i1>
+; CHECK: middle.block
+; CHECK: fcmp fast olt <2 x float>
+; CHECK: select <2 x i1>
+
+define float @min_red_float_le(float %min) #0 {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %min.red.08 = phi float [ %min, %entry ], [ %min.red.0, %for.body ]
+  %arrayidx = getelementptr inbounds [1024 x float], [1024 x float]* @fA, i64 0, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %cmp3 = fcmp fast ole float %0, %min.red.08
+  %min.red.0 = select i1 %cmp3, float %0, float %min.red.08
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret float %min.red.0
+}
+
+; CHECK-LABEL: @inverted_min_red_float(
+; CHECK: fcmp fast ogt <2 x float>
+; CHECK: select <2 x i1>
+; CHECK: middle.block
+; CHECK: fcmp fast olt <2 x float>
+; CHECK: select <2 x i1>
+
+define float @inverted_min_red_float(float %min) #0 {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %min.red.08 = phi float [ %min, %entry ], [ %min.red.0, %for.body ]
+  %arrayidx = getelementptr inbounds [1024 x float], [1024 x float]* @fA, i64 0, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %cmp3 = fcmp fast ogt float %0, %min.red.08
+  %min.red.0 = select i1 %cmp3, float %min.red.08, float %0
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret float %min.red.0
+}
+
+; CHECK-LABEL: @inverted_min_red_float_ge(
+; CHECK: fcmp fast oge <2 x float>
+; CHECK: select <2 x i1>
+; CHECK: middle.block
+; CHECK: fcmp fast olt <2 x float>
+; CHECK: select <2 x i1>
+
+define float @inverted_min_red_float_ge(float %min) #0 {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %min.red.08 = phi float [ %min, %entry ], [ %min.red.0, %for.body ]
+  %arrayidx = getelementptr inbounds [1024 x float], [1024 x float]* @fA, i64 0, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %cmp3 = fcmp fast oge float %0, %min.red.08
+  %min.red.0 = select i1 %cmp3, float %min.red.08, float %0
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret float %min.red.0
+}
+
+; CHECK-LABEL: @unordered_min_red_float(
+; CHECK: fcmp fast oge <2 x float>
+; CHECK: select <2 x i1>
+; CHECK: middle.block
+; CHECK: fcmp fast olt <2 x float>
+; CHECK: select <2 x i1>
+
+define float @unordered_min_red_float(float %min) #0 {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %min.red.08 = phi float [ %min, %entry ], [ %min.red.0, %for.body ]
+  %arrayidx = getelementptr inbounds [1024 x float], [1024 x float]* @fA, i64 0, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %cmp3 = fcmp fast ult float %0, %min.red.08
+  %min.red.0 = select i1 %cmp3, float %0, float %min.red.08
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret float %min.red.0
+}
+
+; CHECK-LABEL: @unordered_min_red_float_le(
+; CHECK: fcmp fast ogt <2 x float>
+; CHECK: select <2 x i1>
+; CHECK: middle.block
+; CHECK: fcmp fast olt <2 x float>
+; CHECK: select <2 x i1>
+
+define float @unordered_min_red_float_le(float %min) #0 {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %min.red.08 = phi float [ %min, %entry ], [ %min.red.0, %for.body ]
+  %arrayidx = getelementptr inbounds [1024 x float], [1024 x float]* @fA, i64 0, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %cmp3 = fcmp fast ule float %0, %min.red.08
+  %min.red.0 = select i1 %cmp3, float %0, float %min.red.08
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret float %min.red.0
+}
+
+; CHECK-LABEL: @inverted_unordered_min_red_float(
+; CHECK: fcmp fast ole <2 x float>
+; CHECK: select <2 x i1>
+; CHECK: middle.block
+; CHECK: fcmp fast olt <2 x float>
+; CHECK: select <2 x i1>
+
+define float @inverted_unordered_min_red_float(float %min) #0 {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %min.red.08 = phi float [ %min, %entry ], [ %min.red.0, %for.body ]
+  %arrayidx = getelementptr inbounds [1024 x float], [1024 x float]* @fA, i64 0, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %cmp3 = fcmp fast ugt float %0, %min.red.08
+  %min.red.0 = select i1 %cmp3, float %min.red.08, float %0
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret float %min.red.0
+}
+
+; CHECK-LABEL: @inverted_unordered_min_red_float_ge(
+; CHECK: fcmp fast olt <2 x float>
+; CHECK: select <2 x i1>
+; CHECK: middle.block
+; CHECK: fcmp fast olt <2 x float>
+; CHECK: select <2 x i1>
+
+define float @inverted_unordered_min_red_float_ge(float %min) #0 {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %min.red.08 = phi float [ %min, %entry ], [ %min.red.0, %for.body ]
+  %arrayidx = getelementptr inbounds [1024 x float], [1024 x float]* @fA, i64 0, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %cmp3 = fcmp fast uge float %0, %min.red.08
+  %min.red.0 = select i1 %cmp3, float %min.red.08, float %0
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret float %min.red.0
+}
+
+; Make sure we handle doubles, too.
+; CHECK-LABEL: @min_red_double(
+; CHECK: fcmp fast olt <2 x double>
+; CHECK: select <2 x i1>
+; CHECK: middle.block
+; CHECK: fcmp fast olt <2 x double>
+; CHECK: select <2 x i1>
+
+define double @min_red_double(double %min) #0 {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %min.red.08 = phi double [ %min, %entry ], [ %min.red.0, %for.body ]
+  %arrayidx = getelementptr inbounds [1024 x double], [1024 x double]* @dA, i64 0, i64 %indvars.iv
+  %0 = load double, double* %arrayidx, align 4
+  %cmp3 = fcmp fast olt double %0, %min.red.08
+  %min.red.0 = select i1 %cmp3, double %0, double %min.red.08
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret double %min.red.0
+}
+
+
+; Don't turn this into a max reduction; the no-nans-fp-math attribute is missing.
+; CHECK-LABEL: @max_red_float_nans(
+; CHECK-NOT: <2 x float>
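+
+; A scalar sketch of why the attribute matters (hypothetical C; fA is the
+; global array used below, init stands for the %max argument):
+;   float max = init;
+;   for (int i = 0; i < 1024; ++i)
+;     if (fA[i] > max) max = fA[i];   // the compare is false whenever fA[i] is
+;                                     // NaN, so NaNs are skipped; without the
+;                                     // no-nans guarantee, a vector max could
+;                                     // change which value is returned.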
+
+define float @max_red_float_nans(float %max) {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %max.red.08 = phi float [ %max, %entry ], [ %max.red.0, %for.body ]
+  %arrayidx = getelementptr inbounds [1024 x float], [1024 x float]* @fA, i64 0, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %cmp3 = fcmp fast ogt float %0, %max.red.08
+  %max.red.0 = select i1 %cmp3, float %0, float %max.red.08
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret float %max.red.0
+}
+
+
+attributes #0 = { "no-nans-fp-math"="true" }

Added: llvm/trunk/test/Transforms/LoopVectorize/multi-use-reduction-bug.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/multi-use-reduction-bug.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/multi-use-reduction-bug.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/multi-use-reduction-bug.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,41 @@
+; RUN: opt -indvars -loop-vectorize -force-vector-width=2 -force-vector-interleave=1 -S < %s | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+; We must not vectorize this loop. %add55 is not a reduction. Its value is used
+; multiple times.
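+; In scalar form the recurrence is roughly (hypothetical sketch, names taken
+; from the IR below):
+;   add5 = (add55 - k7 - inc6) + add55;   // add55 is consumed twice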
+
+; PR18526
+
+; CHECK: multiple_use_of_value
+; CHECK-NOT: <2 x i32>
+
+define void @multiple_use_of_value() {
+entry:
+  %n = alloca i32, align 4
+  %k7 = alloca i32, align 4
+  %nf = alloca i32, align 4
+  %0 = load i32, i32* %k7, align 4
+  %.neg1 = sub i32 0, %0
+  %n.promoted = load i32, i32* %n, align 4
+  %nf.promoted = load i32, i32* %nf, align 4
+  br label %for.body
+
+for.body:
+  %inc107 = phi i32 [ undef, %entry ], [ %inc10, %for.body ]
+  %inc6 = phi i32 [ %nf.promoted, %entry ], [ undef, %for.body ]
+  %add55 = phi i32 [ %n.promoted, %entry ], [ %add5, %for.body ]
+  %.neg2 = sub i32 0, %inc6
+  %add.neg = add i32 0, %add55
+  %add4.neg = add i32 %add.neg, %.neg1
+  %sub = add i32 %add4.neg, %.neg2
+  %add5 = add i32 %sub, %add55
+  %inc10 = add i32 %inc107, 1
+  %cmp = icmp ult i32 %inc10, 61
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:
+  %add5.lcssa = phi i32 [ %add5, %for.body ]
+  store i32 %add5.lcssa, i32* %n, align 4
+  ret void
+}

Added: llvm/trunk/test/Transforms/LoopVectorize/multiple-address-spaces.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/multiple-address-spaces.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/multiple-address-spaces.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/multiple-address-spaces.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,43 @@
+; RUN: opt < %s  -basicaa -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
+
+; From a simple program with two address spaces:
+; char Y[4*10000] __attribute__((address_space(1)));
+; char X[4*10000];
+; int main() {
+;    for (int i = 0; i < 4*10000; ++i)
+;        X[i] = Y[i] + 1;
+;    return 0;
+; }
+
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+ at Y = common addrspace(1) global [40000 x i8] zeroinitializer, align 16
+ at X = common global [40000 x i8] zeroinitializer, align 16
+
+;CHECK-LABEL: @main(
+;CHECK: bitcast i8 addrspace(1)* %{{.*}} to <4 x i8> addrspace(1)*
+;CHECK: bitcast i8* %{{.*}} to <4 x i8>*
+
+; Function Attrs: nounwind uwtable
+define i32 @main() #0 {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds [40000 x i8], [40000 x i8] addrspace(1)* @Y, i64 0, i64 %indvars.iv
+  %0 = load i8, i8 addrspace(1)* %arrayidx, align 1
+  %add = add i8 %0, 1
+  %arrayidx3 = getelementptr inbounds [40000 x i8], [40000 x i8]* @X, i64 0, i64 %indvars.iv
+  store i8 %add, i8* %arrayidx3, align 1
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 40000
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret i32 0
+}
+
+attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }

Added: llvm/trunk/test/Transforms/LoopVectorize/multiple-strides-vectorization.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/multiple-strides-vectorization.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/multiple-strides-vectorization.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/multiple-strides-vectorization.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,64 @@
+; RUN: opt -loop-vectorize -force-vector-width=4 -S < %s | FileCheck %s
+
+; This is the test case from PR26314.
+; When we were retrying dependence checking with memchecks only,
+; the loop-invariant access in the inner loop was incorrectly determined to be wrapping
+; because it was not strided in the inner loop.
+; Improved wrapping detection allows vectorization in the following case.
+
+; #define Z 32
+; typedef struct s {
+;       int v1[Z];
+;       int v2[Z];
+;       int v3[Z][Z];
+; } s;
+;
+; void slow_function (s* const obj, int z) {
+;    for (int j=0; j<Z; j++) {
+;        for (int k=0; k<z; k++) {
+;            int x = obj->v1[k] + obj->v2[j];
+;            obj->v3[j][k] += x;
+;        }
+;    }
+; }
+
+; CHECK-LABEL: Test
+; CHECK: <4 x i64>
+; CHECK: <4 x i32>, <4 x i32>
+; CHECK: !{!"llvm.loop.isvectorized", i32 1}
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+%struct.s = type { [32 x i32], [32 x i32], [32 x [32 x i32]] }
+
+define void @Test(%struct.s* nocapture %obj, i64 %z) #0 {
+  br label %.outer.preheader
+
+
+.outer.preheader:
+  %i = phi i64 [ 0, %0 ], [ %i.next, %.outer ]
+  %1 = getelementptr inbounds %struct.s, %struct.s* %obj, i64 0, i32 1, i64 %i
+  br label %.inner
+
+.exit:
+  ret void
+ 
+.outer:
+  %i.next = add nuw nsw i64 %i, 1
+  %exitcond.outer = icmp eq i64 %i.next, 32
+  br i1 %exitcond.outer, label %.exit, label %.outer.preheader
+
+.inner:
+  %j = phi i64 [ 0, %.outer.preheader ], [ %j.next, %.inner ]
+  %2 = getelementptr inbounds %struct.s, %struct.s* %obj, i64 0, i32 0, i64 %j
+  %3 = load i32, i32* %2
+  %4 = load i32, i32* %1
+  %5 = add nsw i32 %4, %3
+  %6 = getelementptr inbounds %struct.s, %struct.s* %obj, i64 0, i32 2, i64 %i, i64 %j
+  %7 = load i32, i32* %6
+  %8 = add nsw i32 %5, %7
+  store i32 %8, i32* %6  
+  %j.next = add nuw nsw i64 %j, 1
+  %exitcond.inner = icmp eq i64 %j.next, %z
+  br i1 %exitcond.inner, label %.outer, label %.inner
+}

Added: llvm/trunk/test/Transforms/LoopVectorize/no-interleave-up-front.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/no-interleave-up-front.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/no-interleave-up-front.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/no-interleave-up-front.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,35 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -loop-vectorize -S | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+; Test case based on reproducer for
+; http://bugs.chromium.org/p/oss-fuzz/issues/detail?id=6477
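+; The loop below is degenerate (a constant-false guard and a constant-true
+; exit), yet it carries metadata requesting an interleave count of 2; the
+; vectorizer is presumably expected to leave the loop untouched rather than
+; act on that metadata up front.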
+
+define void @test1(i32 %n) #0 {
+; CHECK-LABEL: @test1(
+; CHECK-NEXT:    br i1 false, label [[DOTLR_PH_PREHEADER:%.*]], label [[DOT_CRIT_EDGE:%.*]]
+; CHECK:       .lr.ph.preheader:
+; CHECK-NEXT:    br label [[DOTLR_PH:%.*]]
+; CHECK:       .lr.ph:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[DOTLR_PH]] ], [ 0, [[DOTLR_PH_PREHEADER]] ]
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    br i1 true, label [[DOT_CRIT_EDGE_LOOPEXIT:%.*]], label [[DOTLR_PH]], !llvm.loop !0
+; CHECK:       ._crit_edge.loopexit:
+; CHECK-NEXT:    br label [[DOT_CRIT_EDGE]]
+; CHECK:       ._crit_edge:
+; CHECK-NEXT:    ret void
+;
+  br i1 false, label %.lr.ph, label %._crit_edge
+
+.lr.ph:                                           ; preds = %.lr.ph, %0
+  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  br i1 true, label %._crit_edge, label %.lr.ph, !llvm.loop !0
+
+._crit_edge:                                      ; preds = %.lr.ph, %0
+  ret void
+}
+
+!0 = distinct !{!0, !1}
+!1 = !{!"llvm.loop.interleave.count", i32 2}

Added: llvm/trunk/test/Transforms/LoopVectorize/no_array_bounds.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/no_array_bounds.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/no_array_bounds.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/no_array_bounds.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,100 @@
+; RUN: opt < %s -loop-vectorize -transform-warning -S 2>&1 | FileCheck %s
+
+; Verify that a warning is generated when vectorization/interleaving is explicitly specified and fails to occur.
+; CHECK: warning: no_array_bounds.cpp:5:5: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering
+; CHECK: warning: no_array_bounds.cpp:10:5: loop not interleaved: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering
+
+;  #pragma clang loop vectorize(enable)
+;  for (int i = 0; i < number; i++) {
+;    A[B[i]]++;
+;  }
+
+;  #pragma clang loop vectorize(disable) interleave(enable)
+;  for (int i = 0; i < number; i++) {
+;    B[A[i]]++;
+;  }
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+; Function Attrs: nounwind ssp uwtable
+define void @_Z4testPiS_i(i32* nocapture %A, i32* nocapture %B, i32 %number) #0 !dbg !4 {
+entry:
+  %cmp25 = icmp sgt i32 %number, 0, !dbg !10
+  br i1 %cmp25, label %for.body.preheader, label %for.end15, !dbg !10, !llvm.loop !12
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body, !dbg !14
+
+for.cond5.preheader:                              ; preds = %for.body
+  br i1 %cmp25, label %for.body7.preheader, label %for.end15, !dbg !16, !llvm.loop !18
+
+for.body7.preheader:                              ; preds = %for.cond5.preheader
+  br label %for.body7, !dbg !20
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv27 = phi i64 [ %indvars.iv.next28, %for.body ], [ 0, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i32, i32* %B, i64 %indvars.iv27, !dbg !14
+  %0 = load i32, i32* %arrayidx, align 4, !dbg !14, !tbaa !22
+  %idxprom1 = sext i32 %0 to i64, !dbg !14
+  %arrayidx2 = getelementptr inbounds i32, i32* %A, i64 %idxprom1, !dbg !14
+  %1 = load i32, i32* %arrayidx2, align 4, !dbg !14, !tbaa !22
+  %inc = add nsw i32 %1, 1, !dbg !14
+  store i32 %inc, i32* %arrayidx2, align 4, !dbg !14, !tbaa !22
+  %indvars.iv.next28 = add nuw nsw i64 %indvars.iv27, 1, !dbg !10
+  %lftr.wideiv29 = trunc i64 %indvars.iv.next28 to i32, !dbg !10
+  %exitcond30 = icmp eq i32 %lftr.wideiv29, %number, !dbg !10
+  br i1 %exitcond30, label %for.cond5.preheader, label %for.body, !dbg !10, !llvm.loop !12
+
+for.body7:                                        ; preds = %for.body7.preheader, %for.body7
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body7 ], [ 0, %for.body7.preheader ]
+  %arrayidx9 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv, !dbg !20
+  %2 = load i32, i32* %arrayidx9, align 4, !dbg !20, !tbaa !22
+  %idxprom10 = sext i32 %2 to i64, !dbg !20
+  %arrayidx11 = getelementptr inbounds i32, i32* %B, i64 %idxprom10, !dbg !20
+  %3 = load i32, i32* %arrayidx11, align 4, !dbg !20, !tbaa !22
+  %inc12 = add nsw i32 %3, 1, !dbg !20
+  store i32 %inc12, i32* %arrayidx11, align 4, !dbg !20, !tbaa !22
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1, !dbg !16
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32, !dbg !16
+  %exitcond = icmp eq i32 %lftr.wideiv, %number, !dbg !16
+  br i1 %exitcond, label %for.end15.loopexit, label %for.body7, !dbg !16, !llvm.loop !18
+
+for.end15.loopexit:                               ; preds = %for.body7
+  br label %for.end15
+
+for.end15:                                        ; preds = %for.end15.loopexit, %entry, %for.cond5.preheader
+  ret void, !dbg !26
+}
+
+attributes #0 = { nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!7, !8}
+!llvm.ident = !{!9}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, producer: "clang version 3.5.0", isOptimized: true, emissionKind: LineTablesOnly, file: !1, enums: !2, retainedTypes: !2, globals: !2, imports: !2)
+!1 = !DIFile(filename: "no_array_bounds.cpp", directory: ".")
+!2 = !{}
+!4 = distinct !DISubprogram(name: "test", line: 1, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 2, file: !1, scope: !5, type: !6, retainedNodes: !2)
+!5 = !DIFile(filename: "no_array_bounds.cpp", directory: ".")
+!6 = !DISubroutineType(types: !2)
+!7 = !{i32 2, !"Dwarf Version", i32 2}
+!8 = !{i32 2, !"Debug Info Version", i32 3}
+!9 = !{!"clang version 3.5.0"}
+!10 = !DILocation(line: 4, column: 8, scope: !11)
+!11 = distinct !DILexicalBlock(line: 4, column: 3, file: !1, scope: !4)
+!12 = !{!12, !13}
+!13 = !{!"llvm.loop.vectorize.enable", i1 true}
+!14 = !DILocation(line: 5, column: 5, scope: !15)
+!15 = distinct !DILexicalBlock(line: 4, column: 36, file: !1, scope: !11)
+!16 = !DILocation(line: 9, column: 8, scope: !17)
+!17 = distinct !DILexicalBlock(line: 9, column: 3, file: !1, scope: !4)
+!18 = !{!18, !13, !19}
+!19 = !{!"llvm.loop.vectorize.width", i32 1}
+!20 = !DILocation(line: 10, column: 5, scope: !21)
+!21 = distinct !DILexicalBlock(line: 9, column: 36, file: !1, scope: !17)
+!22 = !{!23, !23, i64 0}
+!23 = !{!"int", !24, i64 0}
+!24 = !{!"omnipotent char", !25, i64 0}
+!25 = !{!"Simple C/C++ TBAA"}
+!26 = !DILocation(line: 12, column: 1, scope: !4)

Added: llvm/trunk/test/Transforms/LoopVectorize/no_idiv_reduction.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/no_idiv_reduction.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/no_idiv_reduction.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/no_idiv_reduction.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,24 @@
+; RUN: opt -loop-vectorize -force-vector-width=2 -force-vector-interleave=1 -S < %s | FileCheck %s
+ at a = common global [128 x i32] zeroinitializer, align 16
+
+;; Must not vectorize a division reduction: integer division truncates, so the
+;; operation is lossy and the recurrence cannot be reordered.
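+;; The recurrence below is r = r / a[i]; unlike add or mul, there is no per-lane
+;; partial result that can be combined after the loop, e.g. (hypothetical):
+;;   r = ((80 / a[0]) / a[1]) / a[2] ...   // strictly sequential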
+define i32 @g() {
+entry:
+  br label %for.body
+
+for.body:
+  ; CHECK-LABEL: @g(
+  ; CHECK-NOT: sdiv <2 x i32>
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %r.05 = phi i32 [ 80, %entry ], [ %div, %for.body ]
+  %arrayidx = getelementptr inbounds [128 x i32], [128 x i32]* @a, i64 0, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %div = sdiv i32 %r.05, %0
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret i32 %div
+}

Added: llvm/trunk/test/Transforms/LoopVectorize/no_int_induction.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/no_int_induction.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/no_int_induction.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/no_int_induction.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,60 @@
+; RUN: opt < %s  -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
+
+; int __attribute__((noinline)) sum_array(int *A, int n) {
+;  return std::accumulate(A, A + n, 0);
+; }
+
+target datalayout = "e-p:64:64:64-p1:16:16:16-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-n8:16:32:64-S128"
+
+;CHECK-LABEL: @sum_array(
+;CHECK: phi i64
+;CHECK: phi <4 x i32>
+;CHECK: load <4 x i32>
+;CHECK: add nsw <4 x i32>
+;CHECK: ret i32
+define i32 @sum_array(i32* %A, i32 %n) nounwind uwtable readonly noinline ssp {
+  %1 = sext i32 %n to i64
+  %2 = getelementptr inbounds i32, i32* %A, i64 %1
+  %3 = icmp eq i32 %n, 0
+  br i1 %3, label %_ZSt10accumulateIPiiET0_T_S2_S1_.exit, label %.lr.ph.i
+
+.lr.ph.i:                                         ; preds = %0, %.lr.ph.i
+  %.03.i = phi i32* [ %6, %.lr.ph.i ], [ %A, %0 ]
+  %.012.i = phi i32 [ %5, %.lr.ph.i ], [ 0, %0 ]
+  %4 = load i32, i32* %.03.i, align 4
+  %5 = add nsw i32 %4, %.012.i
+  %6 = getelementptr inbounds i32, i32* %.03.i, i64 1
+  %7 = icmp eq i32* %6, %2
+  br i1 %7, label %_ZSt10accumulateIPiiET0_T_S2_S1_.exit, label %.lr.ph.i
+
+_ZSt10accumulateIPiiET0_T_S2_S1_.exit:            ; preds = %.lr.ph.i, %0
+  %.01.lcssa.i = phi i32 [ 0, %0 ], [ %5, %.lr.ph.i ]
+  ret i32 %.01.lcssa.i
+}
+
+; Same, but use a pointer with a different size.
+;CHECK-LABEL: @sum_array_as1(
+;CHECK: phi i16
+;CHECK: phi <4 x i32>
+;CHECK: load <4 x i32>
+;CHECK: add nsw <4 x i32>
+;CHECK: ret i32
+define i32 @sum_array_as1(i32 addrspace(1)* %A, i32 %n) nounwind uwtable readonly noinline ssp {
+  %1 = sext i32 %n to i64
+  %2 = getelementptr inbounds i32, i32 addrspace(1)* %A, i64 %1
+  %3 = icmp eq i32 %n, 0
+  br i1 %3, label %_ZSt10accumulateIPiiET0_T_S2_S1_.exit, label %.lr.ph.i
+
+.lr.ph.i:                                         ; preds = %0, %.lr.ph.i
+  %.03.i = phi i32 addrspace(1)* [ %6, %.lr.ph.i ], [ %A, %0 ]
+  %.012.i = phi i32 [ %5, %.lr.ph.i ], [ 0, %0 ]
+  %4 = load i32, i32 addrspace(1)* %.03.i, align 4
+  %5 = add nsw i32 %4, %.012.i
+  %6 = getelementptr inbounds i32, i32 addrspace(1)* %.03.i, i64 1
+  %7 = icmp eq i32 addrspace(1)* %6, %2
+  br i1 %7, label %_ZSt10accumulateIPiiET0_T_S2_S1_.exit, label %.lr.ph.i
+
+_ZSt10accumulateIPiiET0_T_S2_S1_.exit:            ; preds = %.lr.ph.i, %0
+  %.01.lcssa.i = phi i32 [ 0, %0 ], [ %5, %.lr.ph.i ]
+  ret i32 %.01.lcssa.i
+}

Added: llvm/trunk/test/Transforms/LoopVectorize/no_outside_user.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/no_outside_user.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/no_outside_user.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/no_outside_user.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,414 @@
+; RUN: opt -S -loop-vectorize -force-vector-interleave=1 -force-vector-width=2 < %s 2>&1 | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32-n8:16:32-S128"
+
+ at f = common global i32 0, align 4
+ at .str = private unnamed_addr constant [4 x i8] c"%d\0A\00", align 1
+ at c = common global i32 0, align 4
+ at a = common global i32 0, align 4
+ at b = common global i32 0, align 4
+ at e = common global i32 0, align 4
+
+; The loop has a value ("tmp17") that is used outside of the loop and is not a
+; recognized reduction variable.
+; However, tmp17 is a non-header phi, which is an allowed exit value.
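+; In scalar form (hypothetical C sketch of the IR below; b is the global):
+;   int i = b, tmp17;
+;   do {
+;     tmp17 = (i > 10) ? 1 : 0;   // non-header phi, live out of the loop
+;   } while (++i < 4);
+;   return tmp17;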
+
+; CHECK-LABEL: @test1(
+; CHECK: %vec.ind = phi <2 x i32>
+; CHECK: [[CMP:%[a-zA-Z0-9.]+]] = icmp sgt <2 x i32> %vec.ind, <i32 10, i32 10>
+; CHECK: %predphi = select <2 x i1> [[CMP]], <2 x i32> <i32 1, i32 1>, <2 x i32> zeroinitializer
+
+; CHECK-LABEL: middle.block:
+; CHECK:          [[E1:%[a-zA-Z0-9.]+]] = extractelement <2 x i32> %predphi, i32 1
+
+; CHECK-LABEL: f1.exit.loopexit:
+; CHECK:          %.lcssa = phi i32 [ %tmp17, %bb16 ], [ [[E1]], %middle.block ]
+
+define i32 @test1()  {
+bb:
+  %b.promoted = load i32, i32* @b, align 4
+  br label %.lr.ph.i
+
+.lr.ph.i:
+  %tmp8 = phi i32 [ %tmp18, %bb16 ], [ %b.promoted, %bb ]
+  %tmp2 = icmp sgt i32 %tmp8, 10
+  br i1 %tmp2, label %bb16, label %bb10
+
+bb10:
+  br label %bb16
+
+bb16:
+  %tmp17 = phi i32 [ 0, %bb10 ], [ 1, %.lr.ph.i ]
+  %tmp18 = add nsw i32 %tmp8, 1
+  %tmp19 = icmp slt i32 %tmp18, 4
+  br i1 %tmp19, label %.lr.ph.i, label %f1.exit.loopexit
+
+f1.exit.loopexit:
+  %.lcssa = phi i32 [ %tmp17, %bb16 ]
+  ret i32 %.lcssa
+}
+
+; The non-header phi depends on the header phi.
+; CHECK-LABEL: @test2(
+; CHECK: %vec.ind = phi <2 x i32>
+; CHECK: [[CMP:%[a-zA-Z0-9.]+]] = icmp sgt <2 x i32> %vec.ind, <i32 10, i32 10>
+; CHECK: %predphi = select <2 x i1> [[CMP]], <2 x i32> <i32 1, i32 1>, <2 x i32> %vec.ind
+
+; CHECK-LABEL: middle.block:
+; CHECK:          [[E1:%[a-zA-Z0-9.]+]] = extractelement <2 x i32> %predphi, i32 1
+
+; CHECK-LABEL: f1.exit.loopexit:
+; CHECK:          %.lcssa = phi i32 [ %tmp17, %bb16 ], [ [[E1]], %middle.block ]
+define i32 @test2()  {
+bb:
+  %b.promoted = load i32, i32* @b, align 4
+  br label %.lr.ph.i
+
+.lr.ph.i:
+  %tmp8 = phi i32 [ %tmp18, %bb16 ], [ %b.promoted, %bb ]
+  %tmp2 = icmp sgt i32 %tmp8, 10
+  br i1 %tmp2, label %bb16, label %bb10
+
+bb10:
+  br label %bb16
+
+bb16:
+  %tmp17 = phi i32 [ %tmp8, %bb10 ], [ 1, %.lr.ph.i ]
+  %tmp18 = add nsw i32 %tmp8, 1
+  %tmp19 = icmp slt i32 %tmp18, 4
+  br i1 %tmp19, label %.lr.ph.i, label %f1.exit.loopexit
+
+f1.exit.loopexit:
+  %.lcssa = phi i32 [ %tmp17, %bb16 ]
+  ret i32 %.lcssa
+}
+
+; More than two incoming values for the tmp17 phi that is used outside the loop.
+; CHECK-LABEL: test3(
+; CHECK-LABEL: vector.body:
+; CHECK:          %predphi = select <2 x i1> %{{.*}}, <2 x i32> <i32 1, i32 1>, <2 x i32> zeroinitializer
+; CHECK:          %predphi1 = select <2 x i1> %{{.*}}, <2 x i32> <i32 2, i32 2>, <2 x i32> %predphi
+
+; CHECK-LABEL: middle.block:
+; CHECK:          [[E1:%[a-zA-Z0-9.]+]] = extractelement <2 x i32> %predphi1, i32 1
+
+; CHECK-LABEL: f1.exit.loopexit:
+; CHECK:          phi i32 [ %tmp17, %bb16 ], [ [[E1]], %middle.block ]
+define i32 @test3(i32 %N)  {
+bb:
+  %b.promoted = load i32, i32* @b, align 4
+  br label %.lr.ph.i
+
+.lr.ph.i:
+  %tmp8 = phi i32 [ %tmp18, %bb16 ], [ %b.promoted, %bb ]
+  %tmp2 = icmp sgt i32 %tmp8, 10
+  br i1 %tmp2, label %bb16, label %bb10
+
+bb10:
+  %cmp = icmp sgt i32 %tmp8, %N
+  br i1  %cmp, label %bb12, label %bb16
+
+bb12:
+  br label %bb16
+
+bb16:
+  %tmp17 = phi i32 [ 0, %bb10 ], [ 1, %.lr.ph.i ], [ 2, %bb12 ]
+  %tmp18 = add nsw i32 %tmp8, 1
+  %tmp19 = icmp slt i32 %tmp18, 4
+  br i1 %tmp19, label %.lr.ph.i, label %f1.exit.loopexit
+
+f1.exit.loopexit:
+  %.lcssa = phi i32 [ %tmp17, %bb16 ]
+  ret i32 %.lcssa
+}
+
+; More than one incoming value for the outside user %.lcssa.
+; CHECK-LABEL: test4(
+; CHECK-LABEL: vector.body:
+; CHECK:          %predphi = select <2 x i1>
+
+; CHECK-LABEL: middle.block:
+; CHECK:          [[E1:%[a-zA-Z0-9.]+]] = extractelement <2 x i32> %predphi, i32 1
+
+; CHECK-LABEL: f1.exit.loopexit.loopexit:
+; CHECK:          %tmp17.lcssa = phi i32 [ %tmp17, %bb16 ], [ [[E1]], %middle.block ]
+; CHECK-NEXT:     br label %f1.exit.loopexit
+
+; CHECK-LABEL: f1.exit.loopexit:
+; CHECK:          %.lcssa = phi i32 [ 2, %bb ], [ %tmp17.lcssa, %f1.exit.loopexit.loopexit ]
+define i32 @test4(i32 %N)  {
+bb:
+  %b.promoted = load i32, i32* @b, align 4
+  %icmp = icmp slt i32 %b.promoted, %N
+  br i1 %icmp, label %f1.exit.loopexit, label %.lr.ph.i
+
+.lr.ph.i:
+  %tmp8 = phi i32 [ %tmp18, %bb16 ], [ %b.promoted, %bb ]
+  %tmp2 = icmp sgt i32 %tmp8, 10
+  br i1 %tmp2, label %bb16, label %bb10
+
+bb10:
+  br label %bb16
+
+bb16:
+  %tmp17 = phi i32 [ 0, %bb10 ], [ 1, %.lr.ph.i ]
+  %tmp18 = add nsw i32 %tmp8, 1
+  %tmp19 = icmp slt i32 %tmp18, 4
+  br i1 %tmp19, label %.lr.ph.i, label %f1.exit.loopexit
+
+f1.exit.loopexit:
+  %.lcssa = phi i32 [ %tmp17, %bb16 ], [ 2, %bb ]
+  ret i32 %.lcssa
+}
+
+; A non-header phi that depends on a reduction and is used outside the loop.
+; Reduction phis are only allowed to have bump or reduction operations as their
+; in-loop users, so we should not vectorize this.
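+; Sketch of the pattern (hypothetical C):
+;   sum = 0;
+;   for (i = 0; i < n; i++) {
+;     tmp17 = (sum > 10) ? 1 : sum;   // extra in-loop use of the reduction phi
+;     sum += i + A[i] + B[i];
+;   }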
+; CHECK-LABEL: reduction_sum(
+; CHECK-NOT: <2 x i32>
+define i32 @reduction_sum(i32 %n, i32* noalias nocapture %A, i32* noalias nocapture %B) nounwind uwtable readonly noinline ssp {
+entry:
+  %c1 = icmp sgt i32 %n, 0
+  br i1 %c1, label %header, label %._crit_edge
+
+header:                                           ; preds = %0, %.lr.ph
+  %indvars.iv = phi i64 [ %indvars.iv.next, %bb16 ], [ 0, %entry ]
+  %sum.02 = phi i32 [ %c9, %bb16 ], [ 0, %entry ]
+  %c2 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+  %c3 = load i32, i32* %c2, align 4
+  %c4 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
+  %c5 = load i32, i32* %c4, align 4
+  %tmp2 = icmp sgt i32 %sum.02, 10
+  br i1 %tmp2, label %bb16, label %bb10
+
+bb10:
+  br label %bb16
+
+bb16:
+  %tmp17 = phi i32 [ %sum.02, %bb10 ], [ 1, %header ]
+  %c6 = trunc i64 %indvars.iv to i32
+  %c7 = add i32 %sum.02, %c6
+  %c8 = add i32 %c7, %c3
+  %c9 = add i32 %c8, %c5
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %._crit_edge, label %header
+
+._crit_edge:                                      ; preds = %.lr.ph, %0
+  %sum.0.lcssa = phi i32 [ 0, %entry ], [ %c9, %bb16 ]
+  %nonhdr.lcssa = phi i32 [ 1, %entry], [ %tmp17, %bb16 ]
+  ret i32 %sum.0.lcssa
+}
+
+; An invalid cyclic dependency with the header phi %iv prevents %iv from being
+; recognized as an induction variable.
+; Cannot vectorize.
+; CHECK-LABEL: cyclic_dep_with_indvar(
+; CHECK-NOT: <2 x i32>
+define i32 @cyclic_dep_with_indvar()  {
+bb:
+  %b.promoted = load i32, i32* @b, align 4
+  br label %.lr.ph.i
+
+.lr.ph.i:
+  %iv = phi i32 [ %ivnext, %bb16 ], [ %b.promoted, %bb ]
+  %tmp2 = icmp sgt i32 %iv, 10
+  br i1 %tmp2, label %bb16, label %bb10
+
+bb10:
+  br label %bb16
+
+bb16:
+  %tmp17 = phi i32 [ 0, %bb10 ], [ %iv, %.lr.ph.i ]
+  %ivnext = add nsw i32 %tmp17, 1
+  %tmp19 = icmp slt i32 %ivnext, 4
+  br i1 %tmp19, label %.lr.ph.i, label %f1.exit.loopexit
+
+f1.exit.loopexit:
+  %.lcssa = phi i32 [ %tmp17, %bb16 ]
+  ret i32 %.lcssa
+}
+
+; The non-reduction phi 'tmp17' used outside the loop has a cyclic dependence on
+; the %x.05 phi.
+; Cannot vectorize.
+; CHECK-LABEL: not_valid_reduction(
+; CHECK-NOT: <2 x i32>
+define i32 @not_valid_reduction(i32 %n, i32* noalias nocapture %A) nounwind uwtable readonly {
+entry:
+  %cmp4 = icmp sgt i32 %n, 0
+  br i1 %cmp4, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %latch ], [ 0, %entry ]
+  %x.05 = phi i32 [ %tmp17, %latch ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+  %tmp0 = load i32, i32* %arrayidx, align 4
+  %tmp2 = icmp sgt i64 %indvars.iv, 10
+  %sub = sub nsw i32 %x.05, %tmp0
+  br i1 %tmp2, label %bb16, label %bb10
+
+bb10:
+  br label %bb16
+
+bb16:
+  %tmp17 = phi i32 [ 1, %bb10 ], [ %sub, %for.body ]
+  br label %latch
+
+latch:
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  %x.0.lcssa = phi i32 [ 0, %entry ], [ %tmp17 , %latch ]
+  ret i32 %x.0.lcssa
+}
+
+
+; CHECK-LABEL: @outside_user_non_phi(
+; CHECK: %vec.ind = phi <2 x i32>
+; CHECK: [[CMP:%[a-zA-Z0-9.]+]] = icmp sgt <2 x i32> %vec.ind, <i32 10, i32 10>
+; CHECK: %predphi = select <2 x i1> [[CMP]], <2 x i32> <i32 1, i32 1>, <2 x i32> zeroinitializer
+; CHECK: [[TRUNC:%[a-zA-Z0-9.]+]] = trunc <2 x i32> %predphi to <2 x i8>
+
+; CHECK-LABEL: middle.block:
+; CHECK:          [[E1:%[a-zA-Z0-9.]+]] = extractelement <2 x i8> [[TRUNC]], i32 1
+
+; CHECK-LABEL: f1.exit.loopexit:
+; CHECK:          %.lcssa = phi i8 [ %tmp17.trunc, %bb16 ], [ [[E1]], %middle.block ]
+define i8 @outside_user_non_phi()  {
+bb:
+  %b.promoted = load i32, i32* @b, align 4
+  br label %.lr.ph.i
+
+.lr.ph.i:
+  %tmp8 = phi i32 [ %tmp18, %bb16 ], [ %b.promoted, %bb ]
+  %tmp2 = icmp sgt i32 %tmp8, 10
+  br i1 %tmp2, label %bb16, label %bb10
+
+bb10:
+  br label %bb16
+
+bb16:
+  %tmp17 = phi i32 [ 0, %bb10 ], [ 1, %.lr.ph.i ]
+  %tmp17.trunc = trunc i32 %tmp17 to i8
+  %tmp18 = add nsw i32 %tmp8, 1
+  %tmp19 = icmp slt i32 %tmp18, 4
+  br i1 %tmp19, label %.lr.ph.i, label %f1.exit.loopexit
+
+f1.exit.loopexit:
+  %.lcssa = phi i8 [ %tmp17.trunc, %bb16 ]
+  ret i8 %.lcssa
+}
+
+; CHECK-LABEL: no_vectorize_reduction_with_outside_use(
+; CHECK-NOT: <2 x i32>
+define i32 @no_vectorize_reduction_with_outside_use(i32 %n, i32* nocapture %A, i32* nocapture %B) nounwind uwtable readonly {
+entry:
+  %cmp7 = icmp sgt i32 %n, 0
+  br i1 %cmp7, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %result.08 = phi i32 [ %or, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
+  %1 = load i32, i32* %arrayidx2, align 4
+  %add = add nsw i32 %1, %0
+  %or = or i32 %add, %result.08
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  %result.0.lcssa = phi i32 [ 0, %entry ], [ %1, %for.body ]
+  ret i32 %result.0.lcssa
+}
+
+
+; Vectorize the c[i] = a[i] + b[i] loop, where the result stored to c[i] is also
+; used outside the loop.
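+; i.e. (hypothetical C sketch):
+;   int sum;
+;   for (i = b; i < N; i++)
+;     C[i] = sum = A[i] + B[i];
+;   return sum;   // the last element-wise sum is live out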
+; CHECK-LABEL: sum_arrays_outside_use(
+; CHECK-LABEL: vector.memcheck:
+; CHECK:         br i1 %memcheck.conflict, label %scalar.ph, label %vector.ph  
+
+; CHECK-LABEL: vector.body:
+; CHECK:          %wide.load = load <2 x i32>, <2 x i32>*
+; CHECK:          %wide.load16 = load <2 x i32>, <2 x i32>* 
+; CHECK:          [[ADD:%[a-zA-Z0-9.]+]] = add nsw <2 x i32> %wide.load, %wide.load16
+; CHECK:          store <2 x i32>
+
+; CHECK-LABEL: middle.block:
+; CHECK:          [[E1:%[a-zA-Z0-9.]+]] = extractelement <2 x i32> [[ADD]], i32 1
+
+; CHECK-LABEL: f1.exit.loopexit:
+; CHECK:          %.lcssa = phi i32 [ %sum, %.lr.ph.i ], [ [[E1]], %middle.block ]
+define i32 @sum_arrays_outside_use(i32* %B, i32* %A, i32* %C, i32 %N)  {
+bb:
+  %b.promoted = load i32, i32* @b, align 4
+  br label %.lr.ph.i
+
+.lr.ph.i:
+  %iv = phi i32 [ %ivnext, %.lr.ph.i ], [ %b.promoted, %bb ]
+  %indvars.iv = sext i32 %iv to i64
+  %arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
+  %Bload = load i32, i32* %arrayidx2, align 4
+  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+  %Aload = load i32, i32* %arrayidx, align 4
+  %sum = add nsw i32 %Bload, %Aload
+  %arrayidx3 = getelementptr inbounds i32, i32* %C, i64 %indvars.iv
+  store i32 %sum, i32* %arrayidx3, align 4
+  %ivnext = add nsw i32 %iv, 1
+  %tmp19 = icmp slt i32 %ivnext, %N
+  br i1 %tmp19, label %.lr.ph.i, label %f1.exit.loopexit
+
+f1.exit.loopexit:
+  %.lcssa = phi i32 [ %sum, %.lr.ph.i ]
+  ret i32 %.lcssa
+}
+
+ at tab = common global [32 x i8] zeroinitializer, align 1
+
+; CHECK-LABEL: non_uniform_live_out()
+; CHECK-LABEL:   vector.body:
+; CHECK:           %vec.ind = phi <2 x i32> [ <i32 0, i32 1>, %vector.ph ], [ %vec.ind.next, %vector.body ]
+; CHECK:           [[ADD:%[a-zA-Z0-9.]+]] = add <2 x i32> %vec.ind, <i32 7, i32 7> 
+; CHECK:           [[EE:%[a-zA-Z0-9.]+]] = extractelement <2 x i32> [[ADD]], i32 0 
+; CHECK:           [[GEP:%[a-zA-Z0-9.]+]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[EE]]
+; CHECK-NEXT:      [[GEP2:%[a-zA-Z0-9.]+]] = getelementptr inbounds i8, i8* [[GEP]], i32 0
+; CHECK-NEXT:      [[BC:%[a-zA-Z0-9.]+]] = bitcast i8* [[GEP2]] to <2 x i8>*
+; CHECK-NEXT:      %wide.load = load <2 x i8>, <2 x i8>* [[BC]]
+; CHECK-NEXT:      [[ADD2:%[a-zA-Z0-9.]+]] = add <2 x i8> %wide.load, <i8 1, i8 1> 
+; CHECK:           store <2 x i8> [[ADD2]], <2 x i8>*
+
+; CHECK-LABEL:  middle.block:
+; CHECK:           [[ADDEE:%[a-zA-Z0-9.]+]] = extractelement <2 x i32> [[ADD]], i32 1
+
+; CHECK-LABEL:  for.end:
+; CHECK:           %lcssa = phi i32 [ %i.09, %for.body ], [ [[ADDEE]], %middle.block ]
+; CHECK:           %arrayidx.out = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 %lcssa
+define i32 @non_uniform_live_out() {
+entry:
+ br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+ %i.08 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+ %i.09 = add i32 %i.08, 7
+ %arrayidx = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 %i.09
+ %0 = load i8, i8* %arrayidx, align 1
+ %bump = add i8 %0, 1
+ store i8 %bump, i8* %arrayidx, align 1
+ %inc = add nsw i32 %i.08, 1
+ %exitcond = icmp eq i32 %i.08, 20000
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+ %lcssa = phi i32 [%i.09, %for.body]
+ %arrayidx.out = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 %lcssa
+ store i8 42, i8* %arrayidx.out, align 1
+ ret i32 0
+}

Added: llvm/trunk/test/Transforms/LoopVectorize/no_switch.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/no_switch.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/no_switch.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/no_switch.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,93 @@
+; RUN: opt < %s -loop-vectorize -force-vector-width=4 -transform-warning -S 2>&1 | FileCheck %s
+; RUN: opt < %s -loop-vectorize -force-vector-width=1 -transform-warning -S 2>&1 | FileCheck %s -check-prefix=NOANALYSIS
+; RUN: opt < %s -loop-vectorize -force-vector-width=4 -transform-warning -pass-remarks-missed='loop-vectorize' -S 2>&1 | FileCheck %s -check-prefix=MOREINFO
+
+; CHECK: remark: source.cpp:4:5: loop not vectorized: loop contains a switch statement
+; CHECK: warning: source.cpp:4:5: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering
+
+; NOANALYSIS-NOT: remark: {{.*}}
+; NOANALYSIS: warning: source.cpp:4:5: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering
+
+; MOREINFO: remark: source.cpp:4:5: loop not vectorized: loop contains a switch statement
+; MOREINFO: remark: source.cpp:4:5: loop not vectorized (Force=true, Vector Width=4)
+; MOREINFO: warning: source.cpp:4:5: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering
+
+; CHECK: _Z11test_switchPii
+; CHECK-NOT: x i32>
+; CHECK: ret
+
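+; The rejected loop contains a switch, roughly (hypothetical C reconstruction
+; of the IR below, not the literal source.cpp):
+;   for (int i = 0; i < Length; i++) {
+;     switch (A[i]) {
+;     case 0:  A[i] = i * 2; break;
+;     case 1:  A[i] = i;  A[i] = 0; break;   // store, then overwritten
+;     default: A[i] = 0; break;
+;     }
+;   }
+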
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+; Function Attrs: nounwind optsize ssp uwtable
+define void @_Z11test_switchPii(i32* nocapture %A, i32 %Length) #0 !dbg !4 {
+entry:
+  %cmp18 = icmp sgt i32 %Length, 0, !dbg !10
+  br i1 %cmp18, label %for.body.preheader, label %for.end, !dbg !10, !llvm.loop !12
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body, !dbg !14
+
+for.body:                                         ; preds = %for.body.preheader, %for.inc
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv, !dbg !14
+  %0 = load i32, i32* %arrayidx, align 4, !dbg !14, !tbaa !16
+  switch i32 %0, label %for.inc [
+    i32 0, label %sw.bb
+    i32 1, label %sw.bb3
+  ], !dbg !14
+
+sw.bb:                                            ; preds = %for.body
+  %1 = trunc i64 %indvars.iv to i32, !dbg !20
+  %mul = shl nsw i32 %1, 1, !dbg !20
+  br label %for.inc, !dbg !22
+
+sw.bb3:                                           ; preds = %for.body
+  %2 = trunc i64 %indvars.iv to i32, !dbg !23
+  store i32 %2, i32* %arrayidx, align 4, !dbg !23, !tbaa !16
+  br label %for.inc, !dbg !23
+
+for.inc:                                          ; preds = %sw.bb3, %for.body, %sw.bb
+  %storemerge = phi i32 [ %mul, %sw.bb ], [ 0, %for.body ], [ 0, %sw.bb3 ]
+  store i32 %storemerge, i32* %arrayidx, align 4, !dbg !20, !tbaa !16
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1, !dbg !10
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32, !dbg !10
+  %exitcond = icmp eq i32 %lftr.wideiv, %Length, !dbg !10
+  br i1 %exitcond, label %for.end.loopexit, label %for.body, !dbg !10, !llvm.loop !12
+
+for.end.loopexit:                                 ; preds = %for.inc
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %entry
+  ret void, !dbg !24
+}
+
+attributes #0 = { nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!7, !8}
+!llvm.ident = !{!9}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, producer: "clang version 3.5.0", isOptimized: true, runtimeVersion: 6, emissionKind: LineTablesOnly, file: !1, enums: !2, retainedTypes: !2, globals: !2, imports: !2)
+!1 = !DIFile(filename: "source.cpp", directory: ".")
+!2 = !{}
+!4 = distinct !DISubprogram(name: "test_switch", line: 1, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 1, file: !1, scope: !5, type: !6, retainedNodes: !2)
+!5 = !DIFile(filename: "source.cpp", directory: ".")
+!6 = !DISubroutineType(types: !2)
+!7 = !{i32 2, !"Dwarf Version", i32 2}
+!8 = !{i32 2, !"Debug Info Version", i32 3}
+!9 = !{!"clang version 3.5.0"}
+!10 = !DILocation(line: 3, column: 8, scope: !11)
+!11 = distinct !DILexicalBlock(line: 3, column: 3, file: !1, scope: !4)
+!12 = !{!12, !13, !13}
+!13 = !{!"llvm.loop.vectorize.enable", i1 true}
+!14 = !DILocation(line: 4, column: 5, scope: !15)
+!15 = distinct !DILexicalBlock(line: 3, column: 36, file: !1, scope: !11)
+!16 = !{!17, !17, i64 0}
+!17 = !{!"int", !18, i64 0}
+!18 = !{!"omnipotent char", !19, i64 0}
+!19 = !{!"Simple C/C++ TBAA"}
+!20 = !DILocation(line: 6, column: 7, scope: !21)
+!21 = distinct !DILexicalBlock(line: 4, column: 18, file: !1, scope: !15)
+!22 = !DILocation(line: 7, column: 5, scope: !21)
+!23 = !DILocation(line: 9, column: 7, scope: !21)
+!24 = !DILocation(line: 14, column: 1, scope: !4)

Added: llvm/trunk/test/Transforms/LoopVectorize/no_switch_disable_vectorization.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/no_switch_disable_vectorization.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/no_switch_disable_vectorization.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/no_switch_disable_vectorization.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,95 @@
+; RUN: opt < %s -loop-vectorize -force-vector-width=4 -transform-warning -S 2>&1 | FileCheck %s
+; RUN: opt < %s -loop-vectorize -force-vector-width=1 -transform-warning -S 2>&1 | FileCheck %s -check-prefix=NOANALYSIS
+; RUN: opt < %s -loop-vectorize -force-vector-width=4 -transform-warning -pass-remarks-missed='loop-vectorize' -S 2>&1 | FileCheck %s -check-prefix=MOREINFO
+
+; This test is a copy of no_switch.ll, with the "llvm.loop.vectorize.enable" metadata set to false.
+; It tests that vectorization is explicitly disabled and no warnings are emitted.
+
+; CHECK-NOT: remark: source.cpp:4:5: loop not vectorized: loop contains a switch statement
+; CHECK-NOT: warning: source.cpp:4:5: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering
+
+; NOANALYSIS-NOT: remark: {{.*}}
+; NOANALYSIS-NOT: warning: source.cpp:4:5: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering
+
+; MOREINFO: remark: source.cpp:4:5: loop not vectorized: vectorization is explicitly disabled
+; MOREINFO-NOT: warning: source.cpp:4:5: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering
+
+; CHECK: _Z11test_switchPii
+; CHECK-NOT: x i32>
+; CHECK: ret
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+; Function Attrs: nounwind optsize ssp uwtable
+define void @_Z11test_switchPii(i32* nocapture %A, i32 %Length) #0 !dbg !4 {
+entry:
+  %cmp18 = icmp sgt i32 %Length, 0, !dbg !10
+  br i1 %cmp18, label %for.body.preheader, label %for.end, !dbg !10, !llvm.loop !12
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body, !dbg !14
+
+for.body:                                         ; preds = %for.body.preheader, %for.inc
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv, !dbg !14
+  %0 = load i32, i32* %arrayidx, align 4, !dbg !14, !tbaa !16
+  switch i32 %0, label %for.inc [
+    i32 0, label %sw.bb
+    i32 1, label %sw.bb3
+  ], !dbg !14
+
+sw.bb:                                            ; preds = %for.body
+  %1 = trunc i64 %indvars.iv to i32, !dbg !20
+  %mul = shl nsw i32 %1, 1, !dbg !20
+  br label %for.inc, !dbg !22
+
+sw.bb3:                                           ; preds = %for.body
+  %2 = trunc i64 %indvars.iv to i32, !dbg !23
+  store i32 %2, i32* %arrayidx, align 4, !dbg !23, !tbaa !16
+  br label %for.inc, !dbg !23
+
+for.inc:                                          ; preds = %sw.bb3, %for.body, %sw.bb
+  %storemerge = phi i32 [ %mul, %sw.bb ], [ 0, %for.body ], [ 0, %sw.bb3 ]
+  store i32 %storemerge, i32* %arrayidx, align 4, !dbg !20, !tbaa !16
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1, !dbg !10
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32, !dbg !10
+  %exitcond = icmp eq i32 %lftr.wideiv, %Length, !dbg !10
+  br i1 %exitcond, label %for.end.loopexit, label %for.body, !dbg !10, !llvm.loop !12
+
+for.end.loopexit:                                 ; preds = %for.inc
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %entry
+  ret void, !dbg !24
+}
+
+attributes #0 = { nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!7, !8}
+!llvm.ident = !{!9}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, producer: "clang version 3.5.0", isOptimized: true, runtimeVersion: 6, emissionKind: LineTablesOnly, file: !1, enums: !2, retainedTypes: !2, globals: !2, imports: !2)
+!1 = !DIFile(filename: "source.cpp", directory: ".")
+!2 = !{}
+!4 = distinct !DISubprogram(name: "test_switch", line: 1, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 1, file: !1, scope: !5, type: !6, retainedNodes: !2)
+!5 = !DIFile(filename: "source.cpp", directory: ".")
+!6 = !DISubroutineType(types: !2)
+!7 = !{i32 2, !"Dwarf Version", i32 2}
+!8 = !{i32 2, !"Debug Info Version", i32 3}
+!9 = !{!"clang version 3.5.0"}
+!10 = !DILocation(line: 3, column: 8, scope: !11)
+!11 = distinct !DILexicalBlock(line: 3, column: 3, file: !1, scope: !4)
+!12 = !{!12, !13, !13}
+!13 = !{!"llvm.loop.vectorize.enable", i1 false}
+!14 = !DILocation(line: 4, column: 5, scope: !15)
+!15 = distinct !DILexicalBlock(line: 3, column: 36, file: !1, scope: !11)
+!16 = !{!17, !17, i64 0}
+!17 = !{!"int", !18, i64 0}
+!18 = !{!"omnipotent char", !19, i64 0}
+!19 = !{!"Simple C/C++ TBAA"}
+!20 = !DILocation(line: 6, column: 7, scope: !21)
+!21 = distinct !DILexicalBlock(line: 4, column: 18, file: !1, scope: !15)
+!22 = !DILocation(line: 7, column: 5, scope: !21)
+!23 = !DILocation(line: 9, column: 7, scope: !21)
+!24 = !DILocation(line: 14, column: 1, scope: !4)

Added: llvm/trunk/test/Transforms/LoopVectorize/noalias-md-licm.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/noalias-md-licm.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/noalias-md-licm.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/noalias-md-licm.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,59 @@
+; RUN: opt -basicaa -scoped-noalias -loop-vectorize -licm -force-vector-width=2 \
+; RUN:     -force-vector-interleave=1 -S < %s | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+; In order to vectorize the inner loop, it needs to be versioned with
+; memchecks between {A} x {B, C} first:
+;
+;   for (i = 0; i < n; i++)
+;     for (j = 0; j < m; j++)
+;         A[j] += B[i] + C[j];
+;
+; Since in the versioned vector loop A and B can no longer alias, B[i] can be
+; LICM'ed from the inner loop.
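+;
+; Conceptually, the versioned form looks like (hypothetical sketch):
+;   if (A overlaps B or C) {
+;     /* scalar fallback loop */
+;   } else {
+;     int b = B[i];               // hoisted by LICM
+;     for (j = 0; j < m; j++)     // body then vectorized with VF=2
+;       A[j] += b + C[j];
+;   }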
+
+
+define void @f(i32* %a, i32* %b, i32* %c) {
+entry:
+  br label %outer
+
+outer:
+  %i.2 = phi i64 [ 0, %entry ], [ %i, %inner.end ]
+  %arrayidxB = getelementptr inbounds i32, i32* %b, i64 %i.2
+  br label %inner.ph
+
+inner.ph:
+; CHECK: vector.ph:
+; CHECK: load i32, i32* %arrayidxB,
+; CHECK: br label %vector.body
+  br label %inner
+
+inner:
+  %j.2 = phi i64 [ 0, %inner.ph ], [ %j, %inner ]
+
+  %arrayidxA = getelementptr inbounds i32, i32* %a, i64 %j.2
+  %loadA = load i32, i32* %arrayidxA, align 4
+
+  %loadB = load i32, i32* %arrayidxB, align 4
+
+  %arrayidxC = getelementptr inbounds i32, i32* %c, i64 %j.2
+  %loadC = load i32, i32* %arrayidxC, align 4
+
+  %add = add nuw i32 %loadA, %loadB
+  %add2 = add nuw i32 %add, %loadC
+
+  store i32 %add2, i32* %arrayidxA, align 4
+
+  %j = add nuw nsw i64 %j.2, 1
+  %cond1 = icmp eq i64 %j, 20
+  br i1 %cond1, label %inner.end, label %inner
+
+inner.end:
+  %i = add nuw nsw i64 %i.2, 1
+  %cond2 = icmp eq i64 %i, 30
+  br i1 %cond2, label %outer.end, label %outer
+
+outer.end:
+  ret void
+}

Added: llvm/trunk/test/Transforms/LoopVectorize/noalias-md.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/noalias-md.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/noalias-md.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/noalias-md.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,78 @@
+; RUN: opt -basicaa -loop-vectorize -force-vector-width=2 \
+; RUN:     -force-vector-interleave=1 -S < %s \
+; RUN:     | FileCheck %s -check-prefix=BOTH -check-prefix=LV
+; RUN: opt -basicaa -scoped-noalias -loop-vectorize -dse -force-vector-width=2 \
+; RUN:     -force-vector-interleave=1 -S < %s \
+; RUN:     | FileCheck %s -check-prefix=BOTH -check-prefix=DSE
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+; This loop needs to be versioned with memchecks between {A, B} x {C} before
+; it can be vectorized.
+;
+;   for (i = 0; i < n; i++) {
+;     C[i] = A[i] + 1;
+;     C[i] += B[i];
+;   }
+;
+; Check that the corresponding noalias metadata is added to the vector loop
+; but not to the scalar loop.
+;
+; Since in the versioned vector loop C and B can no longer alias, the first
+; store to C[i] can be DSE'd.
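+;
+; That is, within the versioned vector loop (hypothetical sketch):
+;   C[i] = A[i] + 1;            // dead: cannot alias the B[i] load below
+;   C[i] = (A[i] + 1) + B[i];   // only this store survives DSE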
+
+
+define void @f(i32* %a, i32* %b, i32* %c) {
+entry:
+  br label %for.body
+
+; BOTH: vector.memcheck:
+; BOTH: vector.body:
+for.body:                                         ; preds = %for.body, %entry
+  %ind = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+
+  %arrayidxA = getelementptr inbounds i32, i32* %a, i64 %ind
+; Scope 1
+; LV: = load {{.*}} !alias.scope !0
+  %loadA = load i32, i32* %arrayidxA, align 4
+
+  %add = add nuw i32 %loadA, 2
+
+  %arrayidxC = getelementptr inbounds i32, i32* %c, i64 %ind
+; Noalias with scope 1 and 6
+; LV: store {{.*}} !alias.scope !3, !noalias !5
+; DSE-NOT: store
+  store i32 %add, i32* %arrayidxC, align 4
+
+  %arrayidxB = getelementptr inbounds i32, i32* %b, i64 %ind
+; Scope 6
+; LV: = load {{.*}} !alias.scope !7
+  %loadB = load i32, i32* %arrayidxB, align 4
+
+  %add2 = add nuw i32 %add, %loadB
+
+; Noalias with scope 1 and 6
+; LV: store {{.*}} !alias.scope !3, !noalias !5
+; DSE: store
+  store i32 %add2, i32* %arrayidxC, align 4
+
+  %inc = add nuw nsw i64 %ind, 1
+  %exitcond = icmp eq i64 %inc, 20
+  br i1 %exitcond, label %for.end, label %for.body
+
+; BOTH: for.body:
+; BOTH-NOT: !alias.scope
+; BOTH-NOT: !noalias
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+; LV: !0 = !{!1}
+; LV: !1 = distinct !{!1, !2}
+; LV: !2 = distinct !{!2, !"LVerDomain"}
+; LV: !3 = !{!4}
+; LV: !4 = distinct !{!4, !2}
+; LV: !5 = !{!1, !6}
+; LV: !6 = distinct !{!6, !2}
+; LV: !7 = !{!6}

Added: llvm/trunk/test/Transforms/LoopVectorize/nofloat.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/nofloat.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/nofloat.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/nofloat.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,28 @@
+; RUN: opt < %s  -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
+
+; Make sure that we don't vectorize functions with the 'noimplicitfloat' attribute:
+; vector instructions typically use the FP/SIMD register file, which this attribute
+; forbids the compiler from using implicitly.
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+ at a = common global [2048 x i32] zeroinitializer, align 16
+
+;CHECK-LABEL: @example12(
+;CHECK-NOT: store <4 x i32>
+;CHECK: ret void
+define void @example12() noimplicitfloat { ;           <--------- "noimplicitfloat" attribute here!
+  br label %1
+
+; <label>:1                                       ; preds = %1, %0
+  %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ]
+  %2 = getelementptr inbounds [2048 x i32], [2048 x i32]* @a, i64 0, i64 %indvars.iv
+  %3 = trunc i64 %indvars.iv to i32
+  store i32 %3, i32* %2, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 1024
+  br i1 %exitcond, label %4, label %1
+
+; <label>:4                                       ; preds = %1
+  ret void
+}
+

Added: llvm/trunk/test/Transforms/LoopVectorize/non-const-n.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/non-const-n.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/non-const-n.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/non-const-n.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,37 @@
+; RUN: opt < %s  -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+@b = common global [2048 x i32] zeroinitializer, align 16
+@c = common global [2048 x i32] zeroinitializer, align 16
+@a = common global [2048 x i32] zeroinitializer, align 16
+
+;CHECK-LABEL: @example1(
+;CHECK: shl i32
+;CHECK: zext i32
+;CHECK: load <4 x i32>
+;CHECK: add nsw <4 x i32>
+;CHECK: store <4 x i32>
+;CHECK: ret void
+define void @example1(i32 %n) nounwind uwtable ssp {
+  %n4 = shl i32 %n, 2
+  br label %1
+
+; <label>:1                                       ; preds = %1, %0
+  %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ]
+  %2 = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 %indvars.iv
+  %3 = load i32, i32* %2, align 4
+  %4 = getelementptr inbounds [2048 x i32], [2048 x i32]* @c, i64 0, i64 %indvars.iv
+  %5 = load i32, i32* %4, align 4
+  %6 = add nsw i32 %5, %3
+  %7 = getelementptr inbounds [2048 x i32], [2048 x i32]* @a, i64 0, i64 %indvars.iv
+  store i32 %6, i32* %7, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n4
+  br i1 %exitcond, label %8, label %1
+
+; <label>:8                                       ; preds = %1
+  ret void
+}
+

Added: llvm/trunk/test/Transforms/LoopVectorize/nontemporal.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/nontemporal.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/nontemporal.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/nontemporal.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,46 @@
+; RUN: opt < %s  -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -instcombine -S | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+
+; CHECK-LABEL: @foo(
+define void @foo(float* noalias %a, float* noalias %b, float* noalias %c, i32 %N) {
+entry:
+  %cmp.4 = icmp sgt i32 %N, 0
+  br i1 %cmp.4, label %for.body.preheader, label %for.end
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
+
+; Check that we don't lose the !nontemporal hint when vectorizing loads. (A C-level sketch of this source pattern follows the test.)
+; CHECK: %wide.load{{[0-9]*}} = load <4 x float>, <4 x float>* %{{[0-9]+}}, align 4, !nontemporal !0
+  %arrayidx = getelementptr inbounds float, float* %b, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4, !nontemporal !0
+
+; Check that we don't introduce a !nontemporal hint when the original scalar loads didn't have one.
+; CHECK: %wide.load{{[0-9]+}} = load <4 x float>, <4 x float>* %{{[0-9]+}}, align 4{{$}}
+  %arrayidx2 = getelementptr inbounds float, float* %c, i64 %indvars.iv
+  %1 = load float, float* %arrayidx2, align 4
+  %add = fadd float %0, %1
+
+; Check that we don't lose the !nontemporal hint when vectorizing stores.
+; CHECK: store <4 x float> %{{[0-9]+}}, <4 x float>* %{{[0-9]+}}, align 4, !nontemporal !0
+  %arrayidx4 = getelementptr inbounds float, float* %a, i64 %indvars.iv
+  store float %add, float* %arrayidx4, align 4, !nontemporal !0
+
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %N
+  br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:                                 ; preds = %for.body
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %entry
+; CHECK: ret void
+  ret void
+}
+
+!0 = !{i32 1}
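
A hedged C-level sketch of the source pattern this test covers, using
clang's __builtin_nontemporal_load/__builtin_nontemporal_store builtins
(the builtin names are per clang's documentation; everything else here is
invented):

    /* Only the accesses issued through the builtins should carry
       !nontemporal after vectorization; the plain load of c[i] must stay
       unannotated. */
    void foo(float *restrict a, float *restrict b, float *restrict c, int n) {
      for (int i = 0; i < n; i++) {
        float x = __builtin_nontemporal_load(&b[i]); /* keeps the hint */
        float y = c[i];                              /* ordinary load  */
        __builtin_nontemporal_store(x + y, &a[i]);   /* keeps the hint */
      }
    }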

Added: llvm/trunk/test/Transforms/LoopVectorize/nsw-crash.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/nsw-crash.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/nsw-crash.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/nsw-crash.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,24 @@
+; RUN: opt < %s  -loop-vectorize -force-vector-interleave=1 -force-vector-width=4
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+define void @test() {
+entry:
+  br i1 undef, label %while.end, label %while.body.lr.ph
+
+while.body.lr.ph:
+  br label %while.body
+
+while.body:
+  %it.sroa.0.091 = phi i32* [ undef, %while.body.lr.ph ], [ %incdec.ptr.i, %while.body ]
+  %incdec.ptr.i = getelementptr inbounds i32, i32* %it.sroa.0.091, i64 1
+  %inc32 = add i32 undef, 1                                        ; <------------- Make sure we don't set the NSW flag on this add of undef.
+  %cmp.i11 = icmp eq i32* %incdec.ptr.i, undef
+  br i1 %cmp.i11, label %while.end, label %while.body
+
+while.end:
+  ret void
+}
+
+

Added: llvm/trunk/test/Transforms/LoopVectorize/opt.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/opt.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/opt.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/opt.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,27 @@
+; RUN: opt -S -O3 -force-vector-width=2 -force-vector-interleave=1 < %s | FileCheck --check-prefix=LOOPVEC %s
+; RUN: opt -S -O3 -disable-loop-vectorization -force-vector-width=2 -force-vector-interleave=1 < %s | FileCheck --check-prefix=NOLOOPVEC %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+; Make sure we can disable vectorization in opt.
+
+; LOOPVEC:       add <2 x i32>
+; NOLOOPVEC-NOT: add <2 x i32>
+
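For context, @vect below is a plain integer sum reduction; the two RUN
lines should then differ only in whether the add is widened to <2 x i32>.
A C sketch (names invented):

    int vect(int *a) {
      int red = 0;
      for (long i = 0; i < 255; i++)  /* 255 iterations, as in the IR */
        red += a[i];
      return red;
    }
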
+define i32 @vect(i32* %a) {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %red.05 = phi i32 [ 0, %entry ], [ %add, %for.body ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %add = add nsw i32 %0, %red.05
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 255
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret i32 %add
+}

Added: llvm/trunk/test/Transforms/LoopVectorize/optsize.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/optsize.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/optsize.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/optsize.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,102 @@
+; This test verifies that the loop vectorizer does NOT produce a scalar tail
+; (epilogue) loop when the function carries the optsize or minsize attribute.
+; REQUIRES: asserts
+; RUN: opt < %s -loop-vectorize -S | FileCheck %s
+; RUN: opt < %s -loop-vectorize -pgso -S | FileCheck %s -check-prefix=PGSO
+; RUN: opt < %s -loop-vectorize -pgso=false -S | FileCheck %s -check-prefix=NPGSO
+
+target datalayout = "E-m:e-p:32:32-i64:32-f64:32:64-a:0:32-n32-S128"
+
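Hedged C-level analogues of the functions below: optsize corresponds to
building with -Os, minsize to -Oz (clang also accepts
__attribute__((minsize))), and the PGSO case comes from profile data
rather than a source attribute. A sketch:

    /* tab is [32 x i8] in the IR; a larger array is used here so the
       sketch stays in bounds for the 203 iterations. */
    char tab[256];

    int foo_optsize(void) {          /* build with -Os */
      for (int i = 0; i <= 202; i++)
        tab[i] = tab[i] == 0 ? 2 : 1;
      return 0;
    }

    __attribute__((minsize))
    int foo_minsize(void) {          /* or build with -Oz */
      for (int i = 0; i <= 202; i++)
        tab[i] = tab[i] == 0 ? 2 : 1;
      return 0;
    }
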
+@tab = common global [32 x i8] zeroinitializer, align 1
+
+define i32 @foo_optsize() #0 {
+; CHECK-LABEL: @foo_optsize(
+; CHECK-NOT: <2 x i8>
+; CHECK-NOT: <4 x i8>
+
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.08 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %arrayidx = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 %i.08
+  %0 = load i8, i8* %arrayidx, align 1
+  %cmp1 = icmp eq i8 %0, 0
+  %. = select i1 %cmp1, i8 2, i8 1
+  store i8 %., i8* %arrayidx, align 1
+  %inc = add nsw i32 %i.08, 1
+  %exitcond = icmp eq i32 %i.08, 202
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret i32 0
+}
+
+attributes #0 = { optsize }
+
+define i32 @foo_minsize() #1 {
+; CHECK-LABEL: @foo_minsize(
+; CHECK-NOT: <2 x i8>
+; CHECK-NOT: <4 x i8>
+; CHECK-LABEL: @foo_pgso(
+
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.08 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %arrayidx = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 %i.08
+  %0 = load i8, i8* %arrayidx, align 1
+  %cmp1 = icmp eq i8 %0, 0
+  %. = select i1 %cmp1, i8 2, i8 1
+  store i8 %., i8* %arrayidx, align 1
+  %inc = add nsw i32 %i.08, 1
+  %exitcond = icmp eq i32 %i.08, 202
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret i32 0
+}
+
+attributes #1 = { minsize }
+
+define i32 @foo_pgso() !prof !14 {
+; PGSO-LABEL: @foo_pgso(
+; PGSO-NOT: <{{[0-9]+}} x i8>
+; NPGSO-LABEL: @foo_pgso(
+; NPGSO: <{{[0-9]+}} x i8>
+
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.08 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %arrayidx = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 %i.08
+  %0 = load i8, i8* %arrayidx, align 1
+  %cmp1 = icmp eq i8 %0, 0
+  %. = select i1 %cmp1, i8 2, i8 1
+  store i8 %., i8* %arrayidx, align 1
+  %inc = add nsw i32 %i.08, 1
+  %exitcond = icmp eq i32 %i.08, 202
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret i32 0
+}
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"ProfileSummary", !1}
+!1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
+!2 = !{!"ProfileFormat", !"InstrProf"}
+!3 = !{!"TotalCount", i64 10000}
+!4 = !{!"MaxCount", i64 10}
+!5 = !{!"MaxInternalCount", i64 1}
+!6 = !{!"MaxFunctionCount", i64 1000}
+!7 = !{!"NumCounts", i64 3}
+!8 = !{!"NumFunctions", i64 3}
+!9 = !{!"DetailedSummary", !10}
+!10 = !{!11, !12, !13}
+!11 = !{i32 10000, i64 100, i32 1}
+!12 = !{i32 999000, i64 100, i32 1}
+!13 = !{i32 999999, i64 1, i32 2}
+!14 = !{!"function_entry_count", i64 0}

Added: llvm/trunk/test/Transforms/LoopVectorize/outer_loop_test1.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/outer_loop_test1.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/outer_loop_test1.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/outer_loop_test1.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,82 @@
+; extern int arr[8][8];
+; extern int arr2[8];
+;
+; void foo(int n)
+; {
+;   int i1, i2;
+;
+; #pragma clang loop vectorize(enable) vectorize_width(4)
+;   for (i1 = 0; i1 < 8; i1++) {
+;     arr2[i1] = i1;
+;     for (i2 = 0; i2 < 8; i2++)
+;       arr[i2][i1] = i1 + n;
+;   }
+; }
+;
+; RUN: opt -S -loop-vectorize -enable-vplan-native-path < %s | FileCheck %s
+; CHECK-LABEL: vector.ph:
+; CHECK: %[[SplatVal:.*]] = insertelement <4 x i32> undef, i32 %n, i32 0
+; CHECK: %[[Splat:.*]] = shufflevector <4 x i32> %[[SplatVal]], <4 x i32> undef, <4 x i32> zeroinitializer
+
+; CHECK-LABEL: vector.body:
+; CHECK: %[[Ind:.*]] = phi i64 [ 0, %vector.ph ], [ %[[IndNext:.*]], %[[ForInc:.*]] ]
+; CHECK: %[[VecInd:.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %vector.ph ], [ %[[VecIndNext:.*]], %[[ForInc]] ]
+; CHECK: %[[AAddr:.*]] = getelementptr inbounds [8 x i32], [8 x i32]* @arr2, i64 0, <4 x i64> %[[VecInd]]
+; CHECK: %[[VecIndTr:.*]] = trunc <4 x i64> %[[VecInd]] to <4 x i32>
+; CHECK: call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %[[VecIndTr]], <4 x i32*> %[[AAddr]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+; CHECK: %[[VecIndTr2:.*]] = trunc <4 x i64> %[[VecInd]] to <4 x i32>
+; CHECK: %[[StoreVal:.*]] = add nsw <4 x i32> %[[VecIndTr2]], %[[Splat]]
+; CHECK: br label %[[InnerLoop:.+]]
+
+; CHECK: [[InnerLoop]]:
+; CHECK: %[[InnerPhi:.*]] = phi <4 x i64> [ %[[InnerPhiNext:.*]], %[[InnerLoop]] ], [ zeroinitializer, %vector.body ]
+; CHECK: %[[AAddr2:.*]] = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* @arr, i64 0, <4 x i64> %[[InnerPhi]], <4 x i64> %[[VecInd]]
+; CHECK: call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %[[StoreVal]], <4 x i32*> %[[AAddr2]], i32 4, <4 x i1> <i1 true, i1 true, i1 true
+; CHECK: %[[InnerPhiNext]] = add nuw nsw <4 x i64> %[[InnerPhi]], <i64 1, i64 1, i64 1, i64 1>
+; CHECK: %[[VecCond:.*]] = icmp eq <4 x i64> %[[InnerPhiNext]], <i64 8, i64 8, i64 8, i64 8>
+; CHECK: %[[InnerCond:.*]] = extractelement <4 x i1> %[[VecCond]], i32 0
+; CHECK: br i1 %[[InnerCond]], label %[[ForInc]], label %[[InnerLoop]]
+
+; CHECK: [[ForInc]]:
+; CHECK: %[[IndNext]] = add i64 %[[Ind]], 4
+; CHECK: %[[VecIndNext]] = add <4 x i64> %[[VecInd]], <i64 4, i64 4, i64 4, i64 4>
+; CHECK: %[[Cmp:.*]] = icmp eq i64 %[[IndNext]], 8
+; CHECK: br i1 %[[Cmp]], label %middle.block, label %vector.body
+
+@arr2 = external global [8 x i32], align 16
+@arr = external global [8 x [8 x i32]], align 16
+
+; Function Attrs: norecurse nounwind uwtable
+define void @foo(i32 %n) {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.inc8, %entry
+  %indvars.iv21 = phi i64 [ 0, %entry ], [ %indvars.iv.next22, %for.inc8 ]
+  %arrayidx = getelementptr inbounds [8 x i32], [8 x i32]* @arr2, i64 0, i64 %indvars.iv21
+  %0 = trunc i64 %indvars.iv21 to i32
+  store i32 %0, i32* %arrayidx, align 4
+  %1 = trunc i64 %indvars.iv21 to i32
+  %add = add nsw i32 %1, %n
+  br label %for.body3
+
+for.body3:                                        ; preds = %for.body3, %for.body
+  %indvars.iv = phi i64 [ 0, %for.body ], [ %indvars.iv.next, %for.body3 ]
+  %arrayidx7 = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* @arr, i64 0, i64 %indvars.iv, i64 %indvars.iv21
+  store i32 %add, i32* %arrayidx7, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 8
+  br i1 %exitcond, label %for.inc8, label %for.body3
+
+for.inc8:                                         ; preds = %for.body3
+  %indvars.iv.next22 = add nuw nsw i64 %indvars.iv21, 1
+  %exitcond23 = icmp eq i64 %indvars.iv.next22, 8
+  br i1 %exitcond23, label %for.end10, label %for.body, !llvm.loop !1
+
+for.end10:                                        ; preds = %for.inc8
+  ret void
+}
+
+!1 = distinct !{!1, !2, !3}
+!2 = !{!"llvm.loop.vectorize.width", i32 4}
+!3 = !{!"llvm.loop.vectorize.enable", i1 true}

Added: llvm/trunk/test/Transforms/LoopVectorize/outer_loop_test2.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/outer_loop_test2.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/outer_loop_test2.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/outer_loop_test2.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,112 @@
+; int A[1024], B[1024];
+;
+; void foo(int iCount, int c, int jCount)
+; {
+;
+;   int i, j;
+;
+; #pragma clang loop vectorize(enable) vectorize_width(4)
+;   for (i = 0; i < iCount; i++) {
+;     A[i] = c;
+;     for (j = 0; j < jCount; j++) {
+;       A[i] += B[j] + i;
+;     }
+;   }
+; }
+; RUN: opt -S -loop-vectorize -enable-vplan-native-path < %s | FileCheck %s
+; CHECK: %[[ZeroTripChk:.*]] = icmp sgt i32 %jCount, 0
+; CHECK-LABEL: vector.ph:
+; CHECK: %[[CVal0:.*]] = insertelement <4 x i32> undef, i32 %c, i32 0
+; CHECK-NEXT: %[[CSplat:.*]] = shufflevector <4 x i32> %[[CVal0]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK: %[[ZVal0:.*]] = insertelement <4 x i1> undef, i1 %[[ZeroTripChk]], i32 0
+; CHECK-NEXT: %[[ZSplat:.*]] = shufflevector <4 x i1> %[[ZVal0]], <4 x i1> undef, <4 x i32> zeroinitializer
+
+; CHECK-LABEL: vector.body:
+; CHECK: %[[Ind:.*]] = phi i64 [ 0, %vector.ph ], [ %[[IndNext:.*]], %[[ForInc:.*]] ]
+; CHECK: %[[VecInd:.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %vector.ph ], [ %[[VecIndNext:.*]], %[[ForInc]] ]
+; CHECK: %[[AAddr:.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @A, i64 0, <4 x i64> %[[VecInd]]
+; CHECK: call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %[[CSplat]], <4 x i32*> %[[AAddr]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+; CHECK: %[[ZCmpExtr:.*]] = extractelement <4 x i1> %[[ZSplat]], i32 0
+; CHECK: br i1 %[[ZCmpExtr]], label %[[InnerForPh:.*]], label %[[OuterInc:.*]]
+
+; CHECK: [[InnerForPh]]:
+; CHECK: %[[WideAVal:.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %[[AAddr]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
+; CHECK: %[[VecIndTr:.*]] = trunc <4 x i64> %[[VecInd]] to <4 x i32>
+; CHECK: br label %[[InnerForBody:.*]]
+
+; CHECK: [[InnerForBody]]:
+; CHECK: %[[InnerInd:.*]] = phi <4 x i64> [ %[[InnerIndNext:.*]], %[[InnerForBody]] ], [ zeroinitializer, %[[InnerForPh]] ]
+; CHECK: %[[AccumPhi:.*]] = phi <4 x i32> [ %[[AccumPhiNext:.*]], %[[InnerForBody]] ], [ %[[WideAVal]], %[[InnerForPh]] ]
+; CHECK: %[[BAddr:.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @B, i64 0, <4 x i64> %[[InnerInd]]
+; CHECK: %[[WideBVal:.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %[[BAddr]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
+; CHECK: %[[Add1:.*]] = add nsw <4 x i32> %[[WideBVal]], %[[VecIndTr]]
+; CHECK: %[[AccumPhiNext]] = add nsw <4 x i32> %[[Add1]], %[[AccumPhi]]
+; CHECK: %[[InnerIndNext]] = add nuw nsw <4 x i64> %[[InnerInd]], <i64 1, i64 1, i64 1, i64 1>
+; CHECK: %[[InnerVecCond:.*]] = icmp eq <4 x i64> %[[InnerIndNext]], {{.*}}
+; CHECK: %[[InnerCond:.+]] = extractelement <4 x i1> %[[InnerVecCond]], i32 0
+; CHECK: br i1 %[[InnerCond]], label %[[InnerCrit:.*]], label %[[InnerForBody]]
+
+; CHECK: [[InnerCrit]]:
+; CHECK: %[[StorePhi:.*]] = phi <4 x i32> [ %[[AccumPhiNext]], %[[InnerForBody]] ]
+; CHECK: call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %[[StorePhi]], <4 x i32*> %[[AAddr]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+; CHECK:  br label %[[ForInc]]
+
+; CHECK: [[ForInc]]:
+; CHECK: %[[IndNext]] = add i64 %[[Ind]], 4
+; CHECK: %[[VecIndNext]] = add <4 x i64> %[[VecInd]], <i64 4, i64 4, i64 4, i64 4>
+; CHECK: %[[Cmp:.*]] = icmp eq i64 %[[IndNext]], {{.*}}
+; CHECK: br i1 %[[Cmp]], label %middle.block, label %vector.body
+
+@A = common global [1024 x i32] zeroinitializer, align 16
+@B = common global [1024 x i32] zeroinitializer, align 16
+
+; Function Attrs: norecurse nounwind uwtable
+define void @foo(i32 %iCount, i32 %c, i32 %jCount) {
+entry:
+  %cmp22 = icmp sgt i32 %iCount, 0
+  br i1 %cmp22, label %for.body.lr.ph, label %for.end11
+
+for.body.lr.ph:                                   ; preds = %entry
+  %cmp220 = icmp sgt i32 %jCount, 0
+  %wide.trip.count = zext i32 %jCount to i64
+  %wide.trip.count27 = zext i32 %iCount to i64
+  br label %for.body
+
+for.body:                                         ; preds = %for.inc9, %for.body.lr.ph
+  %indvars.iv25 = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next26, %for.inc9 ]
+  %arrayidx = getelementptr inbounds [1024 x i32], [1024 x i32]* @A, i64 0, i64 %indvars.iv25
+  store i32 %c, i32* %arrayidx, align 4
+  br i1 %cmp220, label %for.body3.lr.ph, label %for.inc9
+
+for.body3.lr.ph:                                  ; preds = %for.body
+  %arrayidx.promoted = load i32, i32* %arrayidx, align 4
+  %0 = trunc i64 %indvars.iv25 to i32
+  br label %for.body3
+
+for.body3:                                        ; preds = %for.body3, %for.body3.lr.ph
+  %indvars.iv = phi i64 [ 0, %for.body3.lr.ph ], [ %indvars.iv.next, %for.body3 ]
+  %1 = phi i32 [ %arrayidx.promoted, %for.body3.lr.ph ], [ %add8, %for.body3 ]
+  %arrayidx5 = getelementptr inbounds [1024 x i32], [1024 x i32]* @B, i64 0, i64 %indvars.iv
+  %2 = load i32, i32* %arrayidx5, align 4
+  %add = add nsw i32 %2, %0
+  %add8 = add nsw i32 %add, %1
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond, label %for.cond1.for.inc9_crit_edge, label %for.body3
+
+for.cond1.for.inc9_crit_edge:                     ; preds = %for.body3
+  store i32 %add8, i32* %arrayidx, align 4
+  br label %for.inc9
+
+for.inc9:                                         ; preds = %for.cond1.for.inc9_crit_edge, %for.body
+  %indvars.iv.next26 = add nuw nsw i64 %indvars.iv25, 1
+  %exitcond28 = icmp eq i64 %indvars.iv.next26, %wide.trip.count27
+  br i1 %exitcond28, label %for.end11, label %for.body, !llvm.loop !1
+
+for.end11:                                        ; preds = %for.inc9, %entry
+  ret void
+}
+
+!1 = distinct !{!1, !2, !3}
+!2 = !{!"llvm.loop.vectorize.width", i32 4}
+!3 = !{!"llvm.loop.vectorize.enable", i1 true}

Added: llvm/trunk/test/Transforms/LoopVectorize/partial-lcssa.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/partial-lcssa.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/partial-lcssa.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/partial-lcssa.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,54 @@
+; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -S | FileCheck %s
+; We vectorize the inner loop, so we have to put it in LCSSA form.
+; However, there's no reason to touch the outer loop.
+
+; CHECK-LABEL: @foo
+; CHECK-LABEL: for.end.inner.loopexit:
+; CHECK: %[[LCSSAPHI:.*]] = phi i64 [ %indvars.iv, %for.body.inner ], [ %{{.*}}, %middle.block ]
+; CHECK: store i64 %[[LCSSAPHI]], i64* %O1, align 4
+; CHECK-LABEL: for.end.outer.loopexit
+; CHECK: store i64 %indvars.outer, i64* %O2, align 4
+
+
+define i64 @foo(i32* nocapture %A, i32* nocapture %B, i64 %n, i64 %m, i64* %O1, i64* %O2) {
+entry:
+  %cmp = icmp sgt i64 %n, 0
+  br i1 %cmp, label %for.body.outer.preheader, label %for.end.outer
+
+for.body.outer.preheader:                         ; preds = %entry
+  br label %for.body.outer
+
+for.body.outer:                                   ; preds = %for.body.outer.preheader, %for.end.inner
+  %indvars.outer = phi i64 [ %indvars.outer.next, %for.end.inner ], [ 0, %for.body.outer.preheader ]
+  %cmp2 = icmp sgt i64 %m, 0
+  br i1 %cmp2, label %for.body.inner.preheader, label %for.end.inner
+
+for.body.inner.preheader:                         ; preds = %for.body.outer
+  br label %for.body.inner
+
+for.body.inner:                                   ; preds = %for.body.inner.preheader, %for.body.inner
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body.inner ], [ 0, %for.body.inner.preheader ]
+  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+  %v = load i32, i32* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
+  store i32 %v, i32* %arrayidx2, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv, %n
+  br i1 %exitcond, label %for.end.inner.loopexit, label %for.body.inner
+
+for.end.inner.loopexit:                           ; preds = %for.body.inner
+  store i64 %indvars.iv, i64* %O1, align 4
+  br label %for.end.inner
+
+for.end.inner:                                    ; preds = %for.end.inner.loopexit, %for.body.outer
+  %indvars.outer.next = add i64 %indvars.outer, 1
+  %exitcond.outer = icmp eq i64 %indvars.outer, %m
+  br i1 %exitcond.outer, label %for.end.outer.loopexit, label %for.body.outer
+
+for.end.outer.loopexit:                           ; preds = %for.end.inner
+  store i64 %indvars.outer, i64* %O2, align 4
+  br label %for.end.outer
+
+for.end.outer:                                    ; preds = %for.end.outer.loopexit, %entry
+  ret i64 undef
+}

Added: llvm/trunk/test/Transforms/LoopVectorize/phi-cost.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/phi-cost.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/phi-cost.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/phi-cost.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,86 @@
+; REQUIRES: asserts
+; RUN: opt < %s -loop-vectorize -force-vector-width=2 -force-vector-interleave=1 -instcombine -debug-only=loop-vectorize -disable-output -print-after=instcombine 2>&1 | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+
+; CHECK-LABEL: phi_two_incoming_values
+; CHECK:       LV: Found an estimated cost of 1 for VF 2 For instruction: %i = phi i64 [ %i.next, %if.end ], [ 0, %entry ]
+; CHECK:       LV: Found an estimated cost of 1 for VF 2 For instruction: %tmp5 = phi i32 [ %tmp1, %for.body ], [ %tmp4, %if.then ]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
+; CHECK:         [[WIDE_LOAD:%.*]] = load <2 x i32>, <2 x i32>* {{.*}}
+; CHECK:         [[TMP5:%.*]] = icmp sgt <2 x i32> [[WIDE_LOAD]], zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = zext <2 x i1> [[TMP5]] to <2 x i32>
+; CHECK-NEXT:    [[PREDPHI:%.*]] = add <2 x i32> [[WIDE_LOAD]], [[TMP6]]
+; CHECK:         store <2 x i32> [[PREDPHI]], <2 x i32>* {{.*}}
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 2
+;
+define void @phi_two_incoming_values(i32* %a, i32* %b, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ %i.next, %if.end ], [ 0, %entry ]
+  %tmp0 = getelementptr inbounds i32, i32* %a, i64 %i
+  %tmp1 = load i32, i32* %tmp0, align 4
+  %tmp2 = getelementptr inbounds i32, i32* %b, i64 %i
+  %tmp3 = icmp sgt i32 %tmp1, 0
+  br i1 %tmp3, label %if.then, label %if.end
+
+if.then:
+  %tmp4 = add i32 %tmp1, 1
+  br label %if.end
+
+if.end:
+  %tmp5 = phi i32 [ %tmp1, %for.body ], [ %tmp4, %if.then ]
+  store i32 %tmp5, i32* %tmp2, align 4
+  %i.next = add i64 %i, 1
+  %cond = icmp eq i64 %i, %n
+  br i1 %cond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
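In C terms, @phi_two_incoming_values above is the classic if-conversion
shape sketched below (invented names); the vectorizer turns the phi into
a blend, which is what the per-instruction cost lines check:

    void phi_two_incoming_values(int *a, int *b, long n) {
      for (long i = 0; i <= n; i++) {   /* IR exits when i == n */
        int x = a[i];
        if (x > 0)
          x = x + 1;                    /* becomes the blended value */
        b[i] = x;                       /* the phi of {x, x+1} */
      }
    }
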
+; CHECK-LABEL: phi_three_incoming_values
+; CHECK:       LV: Found an estimated cost of 1 for VF 2 For instruction: %i = phi i64 [ %i.next, %if.end ], [ 0, %entry ]
+; CHECK:       LV: Found an estimated cost of 2 for VF 2 For instruction: %tmp8 = phi i32 [ 9, %for.body ], [ 3, %if.then ], [ %tmp7, %if.else ]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
+; CHECK:         [[PREDPHI:%.*]] = select <2 x i1> {{.*}}, <2 x i32> <i32 3, i32 3>, <2 x i32> <i32 9, i32 9>
+; CHECK:         [[PREDPHI7:%.*]] = select <2 x i1> {{.*}}, <2 x i32> {{.*}}, <2 x i32> [[PREDPHI]]
+; CHECK:         store <2 x i32> [[PREDPHI7]], <2 x i32>* {{.*}}
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 2
+;
+define void @phi_three_incoming_values(i32* %a, i32* %b, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ %i.next, %if.end ], [ 0, %entry ]
+  %tmp0 = getelementptr inbounds i32, i32* %a, i64 %i
+  %tmp1 = load i32, i32* %tmp0, align 4
+  %tmp2 = getelementptr inbounds i32, i32* %b, i64 %i
+  %tmp3 = load i32, i32* %tmp2, align 4
+  %tmp4 = icmp sgt i32 %tmp1, %tmp3
+  br i1 %tmp4, label %if.then, label %if.end
+
+if.then:
+  %tmp5 = icmp sgt i32 %tmp1, 19
+  br i1 %tmp5, label %if.end, label %if.else
+
+if.else:
+  %tmp6 = icmp slt i32 %tmp3, 4
+  %tmp7 = select i1 %tmp6, i32 4, i32 5
+  br label %if.end
+
+if.end:
+  %tmp8 = phi i32 [ 9, %for.body ], [ 3, %if.then ], [ %tmp7, %if.else ]
+  store i32 %tmp8, i32* %tmp0, align 4
+  %i.next = add i64 %i, 1
+  %cond = icmp eq i64 %i, %n
+  br i1 %cond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}

Added: llvm/trunk/test/Transforms/LoopVectorize/phi-hang.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/phi-hang.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/phi-hang.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/phi-hang.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,47 @@
+; RUN: opt -S -loop-vectorize < %s
+
+; PR15384
+define void @test1(i32 %arg) {
+bb:
+  br label %bb1
+
+bb1:                                              ; preds = %bb5, %bb
+  %tmp = phi i32 [ 1, %bb ], [ %tmp7, %bb5 ]
+  %tmp2 = phi i32 [ %arg, %bb ], [ %tmp9, %bb5 ]
+  br i1 true, label %bb5, label %bb3
+
+bb3:                                              ; preds = %bb1
+  br label %bb4
+
+bb4:                                              ; preds = %bb3
+  br label %bb5
+
+bb5:                                              ; preds = %bb4, %bb1
+  %tmp6 = phi i32 [ 0, %bb4 ], [ %tmp, %bb1 ]
+  %tmp7 = phi i32 [ 0, %bb4 ], [ %tmp, %bb1 ]
+  %tmp8 = phi i32 [ 0, %bb4 ], [ %tmp, %bb1 ]
+  %tmp9 = add nsw i32 %tmp2, 1
+  %tmp10 = icmp eq i32 %tmp9, 0
+  br i1 %tmp10, label %bb11, label %bb1
+
+bb11:                                             ; preds = %bb5
+  ret void
+}
+
+; PR15748
+define void @test2() {
+bb:
+  br label %bb1
+
+bb1:                                              ; preds = %bb1, %bb
+  %tmp = phi i32 [ 0, %bb ], [ %tmp5, %bb1 ]
+  %tmp2 = phi i32 [ 0, %bb ], [ 1, %bb1 ]
+  %tmp3 = phi i32 [ 0, %bb ], [ %tmp4, %bb1 ]
+  %tmp4 = or i32 %tmp2, %tmp3
+  %tmp5 = add nsw i32 %tmp, 1
+  %tmp6 = icmp eq i32 %tmp5, 0
+  br i1 %tmp6, label %bb7, label %bb1
+
+bb7:                                              ; preds = %bb1
+  ret void
+}




More information about the llvm-commits mailing list