[llvm] r358552 - Revert "Temporarily Revert "Add basic loop fusion pass.""

Eric Christopher via llvm-commits llvm-commits at lists.llvm.org
Tue Apr 16 21:53:01 PDT 2019


Added: llvm/trunk/test/Transforms/LoopVectorize/AArch64/pr36032.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/AArch64/pr36032.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/AArch64/pr36032.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/AArch64/pr36032.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,153 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -loop-vectorize -S -mtriple=aarch64-unknown-linux-gnu -force-vector-interleave=1 -force-vector-width=4 < %s | FileCheck %s
+
+; The test checks that there is no assertion failure caused by the issue described in PR36032.
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+
+%struct.anon = type { i8 }
+
+@c = local_unnamed_addr global [6 x i8] zeroinitializer, align 1
+@b = internal global %struct.anon zeroinitializer, align 1
+
+; Function Attrs: noreturn nounwind
+define void @_Z1dv() local_unnamed_addr #0 {
+; CHECK-LABEL: @_Z1dv(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CALL:%.*]] = tail call i8* @"_ZN3$_01aEv"(%struct.anon* nonnull @b)
+; CHECK-NEXT:    [[SCEVGEP1:%.*]] = getelementptr i8, i8* [[CALL]], i64 4
+; CHECK-NEXT:    br label [[FOR_COND:%.*]]
+; CHECK:       for.cond:
+; CHECK-NEXT:    [[F_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD5:%.*]], [[FOR_COND_CLEANUP:%.*]] ]
+; CHECK-NEXT:    [[G_0:%.*]] = phi i32 [ undef, [[ENTRY]] ], [ [[G_1_LCSSA:%.*]], [[FOR_COND_CLEANUP]] ]
+; CHECK-NEXT:    [[CMP12:%.*]] = icmp ult i32 [[G_0]], 4
+; CHECK-NEXT:    [[CONV:%.*]] = and i32 [[F_0]], 65535
+; CHECK-NEXT:    br i1 [[CMP12]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_COND_CLEANUP]]
+; CHECK:       for.body.lr.ph:
+; CHECK-NEXT:    [[TMP0:%.*]] = zext i32 [[G_0]] to i64
+; CHECK-NEXT:    [[TMP1:%.*]] = sub i64 4, [[TMP0]]
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
+; CHECK:       vector.scevcheck:
+; CHECK-NEXT:    [[TMP2:%.*]] = sub i64 3, [[TMP0]]
+; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[G_0]], [[CONV]]
+; CHECK-NEXT:    [[TMP4:%.*]] = trunc i64 [[TMP2]] to i32
+; CHECK-NEXT:    [[MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 1, i32 [[TMP4]])
+; CHECK-NEXT:    [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL]], 0
+; CHECK-NEXT:    [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL]], 1
+; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[TMP3]], [[MUL_RESULT]]
+; CHECK-NEXT:    [[TMP6:%.*]] = sub i32 [[TMP3]], [[MUL_RESULT]]
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp ugt i32 [[TMP6]], [[TMP3]]
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp ult i32 [[TMP5]], [[TMP3]]
+; CHECK-NEXT:    [[TMP9:%.*]] = select i1 false, i1 [[TMP7]], i1 [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp ugt i64 [[TMP2]], 4294967295
+; CHECK-NEXT:    [[TMP11:%.*]] = or i1 [[TMP9]], [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = or i1 [[TMP11]], [[MUL_OVERFLOW]]
+; CHECK-NEXT:    [[TMP13:%.*]] = or i1 false, [[TMP12]]
+; CHECK-NEXT:    br i1 [[TMP13]], label [[SCALAR_PH]], label [[VECTOR_MEMCHECK:%.*]]
+; CHECK:       vector.memcheck:
+; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i8, i8* [[CALL]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP14:%.*]] = add i32 [[G_0]], [[CONV]]
+; CHECK-NEXT:    [[TMP15:%.*]] = zext i32 [[TMP14]] to i64
+; CHECK-NEXT:    [[SCEVGEP2:%.*]] = getelementptr [6 x i8], [6 x i8]* @c, i64 0, i64 [[TMP15]]
+; CHECK-NEXT:    [[TMP16:%.*]] = sub i64 [[TMP15]], [[TMP0]]
+; CHECK-NEXT:    [[SCEVGEP3:%.*]] = getelementptr i8, i8* getelementptr inbounds ([6 x i8], [6 x i8]* @c, i64 0, i64 4), i64 [[TMP16]]
+; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ult i8* [[SCEVGEP]], [[SCEVGEP3]]
+; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ult i8* [[SCEVGEP2]], [[SCEVGEP1]]
+; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; CHECK-NEXT:    [[MEMCHECK_CONFLICT:%.*]] = and i1 [[FOUND_CONFLICT]], true
+; CHECK-NEXT:    br i1 [[MEMCHECK_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP1]], 4
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP1]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[IND_END:%.*]] = add i64 [[TMP0]], [[N_VEC]]
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = add i64 [[TMP0]], [[INDEX]]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> undef, i64 [[OFFSET_IDX]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3>
+; CHECK-NEXT:    [[TMP17:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT:    [[OFFSET_IDX4:%.*]] = add i64 [[TMP0]], [[INDEX]]
+; CHECK-NEXT:    [[TMP18:%.*]] = trunc i64 [[OFFSET_IDX4]] to i32
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <4 x i32> undef, i32 [[TMP18]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT6:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT5]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[INDUCTION7:%.*]] = add <4 x i32> [[BROADCAST_SPLAT6]], <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP19:%.*]] = add i32 [[TMP18]], 0
+; CHECK-NEXT:    [[TMP20:%.*]] = add i32 [[CONV]], [[TMP19]]
+; CHECK-NEXT:    [[TMP21:%.*]] = zext i32 [[TMP20]] to i64
+; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [6 x i8], [6 x i8]* @c, i64 0, i64 [[TMP21]]
+; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i8, i8* [[TMP22]], i32 0
+; CHECK-NEXT:    [[TMP24:%.*]] = bitcast i8* [[TMP23]] to <4 x i8>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP24]], align 1, !alias.scope !0
+; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i8, i8* [[CALL]], i64 [[TMP17]]
+; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i8, i8* [[TMP25]], i32 0
+; CHECK-NEXT:    [[TMP27:%.*]] = bitcast i8* [[TMP26]] to <4 x i8>*
+; CHECK-NEXT:    store <4 x i8> [[WIDE_LOAD]], <4 x i8>* [[TMP27]], align 1, !alias.scope !3, !noalias !0
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !5
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[TMP0]], [[FOR_BODY_LR_PH]] ], [ [[TMP0]], [[VECTOR_SCEVCHECK]] ], [ [[TMP0]], [[VECTOR_MEMCHECK]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.cond.cleanup.loopexit:
+; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    [[G_1_LCSSA]] = phi i32 [ [[G_0]], [[FOR_COND]] ], [ 4, [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; CHECK-NEXT:    [[ADD5]] = add nuw nsw i32 [[CONV]], 4
+; CHECK-NEXT:    br label [[FOR_COND]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[TMP29:%.*]] = trunc i64 [[INDVARS_IV]] to i32
+; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[CONV]], [[TMP29]]
+; CHECK-NEXT:    [[IDXPROM:%.*]] = zext i32 [[ADD]] to i64
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [6 x i8], [6 x i8]* @c, i64 0, i64 [[IDXPROM]]
+; CHECK-NEXT:    [[TMP30:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i8, i8* [[CALL]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    store i8 [[TMP30]], i8* [[ARRAYIDX3]], align 1
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop !7
+;
+entry:
+  %call = tail call i8* @"_ZN3$_01aEv"(%struct.anon* nonnull @b) #2
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.cond.cleanup, %entry
+  %f.0 = phi i32 [ 0, %entry ], [ %add5, %for.cond.cleanup ]
+  %g.0 = phi i32 [ undef, %entry ], [ %g.1.lcssa, %for.cond.cleanup ]
+  %cmp12 = icmp ult i32 %g.0, 4
+  %conv = and i32 %f.0, 65535
+  br i1 %cmp12, label %for.body.lr.ph, label %for.cond.cleanup
+
+for.body.lr.ph:                                   ; preds = %for.cond
+  %0 = zext i32 %g.0 to i64
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %for.body
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %for.cond
+  %g.1.lcssa = phi i32 [ %g.0, %for.cond ], [ 4, %for.cond.cleanup.loopexit ]
+  %add5 = add nuw nsw i32 %conv, 4
+  br label %for.cond
+
+for.body:                                         ; preds = %for.body, %for.body.lr.ph
+  %indvars.iv = phi i64 [ %0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ]
+  %1 = trunc i64 %indvars.iv to i32
+  %add = add i32 %conv, %1
+  %idxprom = zext i32 %add to i64
+  %arrayidx = getelementptr inbounds [6 x i8], [6 x i8]* @c, i64 0, i64 %idxprom
+  %2 = load i8, i8* %arrayidx, align 1
+  %arrayidx3 = getelementptr inbounds i8, i8* %call, i64 %indvars.iv
+  store i8 %2, i8* %arrayidx3, align 1
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 4
+  br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+declare i8* @"_ZN3$_01aEv"(%struct.anon*) local_unnamed_addr #1

Added: llvm/trunk/test/Transforms/LoopVectorize/AArch64/predication_costs.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/AArch64/predication_costs.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/AArch64/predication_costs.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/AArch64/predication_costs.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,231 @@
+; REQUIRES: asserts
+; RUN: opt < %s -force-vector-width=2 -loop-vectorize -debug-only=loop-vectorize -disable-output 2>&1 | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-gnu"
+
+; Check predication-related cost calculations, including scalarization overhead
+; and block probability scaling. Note that the functionality being tested is
+; not specific to AArch64. We specify a target to get actual values for the
+; instruction costs.
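+;
+; As a rough sketch of the model assumed by the checks below (not something
+; these tests spell out explicitly), each predicated instruction is costed
+; roughly as:
+;
+;   (cost of the scalar ops for all VF lanes + extract/insert overhead) / 2
+;
+; where the division by 2 reflects the assumed 50% block probability.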
+
+; CHECK-LABEL: predicated_udiv
+;
+; This test checks that we correctly compute the cost of the predicated udiv
+; instruction. If we assume the block probability is 50%, we compute the cost
+; as:
+;
+; Cost of udiv:
+;   (udiv(2) + extractelement(6) + insertelement(3)) / 2 = 5
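+;   (both udiv operands come from widened loads here, which presumably
+;   accounts for the larger extractelement term; 11/2 then apparently
+;   rounds down to the reported cost of 5)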
+;
+; CHECK: Scalarizing and predicating: %tmp4 = udiv i32 %tmp2, %tmp3
+; CHECK: Found an estimated cost of 5 for VF 2 For instruction: %tmp4 = udiv i32 %tmp2, %tmp3
+;
+define i32 @predicated_udiv(i32* %a, i32* %b, i1 %c, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.inc ]
+  %r = phi i32 [ 0, %entry ], [ %tmp6, %for.inc ]
+  %tmp0 = getelementptr inbounds i32, i32* %a, i64 %i
+  %tmp1 = getelementptr inbounds i32, i32* %b, i64 %i
+  %tmp2 = load i32, i32* %tmp0, align 4
+  %tmp3 = load i32, i32* %tmp1, align 4
+  br i1 %c, label %if.then, label %for.inc
+
+if.then:
+  %tmp4 = udiv i32 %tmp2, %tmp3
+  br label %for.inc
+
+for.inc:
+  %tmp5 = phi i32 [ %tmp3, %for.body ], [ %tmp4, %if.then]
+  %tmp6 = add i32 %r, %tmp5
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  %tmp7 = phi i32 [ %tmp6, %for.inc ]
+  ret i32 %tmp7
+}
+
+; CHECK-LABEL: predicated_store
+;
+; This test checks that we correctly compute the cost of the predicated store
+; instruction. If we assume the block probability is 50%, we compute the cost
+; as:
+;
+; Cost of store:
+;   (store(4) + extractelement(3)) / 2 = 3
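+;   (the extractelement term presumably covers pulling the stored value
+;   %tmp2 out of the widened add so it can be stored one lane at a time)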
+;
+; CHECK: Scalarizing and predicating: store i32 %tmp2, i32* %tmp0, align 4
+; CHECK: Found an estimated cost of 3 for VF 2 For instruction: store i32 %tmp2, i32* %tmp0, align 4
+;
+define void @predicated_store(i32* %a, i1 %c, i32 %x, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.inc ]
+  %tmp0 = getelementptr inbounds i32, i32* %a, i64 %i
+  %tmp1 = load i32, i32* %tmp0, align 4
+  %tmp2 = add nsw i32 %tmp1, %x
+  br i1 %c, label %if.then, label %for.inc
+
+if.then:
+  store i32 %tmp2, i32* %tmp0, align 4
+  br label %for.inc
+
+for.inc:
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
+
+; CHECK-LABEL: predicated_udiv_scalarized_operand
+;
+; This test checks that we correctly compute the cost of the predicated udiv
+; instruction and the add instruction it uses. The add is scalarized and sunk
+; inside the predicated block.  If we assume the block probability is 50%, we
+; compute the cost as:
+;
+; Cost of add:
+;   (add(2) + extractelement(3)) / 2 = 2
+; Cost of udiv:
+;   (udiv(2) + extractelement(3) + insertelement(3)) / 2 = 4
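+;   (compared with predicated_udiv above, the extractelement term appears to
+;   drop from 6 to 3 because the sunk add already yields a scalar operand,
+;   so only %tmp2 still has to be extracted from a vector)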
+;
+; CHECK: Scalarizing: %tmp3 = add nsw i32 %tmp2, %x
+; CHECK: Scalarizing and predicating: %tmp4 = udiv i32 %tmp2, %tmp3
+; CHECK: Found an estimated cost of 2 for VF 2 For instruction: %tmp3 = add nsw i32 %tmp2, %x
+; CHECK: Found an estimated cost of 4 for VF 2 For instruction: %tmp4 = udiv i32 %tmp2, %tmp3
+;
+define i32 @predicated_udiv_scalarized_operand(i32* %a, i1 %c, i32 %x, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.inc ]
+  %r = phi i32 [ 0, %entry ], [ %tmp6, %for.inc ]
+  %tmp0 = getelementptr inbounds i32, i32* %a, i64 %i
+  %tmp2 = load i32, i32* %tmp0, align 4
+  br i1 %c, label %if.then, label %for.inc
+
+if.then:
+  %tmp3 = add nsw i32 %tmp2, %x
+  %tmp4 = udiv i32 %tmp2, %tmp3
+  br label %for.inc
+
+for.inc:
+  %tmp5 = phi i32 [ %tmp2, %for.body ], [ %tmp4, %if.then]
+  %tmp6 = add i32 %r, %tmp5
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  %tmp7 = phi i32 [ %tmp6, %for.inc ]
+  ret i32 %tmp7
+}
+
+; CHECK-LABEL: predicated_store_scalarized_operand
+;
+; This test checks that we correctly compute the cost of the predicated store
+; instruction and the add instruction it uses. The add is scalarized and sunk
+; inside the predicated block.  If we assume the block probability is 50%, we
+; compute the cost as:
+;
+; Cost of add:
+;   (add(2) + extractelement(3)) / 2 = 2
+; Cost of store:
+;   store(4) / 2 = 2
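+;   (no extractelement term this time, presumably because the stored value
+;   %tmp2 is produced by the sunk scalar add rather than extracted from a
+;   vector as in predicated_store above)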
+;
+; CHECK: Scalarizing: %tmp2 = add nsw i32 %tmp1, %x
+; CHECK: Scalarizing and predicating: store i32 %tmp2, i32* %tmp0, align 4
+; CHECK: Found an estimated cost of 2 for VF 2 For instruction: %tmp2 = add nsw i32 %tmp1, %x
+; CHECK: Found an estimated cost of 2 for VF 2 For instruction: store i32 %tmp2, i32* %tmp0, align 4
+;
+define void @predicated_store_scalarized_operand(i32* %a, i1 %c, i32 %x, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.inc ]
+  %tmp0 = getelementptr inbounds i32, i32* %a, i64 %i
+  %tmp1 = load i32, i32* %tmp0, align 4
+  br i1 %c, label %if.then, label %for.inc
+
+if.then:
+  %tmp2 = add nsw i32 %tmp1, %x
+  store i32 %tmp2, i32* %tmp0, align 4
+  br label %for.inc
+
+for.inc:
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
+
+; CHECK-LABEL: predication_multi_context
+;
+; This test checks that we correctly compute the cost of multiple predicated
+; instructions in the same block. The sdiv, udiv, and store must be scalarized
+; and predicated. The sub feeding the store is scalarized and sunk inside the
+; store's predicated block. However, the add feeding the sdiv and udiv cannot
+; be sunk and is not scalarized. If we assume the block probability is 50%, we
+; compute the cost as:
+;
+; Cost of add:
+;   add(1) = 1
+; Cost of sdiv:
+;   (sdiv(2) + extractelement(6) + insertelement(3)) / 2 = 5
+; Cost of udiv:
+;   (udiv(2) + extractelement(6) + insertelement(3)) / 2 = 5
+; Cost of sub:
+;   (sub(2) + extractelement(3)) / 2 = 2
+; Cost of store:
+;   store(4) / 2 = 2
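+;   (the add keeps a plain vector cost of 1 with no probability scaling
+;   because it stays widened rather than being sunk into either predicated
+;   context; the other costs follow the same scaled pattern as the
+;   single-context tests above)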
+;
+; CHECK-NOT: Scalarizing: %tmp2 = add i32 %tmp1, %x
+; CHECK:     Scalarizing and predicating: %tmp3 = sdiv i32 %tmp1, %tmp2
+; CHECK:     Scalarizing and predicating: %tmp4 = udiv i32 %tmp3, %tmp2
+; CHECK:     Scalarizing: %tmp5 = sub i32 %tmp4, %x
+; CHECK:     Scalarizing and predicating: store i32 %tmp5, i32* %tmp0, align 4
+; CHECK:     Found an estimated cost of 1 for VF 2 For instruction: %tmp2 = add i32 %tmp1, %x
+; CHECK:     Found an estimated cost of 5 for VF 2 For instruction: %tmp3 = sdiv i32 %tmp1, %tmp2
+; CHECK:     Found an estimated cost of 5 for VF 2 For instruction: %tmp4 = udiv i32 %tmp3, %tmp2
+; CHECK:     Found an estimated cost of 2 for VF 2 For instruction: %tmp5 = sub i32 %tmp4, %x
+; CHECK:     Found an estimated cost of 2 for VF 2 For instruction: store i32 %tmp5, i32* %tmp0, align 4
+;
+define void @predication_multi_context(i32* %a, i1 %c, i32 %x, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.inc ]
+  %tmp0 = getelementptr inbounds i32, i32* %a, i64 %i
+  %tmp1 = load i32, i32* %tmp0, align 4
+  br i1 %c, label %if.then, label %for.inc
+
+if.then:
+  %tmp2 = add i32 %tmp1, %x
+  %tmp3 = sdiv i32 %tmp1, %tmp2
+  %tmp4 = udiv i32 %tmp3, %tmp2
+  %tmp5 = sub i32 %tmp4, %x
+  store i32 %tmp5, i32* %tmp0, align 4
+  br label %for.inc
+
+for.inc:
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}

Added: llvm/trunk/test/Transforms/LoopVectorize/AArch64/reduction-small-size.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/AArch64/reduction-small-size.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/AArch64/reduction-small-size.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/AArch64/reduction-small-size.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,171 @@
+; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -dce -instcombine -S | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-gnu"
+
+; CHECK-LABEL: @reduction_i8
+;
+; char reduction_i8(char *a, char *b, int n) {
+;   char sum = 0;
+;   for (int i = 0; i < n; ++i)
+;     sum += (a[i] + b[i]);
+;   return sum;
+; }
+;
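+; The accumulator is a char, so every trip masks the running sum back to
+; 8 bits (the "and ..., 255" below); this is what lets the vectorizer keep
+; the whole reduction in <16 x i8> and only zero-extend the final reduced
+; value, which is what the checks expect.
+;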
+; CHECK: vector.body:
+; CHECK:   phi <16 x i8>
+; CHECK:   load <16 x i8>
+; CHECK:   load <16 x i8>
+; CHECK:   add <16 x i8>
+; CHECK:   add <16 x i8>
+;
+; CHECK: middle.block:
+; CHECK:   [[Rdx:%[a-zA-Z0-9.]+]] = call i8 @llvm.experimental.vector.reduce.add.i8.v16i8(<16 x i8>
+; CHECK:   zext i8 [[Rdx]] to i32
+;
+define i8 @reduction_i8(i8* nocapture readonly %a, i8* nocapture readonly %b, i32 %n) {
+entry:
+  %cmp.12 = icmp sgt i32 %n, 0
+  br i1 %cmp.12, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+  br label %for.body
+
+for.cond.for.cond.cleanup_crit_edge:
+  %add5.lcssa = phi i32 [ %add5, %for.body ]
+  %conv6 = trunc i32 %add5.lcssa to i8
+  br label %for.cond.cleanup
+
+for.cond.cleanup:
+  %sum.0.lcssa = phi i8 [ %conv6, %for.cond.for.cond.cleanup_crit_edge ], [ 0, %entry ]
+  ret i8 %sum.0.lcssa
+
+for.body:
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
+  %sum.013 = phi i32 [ %add5, %for.body ], [ 0, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i8, i8* %a, i64 %indvars.iv
+  %0 = load i8, i8* %arrayidx, align 1
+  %conv = zext i8 %0 to i32
+  %arrayidx2 = getelementptr inbounds i8, i8* %b, i64 %indvars.iv
+  %1 = load i8, i8* %arrayidx2, align 1
+  %conv3 = zext i8 %1 to i32
+  %conv4 = and i32 %sum.013, 255
+  %add = add nuw nsw i32 %conv, %conv4
+  %add5 = add nuw nsw i32 %add, %conv3
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.cond.for.cond.cleanup_crit_edge, label %for.body
+}
+
+; CHECK-LABEL: @reduction_i16_1
+;
+; short reduction_i16_1(short *a, short *b, int n) {
+;   short sum = 0;
+;   for (int i = 0; i < n; ++i)
+;     sum += (a[i] + b[i]);
+;   return sum;
+; }
+;
+; CHECK: vector.body:
+; CHECK:   phi <8 x i16>
+; CHECK:   load <8 x i16>
+; CHECK:   load <8 x i16>
+; CHECK:   add <8 x i16>
+; CHECK:   add <8 x i16>
+;
+; CHECK: middle.block:
+; CHECK:   [[Rdx:%[a-zA-Z0-9.]+]] = call i16 @llvm.experimental.vector.reduce.add.i16.v8i16(<8 x i16>
+; CHECK:   zext i16 [[Rdx]] to i32
+;
+define i16 @reduction_i16_1(i16* nocapture readonly %a, i16* nocapture readonly %b, i32 %n) {
+entry:
+  %cmp.16 = icmp sgt i32 %n, 0
+  br i1 %cmp.16, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+  br label %for.body
+
+for.cond.for.cond.cleanup_crit_edge:
+  %add5.lcssa = phi i32 [ %add5, %for.body ]
+  %conv6 = trunc i32 %add5.lcssa to i16
+  br label %for.cond.cleanup
+
+for.cond.cleanup:
+  %sum.0.lcssa = phi i16 [ %conv6, %for.cond.for.cond.cleanup_crit_edge ], [ 0, %entry ]
+  ret i16 %sum.0.lcssa
+
+for.body:
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
+  %sum.017 = phi i32 [ %add5, %for.body ], [ 0, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i16, i16* %a, i64 %indvars.iv
+  %0 = load i16, i16* %arrayidx, align 2
+  %conv.14 = zext i16 %0 to i32
+  %arrayidx2 = getelementptr inbounds i16, i16* %b, i64 %indvars.iv
+  %1 = load i16, i16* %arrayidx2, align 2
+  %conv3.15 = zext i16 %1 to i32
+  %conv4.13 = and i32 %sum.017, 65535
+  %add = add nuw nsw i32 %conv.14, %conv4.13
+  %add5 = add nuw nsw i32 %add, %conv3.15
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.cond.for.cond.cleanup_crit_edge, label %for.body
+}
+
+; CHECK-LABEL: @reduction_i16_2
+;
+; short reduction_i16_2(char *a, char *b, int n) {
+;   short sum = 0;
+;   for (int i = 0; i < n; ++i)
+;     sum += (a[i] + b[i]);
+;   return sum;
+; }
+;
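+; Here the inputs are chars but the accumulator is a short, so the expected
+; pattern keeps the loads as <8 x i8> and zero-extends them to <8 x i16>
+; before the adds, with the reduction itself staying in i16.
+;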
+; CHECK: vector.body:
+; CHECK:   phi <8 x i16>
+; CHECK:   [[Ld1:%[a-zA-Z0-9.]+]] = load <8 x i8>
+; CHECK:   zext <8 x i8> [[Ld1]] to <8 x i16>
+; CHECK:   [[Ld2:%[a-zA-Z0-9.]+]] = load <8 x i8>
+; CHECK:   zext <8 x i8> [[Ld2]] to <8 x i16>
+; CHECK:   add <8 x i16>
+; CHECK:   add <8 x i16>
+;
+; CHECK: middle.block:
+; CHECK:   [[Rdx:%[a-zA-Z0-9.]+]] = call i16 @llvm.experimental.vector.reduce.add.i16.v8i16(<8 x i16>
+; CHECK:   zext i16 [[Rdx]] to i32
+;
+define i16 @reduction_i16_2(i8* nocapture readonly %a, i8* nocapture readonly %b, i32 %n) {
+entry:
+  %cmp.14 = icmp sgt i32 %n, 0
+  br i1 %cmp.14, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+  br label %for.body
+
+for.cond.for.cond.cleanup_crit_edge:
+  %add5.lcssa = phi i32 [ %add5, %for.body ]
+  %conv6 = trunc i32 %add5.lcssa to i16
+  br label %for.cond.cleanup
+
+for.cond.cleanup:
+  %sum.0.lcssa = phi i16 [ %conv6, %for.cond.for.cond.cleanup_crit_edge ], [ 0, %entry ]
+  ret i16 %sum.0.lcssa
+
+for.body:
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
+  %sum.015 = phi i32 [ %add5, %for.body ], [ 0, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i8, i8* %a, i64 %indvars.iv
+  %0 = load i8, i8* %arrayidx, align 1
+  %conv = zext i8 %0 to i32
+  %arrayidx2 = getelementptr inbounds i8, i8* %b, i64 %indvars.iv
+  %1 = load i8, i8* %arrayidx2, align 1
+  %conv3 = zext i8 %1 to i32
+  %conv4.13 = and i32 %sum.015, 65535
+  %add = add nuw nsw i32 %conv, %conv4.13
+  %add5 = add nuw nsw i32 %add, %conv3
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.cond.for.cond.cleanup_crit_edge, label %for.body
+}

Added: llvm/trunk/test/Transforms/LoopVectorize/AArch64/sdiv-pow2.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/AArch64/sdiv-pow2.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/AArch64/sdiv-pow2.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/AArch64/sdiv-pow2.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,31 @@
+; RUN: opt < %s  -loop-vectorize -mtriple=aarch64-unknown-linux-gnu -mcpu=cortex-a57 -S | FileCheck %s
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-gnu"
+
+%struct.anon = type { [100 x i32], i32, [100 x i32] }
+
+@Foo = common global %struct.anon zeroinitializer, align 4
+
+; CHECK-LABEL: @foo(
+; CHECK: load <4 x i32>, <4 x i32>*
+; CHECK: sdiv <4 x i32>
+; CHECK: store <4 x i32>
+
+define void @foo(){
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds %struct.anon, %struct.anon* @Foo, i64 0, i32 2, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %div = sdiv i32 %0, 2
+  %arrayidx2 = getelementptr inbounds %struct.anon, %struct.anon* @Foo, i64 0, i32 0, i64 %indvars.iv
+  store i32 %div, i32* %arrayidx2, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 100
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}

Added: llvm/trunk/test/Transforms/LoopVectorize/AArch64/smallest-and-widest-types.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/AArch64/smallest-and-widest-types.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/AArch64/smallest-and-widest-types.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/AArch64/smallest-and-widest-types.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,33 @@
+; REQUIRES: asserts
+; RUN: opt < %s -loop-vectorize -debug-only=loop-vectorize -disable-output 2>&1 | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-gnu"
+
+; CHECK-LABEL: Checking a loop in "interleaved_access"
+; CHECK:         The Smallest and Widest types: 64 / 64 bits
+;
+define void @interleaved_access(i8** %A, i64 %N) {
+for.ph:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ %i.next.3, %for.body ], [ 0, %for.ph ]
+  %tmp0 = getelementptr inbounds i8*, i8** %A, i64 %i
+  store i8* null, i8** %tmp0, align 8
+  %i.next.0 = add nuw nsw i64 %i, 1
+  %tmp1 = getelementptr inbounds i8*, i8** %A, i64 %i.next.0
+  store i8* null, i8** %tmp1, align 8
+  %i.next.1 = add nsw i64 %i, 2
+  %tmp2 = getelementptr inbounds i8*, i8** %A, i64 %i.next.1
+  store i8* null, i8** %tmp2, align 8
+  %i.next.2 = add nsw i64 %i, 3
+  %tmp3 = getelementptr inbounds i8*, i8** %A, i64 %i.next.2
+  store i8* null, i8** %tmp3, align 8
+  %i.next.3 = add nsw i64 %i, 4
+  %cond = icmp slt i64 %i.next.3, %N
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}

Added: llvm/trunk/test/Transforms/LoopVectorize/AArch64/type-shrinkage-insertelt.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/AArch64/type-shrinkage-insertelt.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/AArch64/type-shrinkage-insertelt.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/AArch64/type-shrinkage-insertelt.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,47 @@
+; RUN: opt -S < %s -loop-vectorize -force-vector-width=4 | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-gnu"
+
+; CHECK-LABEL: test0
+define void @test0(i16* noalias %M3) {
+entry:
+  br label %if.then1165.us
+
+if.then1165.us:                                   ; preds = %if.then1165.us, %entry
+  %indvars.iv1783 = phi i64 [ 0, %entry ], [ %indvars.iv.next1784, %if.then1165.us ]
+  %conv1177.us = zext i16 undef to i32
+  %add1178.us = add nsw i32 %conv1177.us, undef
+  %conv1179.us = trunc i32 %add1178.us to i16
+  %idxprom1181.us = ashr exact i64 undef, 32
+  %arrayidx1185.us = getelementptr inbounds i16, i16* %M3, i64 %idxprom1181.us
+  store i16 %conv1179.us, i16* %arrayidx1185.us, align 2
+  %indvars.iv.next1784 = add nuw nsw i64 %indvars.iv1783, 1
+  %exitcond1785 = icmp eq i64 %indvars.iv.next1784, 16
+  br i1 %exitcond1785, label %for.inc1286.loopexit, label %if.then1165.us
+
+for.inc1286.loopexit:                             ; preds = %if.then1165.us
+  ret void
+}
+
+; CHECK-LABEL: test1
+define void @test1(i16* noalias %M3) {
+entry:
+  br label %if.then1165.us
+
+if.then1165.us:                                   ; preds = %if.then1165.us, %entry
+  %indvars.iv1783 = phi i64 [ 0, %entry ], [ %indvars.iv.next1784, %if.then1165.us ]
+  %fptr = load i32, i32* undef, align 4
+  %conv1177.us = zext i16 undef to i32
+  %add1178.us = add nsw i32 %conv1177.us, %fptr
+  %conv1179.us = trunc i32 %add1178.us to i16
+  %idxprom1181.us = ashr exact i64 undef, 32
+  %arrayidx1185.us = getelementptr inbounds i16, i16* %M3, i64 %idxprom1181.us
+  store i16 %conv1179.us, i16* %arrayidx1185.us, align 2
+  %indvars.iv.next1784 = add nuw nsw i64 %indvars.iv1783, 1
+  %exitcond1785 = icmp eq i64 %indvars.iv.next1784, 16
+  br i1 %exitcond1785, label %for.inc1286.loopexit, label %if.then1165.us
+
+for.inc1286.loopexit:                             ; preds = %if.then1165.us
+  ret void
+}

Added: llvm/trunk/test/Transforms/LoopVectorize/AMDGPU/divergent-runtime-check.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/AMDGPU/divergent-runtime-check.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/AMDGPU/divergent-runtime-check.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/AMDGPU/divergent-runtime-check.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,29 @@
+; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -loop-vectorize -simplifycfg < %s | FileCheck -check-prefixes=GCN,GFX9 %s
+; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -loop-vectorize -pass-remarks-analysis='loop-vectorize' < %s 2>&1 | FileCheck -check-prefixes=REMARK %s
+
+; GCN-LABEL: @runtime_check_divergent_target(
+; GCN-NOT: load <2 x half>
+; GCN-NOT: store <2 x half>
+
+; REMARK: remark: <unknown>:0:0: loop not vectorized: runtime pointer checks needed. Not enabled for divergent target
+define amdgpu_kernel void @runtime_check_divergent_target(half addrspace(1)* nocapture %a, half addrspace(1)* nocapture %b) #0 {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds half, half addrspace(1)* %b, i64 %indvars.iv
+  %load = load half, half addrspace(1)* %arrayidx, align 4
+  %mul = fmul half %load, 3.0
+  %arrayidx2 = getelementptr inbounds half, half addrspace(1)* %a, i64 %indvars.iv
+  store half %mul, half addrspace(1)* %arrayidx2, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+attributes #0 = { nounwind }

Added: llvm/trunk/test/Transforms/LoopVectorize/AMDGPU/lit.local.cfg
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/AMDGPU/lit.local.cfg?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/AMDGPU/lit.local.cfg (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/AMDGPU/lit.local.cfg Tue Apr 16 21:52:47 2019
@@ -0,0 +1,2 @@
+if 'AMDGPU' not in config.root.targets:
+    config.unsupported = True

Added: llvm/trunk/test/Transforms/LoopVectorize/AMDGPU/packed-math.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/AMDGPU/packed-math.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/AMDGPU/packed-math.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/AMDGPU/packed-math.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,34 @@
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s  -loop-vectorize -dce -instcombine -S | FileCheck -check-prefix=GFX9 -check-prefix=GCN %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s  -loop-vectorize -dce -instcombine -S | FileCheck -check-prefix=CIVI -check-prefix=GCN %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii < %s  -loop-vectorize -dce -instcombine -S | FileCheck -check-prefix=CIVI -check-prefix=GCN %s
+
+; GCN-LABEL: @vectorize_v2f16_loop(
+; GFX9: vector.body:
+; GFX9: phi <2 x half>
+; GFX9: load <2 x half>
+; GFX9: fadd fast <2 x half>
+
+; GFX9: middle.block:
+; GFX9: fadd fast <2 x half>
+
+; VI: phi half
+; VI: load half
+; VI: fadd fast half
+define half @vectorize_v2f16_loop(half addrspace(1)* noalias %s) {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %q.04 = phi half [ 0.0, %entry ], [ %add, %for.body ]
+  %arrayidx = getelementptr inbounds half, half addrspace(1)* %s, i64 %indvars.iv
+  %0 = load half, half addrspace(1)* %arrayidx, align 2
+  %add = fadd fast half %q.04, %0
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 256
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  %add.lcssa = phi half [ %add, %for.body ]
+  ret half %add.lcssa
+}

Added: llvm/trunk/test/Transforms/LoopVectorize/AMDGPU/unroll-in-loop-vectorizer.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/AMDGPU/unroll-in-loop-vectorizer.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/AMDGPU/unroll-in-loop-vectorizer.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/AMDGPU/unroll-in-loop-vectorizer.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,28 @@
+; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=fiji -loop-vectorize < %s | FileCheck %s
+
+
+; For AMDGPU, loop unrolling in the loop vectorizer is disabled when VF==1.
+;
+; CHECK-LABEL: @small_loop(
+; CHECK: store i32
+; CHECK-NOT: store i32
+; CHECK: ret
+define amdgpu_kernel void @small_loop(i32* nocapture %inArray, i32 %size) nounwind {
+entry:
+  %0 = icmp sgt i32 %size, 0
+  br i1 %0, label %loop, label %exit
+
+loop:                                          ; preds = %entry, %loop
+  %iv = phi i32 [ %iv1, %loop ], [ 0, %entry ]
+  %1 = getelementptr inbounds i32, i32* %inArray, i32 %iv
+  %2 = load i32, i32* %1, align 4
+  %3 = add nsw i32 %2, 6
+  store i32 %3, i32* %1, align 4
+  %iv1 = add i32 %iv, 1
+;  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %cond = icmp eq i32 %iv1, %size
+  br i1 %cond, label %exit, label %loop
+
+exit:                                         ; preds = %loop, %entry
+  ret void
+}

Added: llvm/trunk/test/Transforms/LoopVectorize/ARM/arm-ieee-vectorize.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/ARM/arm-ieee-vectorize.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/ARM/arm-ieee-vectorize.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/ARM/arm-ieee-vectorize.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,330 @@
+; RUN: opt -mtriple armv7-linux-gnueabihf -loop-vectorize -S %s -debug-only=loop-vectorize -o /dev/null 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=LINUX
+; RUN: opt -mtriple armv8-linux-gnu -loop-vectorize -S %s -debug-only=loop-vectorize -o /dev/null 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=LINUX
+; RUN: opt -mtriple armv7-unknown-darwin -loop-vectorize -S %s -debug-only=loop-vectorize -o /dev/null 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=DARWIN
+; REQUIRES: asserts
+
+; Testing the ability of the loop vectorizer to tell when SIMD is safe or not
+; regarding the IEEE 754 standard.
+; On Linux, we only want the vectorizer to work when the -ffast-math flag is set,
+; because NEON is not IEEE compliant.
+; Darwin, on the other hand, doesn't support subnormals, and all optimizations
+; are allowed, even without -ffast-math.
+
+; Integer loops are always vectorizeable
+; CHECK: Checking a loop in "sumi"
+; CHECK: We can vectorize this loop!
+define void @sumi(i32* noalias nocapture readonly %A, i32* noalias nocapture readonly %B, i32* noalias nocapture %C, i32 %N) {
+entry:
+  %cmp5 = icmp eq i32 %N, 0
+  br i1 %cmp5, label %for.end, label %for.body.preheader
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %i.06 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i.06
+  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds i32, i32* %B, i32 %i.06
+  %1 = load i32, i32* %arrayidx1, align 4
+  %mul = mul nsw i32 %1, %0
+  %arrayidx2 = getelementptr inbounds i32, i32* %C, i32 %i.06
+  store i32 %mul, i32* %arrayidx2, align 4
+  %inc = add nuw nsw i32 %i.06, 1
+  %exitcond = icmp eq i32 %inc, %N
+  br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:                                 ; preds = %for.body
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %entry
+  ret void
+}
+
+; Floating-point loops need fast-math to be vectorizeable
+; LINUX: Checking a loop in "sumf"
+; LINUX: Potentially unsafe FP op prevents vectorization
+; DARWIN: Checking a loop in "sumf"
+; DARWIN: We can vectorize this loop!
+define void @sumf(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float* noalias nocapture %C, i32 %N) {
+entry:
+  %cmp5 = icmp eq i32 %N, 0
+  br i1 %cmp5, label %for.end, label %for.body.preheader
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %i.06 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds float, float* %A, i32 %i.06
+  %0 = load float, float* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds float, float* %B, i32 %i.06
+  %1 = load float, float* %arrayidx1, align 4
+  %mul = fmul float %0, %1
+  %arrayidx2 = getelementptr inbounds float, float* %C, i32 %i.06
+  store float %mul, float* %arrayidx2, align 4
+  %inc = add nuw nsw i32 %i.06, 1
+  %exitcond = icmp eq i32 %inc, %N
+  br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:                                 ; preds = %for.body
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %entry
+  ret void
+}
+
+; Integer loops are always vectorizeable
+; CHECK: Checking a loop in "redi"
+; CHECK: We can vectorize this loop!
+define i32 @redi(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32 %N) {
+entry:
+  %cmp5 = icmp eq i32 %N, 0
+  br i1 %cmp5, label %for.end, label %for.body.preheader
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+  %Red.06 = phi i32 [ %add, %for.body ], [ undef, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i32 %i.07
+  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds i32, i32* %b, i32 %i.07
+  %1 = load i32, i32* %arrayidx1, align 4
+  %mul = mul nsw i32 %1, %0
+  %add = add nsw i32 %mul, %Red.06
+  %inc = add nuw nsw i32 %i.07, 1
+  %exitcond = icmp eq i32 %inc, %N
+  br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:                                 ; preds = %for.body
+  %add.lcssa = phi i32 [ %add, %for.body ]
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %entry
+  %Red.0.lcssa = phi i32 [ undef, %entry ], [ %add.lcssa, %for.end.loopexit ]
+  ret i32 %Red.0.lcssa
+}
+
+; Floating-point loops need fast-math to be vectorizeable
+; LINUX: Checking a loop in "redf"
+; LINUX: Potentially unsafe FP op prevents vectorization
+; DARWIN: Checking a loop in "redf"
+; DARWIN: We can vectorize this loop!
+define float @redf(float* noalias nocapture readonly %a, float* noalias nocapture readonly %b, i32 %N) {
+entry:
+  %cmp5 = icmp eq i32 %N, 0
+  br i1 %cmp5, label %for.end, label %for.body.preheader
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+  %Red.06 = phi float [ %add, %for.body ], [ undef, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds float, float* %a, i32 %i.07
+  %0 = load float, float* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds float, float* %b, i32 %i.07
+  %1 = load float, float* %arrayidx1, align 4
+  %mul = fmul float %0, %1
+  %add = fadd float %Red.06, %mul
+  %inc = add nuw nsw i32 %i.07, 1
+  %exitcond = icmp eq i32 %inc, %N
+  br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:                                 ; preds = %for.body
+  %add.lcssa = phi float [ %add, %for.body ]
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %entry
+  %Red.0.lcssa = phi float [ undef, %entry ], [ %add.lcssa, %for.end.loopexit ]
+  ret float %Red.0.lcssa
+}
+
+; Make sure calls that turn into builtins are also covered
+; LINUX: Checking a loop in "fabs"
+; LINUX: Potentially unsafe FP op prevents vectorization
+; DARWIN: Checking a loop in "fabs"
+; DARWIN: We can vectorize this loop!
+define void @fabs(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float* noalias nocapture %C, i32 %N) {
+entry:
+  %cmp10 = icmp eq i32 %N, 0
+  br i1 %cmp10, label %for.end, label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.011 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds float, float* %A, i32 %i.011
+  %0 = load float, float* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds float, float* %B, i32 %i.011
+  %1 = load float, float* %arrayidx1, align 4
+  %fabsf = tail call float @fabsf(float %1) #1
+  %conv3 = fmul float %0, %fabsf
+  %arrayidx4 = getelementptr inbounds float, float* %C, i32 %i.011
+  store float %conv3, float* %arrayidx4, align 4
+  %inc = add nuw nsw i32 %i.011, 1
+  %exitcond = icmp eq i32 %inc, %N
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+; Integer loops are always vectorizeable
+; CHECK: Checking a loop in "sumi_fast"
+; CHECK: We can vectorize this loop!
+define void @sumi_fast(i32* noalias nocapture readonly %A, i32* noalias nocapture readonly %B, i32* noalias nocapture %C, i32 %N) {
+entry:
+  %cmp5 = icmp eq i32 %N, 0
+  br i1 %cmp5, label %for.end, label %for.body.preheader
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %i.06 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i.06
+  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds i32, i32* %B, i32 %i.06
+  %1 = load i32, i32* %arrayidx1, align 4
+  %mul = mul nsw i32 %1, %0
+  %arrayidx2 = getelementptr inbounds i32, i32* %C, i32 %i.06
+  store i32 %mul, i32* %arrayidx2, align 4
+  %inc = add nuw nsw i32 %i.06, 1
+  %exitcond = icmp eq i32 %inc, %N
+  br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:                                 ; preds = %for.body
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %entry
+  ret void
+}
+
+; Floating-point loops can be vectorizeable with fast-math
+; CHECK: Checking a loop in "sumf_fast"
+; CHECK: We can vectorize this loop!
+define void @sumf_fast(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float* noalias nocapture %C, i32 %N) {
+entry:
+  %cmp5 = icmp eq i32 %N, 0
+  br i1 %cmp5, label %for.end, label %for.body.preheader
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %i.06 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds float, float* %A, i32 %i.06
+  %0 = load float, float* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds float, float* %B, i32 %i.06
+  %1 = load float, float* %arrayidx1, align 4
+  %mul = fmul fast float %1, %0
+  %arrayidx2 = getelementptr inbounds float, float* %C, i32 %i.06
+  store float %mul, float* %arrayidx2, align 4
+  %inc = add nuw nsw i32 %i.06, 1
+  %exitcond = icmp eq i32 %inc, %N
+  br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:                                 ; preds = %for.body
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %entry
+  ret void
+}
+
+; Integer loops are always vectorizeable
+; CHECK: Checking a loop in "redi_fast"
+; CHECK: We can vectorize this loop!
+define i32 @redi_fast(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32 %N) {
+entry:
+  %cmp5 = icmp eq i32 %N, 0
+  br i1 %cmp5, label %for.end, label %for.body.preheader
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+  %Red.06 = phi i32 [ %add, %for.body ], [ undef, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i32 %i.07
+  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds i32, i32* %b, i32 %i.07
+  %1 = load i32, i32* %arrayidx1, align 4
+  %mul = mul nsw i32 %1, %0
+  %add = add nsw i32 %mul, %Red.06
+  %inc = add nuw nsw i32 %i.07, 1
+  %exitcond = icmp eq i32 %inc, %N
+  br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:                                 ; preds = %for.body
+  %add.lcssa = phi i32 [ %add, %for.body ]
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %entry
+  %Red.0.lcssa = phi i32 [ undef, %entry ], [ %add.lcssa, %for.end.loopexit ]
+  ret i32 %Red.0.lcssa
+}
+
+; Floating-point loops can be vectorizeable with fast-math
+; CHECK: Checking a loop in "redf_fast"
+; CHECK: We can vectorize this loop!
+define float @redf_fast(float* noalias nocapture readonly %a, float* noalias nocapture readonly %b, i32 %N) {
+entry:
+  %cmp5 = icmp eq i32 %N, 0
+  br i1 %cmp5, label %for.end, label %for.body.preheader
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+  %Red.06 = phi float [ %add, %for.body ], [ undef, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds float, float* %a, i32 %i.07
+  %0 = load float, float* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds float, float* %b, i32 %i.07
+  %1 = load float, float* %arrayidx1, align 4
+  %mul = fmul fast float %1, %0
+  %add = fadd fast float %mul, %Red.06
+  %inc = add nuw nsw i32 %i.07, 1
+  %exitcond = icmp eq i32 %inc, %N
+  br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:                                 ; preds = %for.body
+  %add.lcssa = phi float [ %add, %for.body ]
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %entry
+  %Red.0.lcssa = phi float [ undef, %entry ], [ %add.lcssa, %for.end.loopexit ]
+  ret float %Red.0.lcssa
+}
+
+; Make sure calls that turn into builtins are also covered
+; CHECK: Checking a loop in "fabs_fast"
+; CHECK: We can vectorize this loop!
+define void @fabs_fast(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float* noalias nocapture %C, i32 %N) {
+entry:
+  %cmp10 = icmp eq i32 %N, 0
+  br i1 %cmp10, label %for.end, label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.011 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds float, float* %A, i32 %i.011
+  %0 = load float, float* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds float, float* %B, i32 %i.011
+  %1 = load float, float* %arrayidx1, align 4
+  %fabsf = tail call fast float @fabsf(float %1) #2
+  %conv3 = fmul fast float %fabsf, %0
+  %arrayidx4 = getelementptr inbounds float, float* %C, i32 %i.011
+  store float %conv3, float* %arrayidx4, align 4
+  %inc = add nuw nsw i32 %i.011, 1
+  %exitcond = icmp eq i32 %inc, %N
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+declare float @fabsf(float)
+
+attributes #1 = { nounwind readnone "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a8" "target-features"="+dsp,+neon,+vfp3" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nounwind readnone "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a8" "target-features"="+dsp,+neon,+vfp3" "unsafe-fp-math"="true" "use-soft-float"="false" }

Added: llvm/trunk/test/Transforms/LoopVectorize/ARM/arm-unroll.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/ARM/arm-unroll.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/ARM/arm-unroll.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/ARM/arm-unroll.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,71 @@
+; RUN: opt < %s  -loop-vectorize -mtriple=thumbv7-apple-ios3.0.0 -S | FileCheck %s
+; RUN: opt < %s  -loop-vectorize -mtriple=thumbv7-apple-ios3.0.0 -mcpu=swift -S | FileCheck %s --check-prefix=SWIFT
+; RUN: opt < %s  -loop-vectorize -force-vector-width=1 -mtriple=thumbv7-apple-ios3.0.0 -mcpu=swift -S | FileCheck %s --check-prefix=SWIFTUNROLL
+
+target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32"
+target triple = "thumbv7-apple-ios3.0.0"
+
+;CHECK-LABEL: @foo(
+;CHECK: load <4 x i32>
+;CHECK-NOT: load <4 x i32>
+;CHECK: ret
+;SWIFT-LABEL: @foo(
+;SWIFT: load <4 x i32>
+;SWIFT: load <4 x i32>
+;SWIFT: ret
+define i32 @foo(i32* nocapture %A, i32 %n) nounwind readonly ssp {
+  %1 = icmp sgt i32 %n, 0
+  br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph:                                           ; preds = %0, %.lr.ph
+  %i.02 = phi i32 [ %5, %.lr.ph ], [ 0, %0 ]
+  %sum.01 = phi i32 [ %4, %.lr.ph ], [ 0, %0 ]
+  %2 = getelementptr inbounds i32, i32* %A, i32 %i.02
+  %3 = load i32, i32* %2, align 4
+  %4 = add nsw i32 %3, %sum.01
+  %5 = add nsw i32 %i.02, 1
+  %exitcond = icmp eq i32 %5, %n
+  br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge:                                      ; preds = %.lr.ph, %0
+  %sum.0.lcssa = phi i32 [ 0, %0 ], [ %4, %.lr.ph ]
+  ret i32 %sum.0.lcssa
+}
+
+; Verify the register limit. On arm we don't have 16 allocatable registers.
+;SWIFTUNROLL-LABEL: @register_limit(
+;SWIFTUNROLL: load i32
+;SWIFTUNROLL-NOT: load i32
+define i32 @register_limit(i32* nocapture %A, i32 %n) {
+  %1 = icmp sgt i32 %n, 0
+  br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph:
+  %i.02 = phi i32 [ %5, %.lr.ph ], [ 0, %0 ]
+  %sum.01 = phi i32 [ %4, %.lr.ph ], [ 0, %0 ]
+  %sum.02 = phi i32 [ %6, %.lr.ph ], [ 0, %0 ]
+  %sum.03 = phi i32 [ %7, %.lr.ph ], [ 0, %0 ]
+  %sum.04 = phi i32 [ %8, %.lr.ph ], [ 0, %0 ]
+  %sum.05 = phi i32 [ %9, %.lr.ph ], [ 0, %0 ]
+  %sum.06 = phi i32 [ %10, %.lr.ph ], [ 0, %0 ]
+  %2 = getelementptr inbounds i32, i32* %A, i32 %i.02
+  %3 = load i32, i32* %2, align 4
+  %4 = add nsw i32 %3, %sum.01
+  %5 = add nsw i32 %i.02, 1
+  %6 = add nsw i32 %3, %sum.02
+  %7 = add nsw i32 %3, %sum.03
+  %8 = add nsw i32 %3, %sum.04
+  %9 = add nsw i32 %3, %sum.05
+  %10 = add nsw i32 %3, %sum.05
+  %exitcond = icmp eq i32 %5, %n
+  br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge:                                      ; preds = %.lr.ph, %0
+  %sum.0.lcssa = phi i32 [ 0, %0 ], [ %4, %.lr.ph ]
+  %sum.1.lcssa = phi i32 [ 0, %0 ], [ %6, %.lr.ph ]
+  %sum.2.lcssa = phi i32 [ 0, %0 ], [ %7, %.lr.ph ]
+  %sum.4.lcssa = phi i32 [ 0, %0 ], [ %8, %.lr.ph ]
+  %sum.5.lcssa = phi i32 [ 0, %0 ], [ %9, %.lr.ph ]
+  %sum.6.lcssa = phi i32 [ 0, %0 ], [ %10, %.lr.ph ]
+  ret i32 %sum.0.lcssa
+}

Added: llvm/trunk/test/Transforms/LoopVectorize/ARM/gather-cost.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/ARM/gather-cost.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/ARM/gather-cost.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/ARM/gather-cost.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,88 @@
+; RUN: opt -loop-vectorize -mtriple=thumbv7s-apple-ios6.0.0 -S -enable-interleaved-mem-accesses=false < %s | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32"
+
+@kernel = global [512 x float] zeroinitializer, align 4
+@kernel2 = global [512 x float] zeroinitializer, align 4
+@kernel3 = global [512 x float] zeroinitializer, align 4
+@kernel4 = global [512 x float] zeroinitializer, align 4
+@src_data = global [1536 x float] zeroinitializer, align 4
+@r_ = global i8 0, align 4
+@g_ = global i8 0, align 4
+@b_ = global i8 0, align 4
+
+; We don't want to vectorize most loops containing gathers because they are
+; expensive. This function represents a point where vectorization starts to
+; become beneficial.
+; Make sure we are conservative and don't vectorize it.
+; CHECK-NOT: <2 x float>
+; CHECK-NOT: <4 x float>
+
+define void @_Z4testmm(i32 %size, i32 %offset) {
+entry:
+  %cmp53 = icmp eq i32 %size, 0
+  br i1 %cmp53, label %for.end, label %for.body.lr.ph
+
+for.body.lr.ph:
+  br label %for.body
+
+for.body:
+  %r.057 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add10, %for.body ]
+  %g.056 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add20, %for.body ]
+  %v.055 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
+  %b.054 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add30, %for.body ]
+  %add = add i32 %v.055, %offset
+  %mul = mul i32 %add, 3
+  %arrayidx = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i32 0, i32 %mul
+  %0 = load float, float* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds [512 x float], [512 x float]* @kernel, i32 0, i32 %v.055
+  %1 = load float, float* %arrayidx2, align 4
+  %mul3 = fmul fast float %0, %1
+  %arrayidx4 = getelementptr inbounds [512 x float], [512 x float]* @kernel2, i32 0, i32 %v.055
+  %2 = load float, float* %arrayidx4, align 4
+  %mul5 = fmul fast float %mul3, %2
+  %arrayidx6 = getelementptr inbounds [512 x float], [512 x float]* @kernel3, i32 0, i32 %v.055
+  %3 = load float, float* %arrayidx6, align 4
+  %mul7 = fmul fast float %mul5, %3
+  %arrayidx8 = getelementptr inbounds [512 x float], [512 x float]* @kernel4, i32 0, i32 %v.055
+  %4 = load float, float* %arrayidx8, align 4
+  %mul9 = fmul fast float %mul7, %4
+  %add10 = fadd fast float %r.057, %mul9
+  %arrayidx.sum = add i32 %mul, 1
+  %arrayidx11 = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i32 0, i32 %arrayidx.sum
+  %5 = load float, float* %arrayidx11, align 4
+  %mul13 = fmul fast float %1, %5
+  %mul15 = fmul fast float %2, %mul13
+  %mul17 = fmul fast float %3, %mul15
+  %mul19 = fmul fast float %4, %mul17
+  %add20 = fadd fast float %g.056, %mul19
+  %arrayidx.sum52 = add i32 %mul, 2
+  %arrayidx21 = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i32 0, i32 %arrayidx.sum52
+  %6 = load float, float* %arrayidx21, align 4
+  %mul23 = fmul fast float %1, %6
+  %mul25 = fmul fast float %2, %mul23
+  %mul27 = fmul fast float %3, %mul25
+  %mul29 = fmul fast float %4, %mul27
+  %add30 = fadd fast float %b.054, %mul29
+  %inc = add i32 %v.055, 1
+  %exitcond = icmp ne i32 %inc, %size
+  br i1 %exitcond, label %for.body, label %for.cond.for.end_crit_edge
+
+for.cond.for.end_crit_edge:
+  %add30.lcssa = phi float [ %add30, %for.body ]
+  %add20.lcssa = phi float [ %add20, %for.body ]
+  %add10.lcssa = phi float [ %add10, %for.body ]
+  %phitmp = fptoui float %add10.lcssa to i8
+  %phitmp60 = fptoui float %add20.lcssa to i8
+  %phitmp61 = fptoui float %add30.lcssa to i8
+  br label %for.end
+
+for.end:
+  %r.0.lcssa = phi i8 [ %phitmp, %for.cond.for.end_crit_edge ], [ 0, %entry ]
+  %g.0.lcssa = phi i8 [ %phitmp60, %for.cond.for.end_crit_edge ], [ 0, %entry ]
+  %b.0.lcssa = phi i8 [ %phitmp61, %for.cond.for.end_crit_edge ], [ 0, %entry ]
+  store i8 %r.0.lcssa, i8* @r_, align 4
+  store i8 %g.0.lcssa, i8* @g_, align 4
+  store i8 %b.0.lcssa, i8* @b_, align 4
+  ret void
+}
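
For readers skimming the IR, the loop above corresponds roughly to the following
C (a hypothetical sketch, not part of the test). The three stride-3 loads from
src_data are the gather-like accesses the ARM cost model is expected to price
high enough to keep the loop scalar.

  /* Hypothetical C source for @_Z4testmm (illustration only). */
  extern float kernel[512], kernel2[512], kernel3[512], kernel4[512];
  extern float src_data[1536];
  extern unsigned char r_, g_, b_;

  void test(unsigned size, unsigned offset) {
    float r = 0.0f, g = 0.0f, b = 0.0f;
    for (unsigned v = 0; v < size; ++v) {
      unsigned i = (v + offset) * 3;     /* stride-3 base index */
      float k = kernel[v] * kernel2[v] * kernel3[v] * kernel4[v];
      r += src_data[i + 0] * k;          /* strided loads act like gathers */
      g += src_data[i + 1] * k;
      b += src_data[i + 2] * k;
    }
    r_ = (unsigned char)r;
    g_ = (unsigned char)g;
    b_ = (unsigned char)b;
  }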

Added: llvm/trunk/test/Transforms/LoopVectorize/ARM/gcc-examples.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/ARM/gcc-examples.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/ARM/gcc-examples.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/ARM/gcc-examples.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,60 @@
+; RUN: opt < %s  -loop-vectorize -mtriple=thumbv7-apple-ios3.0.0 -mcpu=swift -S -dce | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32"
+target triple = "thumbv7-apple-ios3.0.0"
+
+@b = common global [2048 x i32] zeroinitializer, align 16
+@c = common global [2048 x i32] zeroinitializer, align 16
+@a = common global [2048 x i32] zeroinitializer, align 16
+
+; Select VF = 4; the checks below expect <4 x i32> operations (C equivalents of
+; these loops follow the file).
+;CHECK-LABEL: @example1(
+;CHECK: load <4 x i32>
+;CHECK: add nsw <4 x i32>
+;CHECK: store <4 x i32>
+;CHECK: ret void
+define void @example1() nounwind uwtable ssp {
+  br label %1
+
+; <label>:1                                       ; preds = %1, %0
+  %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ]
+  %2 = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 %indvars.iv
+  %3 = load i32, i32* %2, align 4
+  %4 = getelementptr inbounds [2048 x i32], [2048 x i32]* @c, i64 0, i64 %indvars.iv
+  %5 = load i32, i32* %4, align 4
+  %6 = add nsw i32 %5, %3
+  %7 = getelementptr inbounds [2048 x i32], [2048 x i32]* @a, i64 0, i64 %indvars.iv
+  store i32 %6, i32* %7, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 256
+  br i1 %exitcond, label %8, label %1
+
+; <label>:8                                       ; preds = %1
+  ret void
+}
+
+;CHECK-LABEL: @example10b(
+;CHECK: load <4 x i16>
+;CHECK: sext <4 x i16>
+;CHECK: store <4 x i32>
+;CHECK: ret void
+define void @example10b(i16* noalias nocapture %sa, i16* noalias nocapture %sb, i16* noalias nocapture %sc, i32* noalias nocapture %ia, i32* noalias nocapture %ib, i32* noalias nocapture %ic) nounwind uwtable ssp {
+  br label %1
+
+; <label>:1                                       ; preds = %1, %0
+  %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ]
+  %2 = getelementptr inbounds i16, i16* %sb, i64 %indvars.iv
+  %3 = load i16, i16* %2, align 2
+  %4 = sext i16 %3 to i32
+  %5 = getelementptr inbounds i32, i32* %ia, i64 %indvars.iv
+  store i32 %4, i32* %5, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 1024
+  br i1 %exitcond, label %6, label %1
+
+; <label>:6                                       ; preds = %1
+  ret void
+}
+
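
In C terms, example1 above is the classic textbook loop and example10b is the
same pattern with an i16 load sign-extended into an i32 store (hypothetical
source, for illustration only):

  /* Hypothetical C source for @example1 and @example10b (illustration only). */
  extern int a[2048], b[2048], c[2048];

  void example1(void) {
    for (int i = 0; i < 256; ++i)
      a[i] = b[i] + c[i];
  }

  void example10b(short *sb, int *ia) {
    for (long i = 0; i < 1024; ++i)
      ia[i] = sb[i];                     /* load i16, sext, store i32 */
  }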

Added: llvm/trunk/test/Transforms/LoopVectorize/ARM/interleaved_cost.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/ARM/interleaved_cost.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/ARM/interleaved_cost.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/ARM/interleaved_cost.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,147 @@
+; RUN: opt -loop-vectorize -force-vector-width=2 -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_2
+; RUN: opt -loop-vectorize -force-vector-width=4 -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_4
+; RUN: opt -loop-vectorize -force-vector-width=8 -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_8
+; RUN: opt -loop-vectorize -force-vector-width=16 -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_16
+; REQUIRES: asserts
+
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+target triple = "armv8--linux-gnueabihf"
+
+%i8.2 = type {i8, i8}
+define void @i8_factor_2(%i8.2* %data, i64 %n) {
+entry:
+  br label %for.body
+
+; VF_8-LABEL:  Checking a loop in "i8_factor_2"
+; VF_8:          Found an estimated cost of 2 for VF 8 For instruction: %tmp2 = load i8, i8* %tmp0, align 1
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i8, i8* %tmp1, align 1
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i8 0, i8* %tmp0, align 1
+; VF_8-NEXT:     Found an estimated cost of 2 for VF 8 For instruction: store i8 0, i8* %tmp1, align 1
+; VF_16-LABEL: Checking a loop in "i8_factor_2"
+; VF_16:         Found an estimated cost of 2 for VF 16 For instruction: %tmp2 = load i8, i8* %tmp0, align 1
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i8, i8* %tmp1, align 1
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i8 0, i8* %tmp0, align 1
+; VF_16-NEXT:    Found an estimated cost of 2 for VF 16 For instruction: store i8 0, i8* %tmp1, align 1
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %tmp0 = getelementptr inbounds %i8.2, %i8.2* %data, i64 %i, i32 0
+  %tmp1 = getelementptr inbounds %i8.2, %i8.2* %data, i64 %i, i32 1
+  %tmp2 = load i8, i8* %tmp0, align 1
+  %tmp3 = load i8, i8* %tmp1, align 1
+  store i8 0, i8* %tmp0, align 1
+  store i8 0, i8* %tmp1, align 1
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
+
+%i16.2 = type {i16, i16}
+define void @i16_factor_2(%i16.2* %data, i64 %n) {
+entry:
+  br label %for.body
+
+; VF_4-LABEL:  Checking a loop in "i16_factor_2"
+; VF_4:          Found an estimated cost of 2 for VF 4 For instruction: %tmp2 = load i16, i16* %tmp0, align 2
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i16, i16* %tmp1, align 2
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store i16 0, i16* %tmp0, align 2
+; VF_4-NEXT:     Found an estimated cost of 2 for VF 4 For instruction: store i16 0, i16* %tmp1, align 2
+; VF_8-LABEL:  Checking a loop in "i16_factor_2"
+; VF_8:          Found an estimated cost of 2 for VF 8 For instruction: %tmp2 = load i16, i16* %tmp0, align 2
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i16, i16* %tmp1, align 2
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i16 0, i16* %tmp0, align 2
+; VF_8-NEXT:     Found an estimated cost of 2 for VF 8 For instruction: store i16 0, i16* %tmp1, align 2
+; VF_16-LABEL: Checking a loop in "i16_factor_2"
+; VF_16:         Found an estimated cost of 4 for VF 16 For instruction: %tmp2 = load i16, i16* %tmp0, align 2
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i16, i16* %tmp1, align 2
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i16 0, i16* %tmp0, align 2
+; VF_16-NEXT:    Found an estimated cost of 4 for VF 16 For instruction: store i16 0, i16* %tmp1, align 2
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %tmp0 = getelementptr inbounds %i16.2, %i16.2* %data, i64 %i, i32 0
+  %tmp1 = getelementptr inbounds %i16.2, %i16.2* %data, i64 %i, i32 1
+  %tmp2 = load i16, i16* %tmp0, align 2
+  %tmp3 = load i16, i16* %tmp1, align 2
+  store i16 0, i16* %tmp0, align 2
+  store i16 0, i16* %tmp1, align 2
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
+
+%i32.2 = type {i32, i32}
+define void @i32_factor_2(%i32.2* %data, i64 %n) {
+entry:
+  br label %for.body
+
+; VF_2-LABEL:  Checking a loop in "i32_factor_2"
+; VF_2:          Found an estimated cost of 2 for VF 2 For instruction: %tmp2 = load i32, i32* %tmp0, align 4
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load i32, i32* %tmp1, align 4
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store i32 0, i32* %tmp0, align 4
+; VF_2-NEXT:     Found an estimated cost of 2 for VF 2 For instruction: store i32 0, i32* %tmp1, align 4
+; VF_4-LABEL:  Checking a loop in "i32_factor_2"
+; VF_4:          Found an estimated cost of 2 for VF 4 For instruction: %tmp2 = load i32, i32* %tmp0, align 4
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i32, i32* %tmp1, align 4
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store i32 0, i32* %tmp0, align 4
+; VF_4-NEXT:     Found an estimated cost of 2 for VF 4 For instruction: store i32 0, i32* %tmp1, align 4
+; VF_8-LABEL:  Checking a loop in "i32_factor_2"
+; VF_8:          Found an estimated cost of 4 for VF 8 For instruction: %tmp2 = load i32, i32* %tmp0, align 4
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i32, i32* %tmp1, align 4
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i32 0, i32* %tmp0, align 4
+; VF_8-NEXT:     Found an estimated cost of 4 for VF 8 For instruction: store i32 0, i32* %tmp1, align 4
+; VF_16-LABEL: Checking a loop in "i32_factor_2"
+; VF_16:         Found an estimated cost of 8 for VF 16 For instruction: %tmp2 = load i32, i32* %tmp0, align 4
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i32, i32* %tmp1, align 4
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i32 0, i32* %tmp0, align 4
+; VF_16-NEXT:    Found an estimated cost of 8 for VF 16 For instruction: store i32 0, i32* %tmp1, align 4
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %tmp0 = getelementptr inbounds %i32.2, %i32.2* %data, i64 %i, i32 0
+  %tmp1 = getelementptr inbounds %i32.2, %i32.2* %data, i64 %i, i32 1
+  %tmp2 = load i32, i32* %tmp0, align 4
+  %tmp3 = load i32, i32* %tmp1, align 4
+  store i32 0, i32* %tmp0, align 4
+  store i32 0, i32* %tmp1, align 4
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
+
+%half.2 = type {half, half}
+define void @half_factor_2(%half.2* %data, i64 %n) {
+entry:
+  br label %for.body
+
+; VF_4-LABEL: Checking a loop in "half_factor_2"
+; VF_4:         Found an estimated cost of 40 for VF 4 For instruction: %tmp2 = load half, half* %tmp0, align 2
+; VF_4-NEXT:    Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load half, half* %tmp1, align 2
+; VF_4-NEXT:    Found an estimated cost of 0 for VF 4 For instruction: store half 0xH0000, half* %tmp0, align 2
+; VF_4-NEXT:    Found an estimated cost of 32 for VF 4 For instruction: store half 0xH0000, half* %tmp1, align 2
+; VF_8-LABEL: Checking a loop in "half_factor_2"
+; VF_8:         Found an estimated cost of 80 for VF 8 For instruction: %tmp2 = load half, half* %tmp0, align 2
+; VF_8-NEXT:    Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load half, half* %tmp1, align 2
+; VF_8-NEXT:    Found an estimated cost of 0 for VF 8 For instruction: store half 0xH0000, half* %tmp0, align 2
+; VF_8-NEXT:    Found an estimated cost of 64 for VF 8 For instruction: store half 0xH0000, half* %tmp1, align 2
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %tmp0 = getelementptr inbounds %half.2, %half.2* %data, i64 %i, i32 0
+  %tmp1 = getelementptr inbounds %half.2, %half.2* %data, i64 %i, i32 1
+  %tmp2 = load half, half* %tmp0, align 2
+  %tmp3 = load half, half* %tmp1, align 2
+  store half 0., half* %tmp0, align 2
+  store half 0., half* %tmp1, align 2
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
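
Each function above exercises the same factor-2 interleaved access at a
different element type; in C it is roughly the following (hypothetical sketch,
not part of the test). The vectorizer folds the two loads into one interleave
group and the two stores into another, so the debug output charges the whole
group's cost to a single member and 0 to the rest.

  /* Hypothetical C equivalent of the i8_factor_2 loop (illustration only). */
  struct pair_i8 { char a, b; };

  void i8_factor_2(struct pair_i8 *data, long n) {
    for (long i = 0; i < n; ++i) {
      char x = data[i].a;                /* loads form one interleave group */
      char y = data[i].b;
      data[i].a = 0;                     /* stores form another group */
      data[i].b = 0;
      (void)x;
      (void)y;
    }
  }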

Added: llvm/trunk/test/Transforms/LoopVectorize/ARM/lit.local.cfg
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/ARM/lit.local.cfg?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/ARM/lit.local.cfg (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/ARM/lit.local.cfg Tue Apr 16 21:52:47 2019
@@ -0,0 +1,3 @@
+if 'ARM' not in config.root.targets:
+    config.unsupported = True
+

Added: llvm/trunk/test/Transforms/LoopVectorize/ARM/mul-cast-vect.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/ARM/mul-cast-vect.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/ARM/mul-cast-vect.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/ARM/mul-cast-vect.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,114 @@
+; RUN: opt < %s  -cost-model -analyze -mtriple=armv7-linux-gnueabihf -mcpu=cortex-a9 | FileCheck --check-prefix=COST %s
+; To see the assembly output: llc -mcpu=cortex-a9 < %s | FileCheck --check-prefix=ASM %s
+; ASM lines below are for reference only; tests in that direction should go to
+; test/CodeGen/ARM (a C-level sketch of the widening-multiply pattern follows this file).
+
+; ModuleID = 'arm.ll'
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:64:128-a0:0:64-n32-S64"
+target triple = "armv7--linux-gnueabihf"
+
+%T216 = type <2 x i16>
+%T232 = type <2 x i32>
+%T264 = type <2 x i64>
+
+%T416 = type <4 x i16>
+%T432 = type <4 x i32>
+%T464 = type <4 x i64>
+
+define void @direct(%T432* %loadaddr, %T432* %loadaddr2, %T432* %storeaddr) {
+; COST: function 'direct':
+  %v0 = load %T432, %T432* %loadaddr
+; ASM: vld1.64
+  %v1 = load %T432, %T432* %loadaddr2
+; ASM: vld1.64
+  %r3 = mul %T432 %v0, %v1 
+; COST: cost of 2 for instruction: {{.*}} mul <4 x i32>
+; ASM: vmul.i32
+  store %T432 %r3, %T432* %storeaddr
+; ASM: vst1.64
+  ret void
+}
+
+define void @ups1632(%T416* %loadaddr, %T416* %loadaddr2, %T432* %storeaddr) {
+; COST: function 'ups1632':
+  %v0 = load %T416, %T416* %loadaddr
+; ASM: vldr
+  %v1 = load %T416, %T416* %loadaddr2
+; ASM: vldr
+  %r1 = sext %T416 %v0 to %T432
+  %r2 = sext %T416 %v1 to %T432
+; COST: cost of 0 for instruction: {{.*}} sext <4 x i16> {{.*}} to <4 x i32>
+  %r3 = mul %T432 %r1, %r2 
+; COST: cost of 2 for instruction: {{.*}} mul <4 x i32>
+; ASM: vmull.s16
+  store %T432 %r3, %T432* %storeaddr
+; ASM: vst1.64
+  ret void
+}
+
+define void @upu1632(%T416* %loadaddr, %T416* %loadaddr2, %T432* %storeaddr) {
+; COST: function 'upu1632':
+  %v0 = load %T416, %T416* %loadaddr
+; ASM: vldr
+  %v1 = load %T416, %T416* %loadaddr2
+; ASM: vldr
+  %r1 = zext %T416 %v0 to %T432
+  %r2 = zext %T416 %v1 to %T432
+; COST: cost of 0 for instruction: {{.*}} zext <4 x i16> {{.*}} to <4 x i32>
+  %r3 = mul %T432 %r1, %r2 
+; COST: cost of 2 for instruction: {{.*}} mul <4 x i32>
+; ASM: vmull.u16
+  store %T432 %r3, %T432* %storeaddr
+; ASM: vst1.64
+  ret void
+}
+
+define void @ups3264(%T232* %loadaddr, %T232* %loadaddr2, %T264* %storeaddr) {
+; COST: function 'ups3264':
+  %v0 = load %T232, %T232* %loadaddr
+; ASM: vldr
+  %v1 = load %T232, %T232* %loadaddr2
+; ASM: vldr
+  %r3 = mul %T232 %v0, %v1 
+; ASM: vmul.i32
+; COST: cost of 1 for instruction: {{.*}} mul <2 x i32>
+  %st = sext %T232 %r3 to %T264
+; ASM: vmovl.s32
+; COST: cost of 1 for instruction: {{.*}} sext <2 x i32> {{.*}} to <2 x i64>
+  store %T264 %st, %T264* %storeaddr
+; ASM: vst1.64
+  ret void
+}
+
+define void @upu3264(%T232* %loadaddr, %T232* %loadaddr2, %T264* %storeaddr) {
+; COST: function 'upu3264':
+  %v0 = load %T232, %T232* %loadaddr
+; ASM: vldr
+  %v1 = load %T232, %T232* %loadaddr2
+; ASM: vldr
+  %r3 = mul %T232 %v0, %v1 
+; ASM: vmul.i32
+; COST: cost of 1 for instruction: {{.*}} mul <2 x i32>
+  %st = zext %T232 %r3 to %T264
+; ASM: vmovl.u32
+; COST: cost of 1 for instruction: {{.*}} zext <2 x i32> {{.*}} to <2 x i64>
+  store %T264 %st, %T264* %storeaddr
+; ASM: vst1.64
+  ret void
+}
+
+define void @dn3216(%T432* %loadaddr, %T432* %loadaddr2, %T416* %storeaddr) {
+; COST: function 'dn3216':
+  %v0 = load %T432, %T432* %loadaddr
+; ASM: vld1.64
+  %v1 = load %T432, %T432* %loadaddr2
+; ASM: vld1.64
+  %r3 = mul %T432 %v0, %v1 
+; ASM: vmul.i32
+; COST: cost of 2 for instruction: {{.*}} mul <4 x i32>
+  %st = trunc %T432 %r3 to %T416
+; ASM: vmovn.i32
+; COST: cost of 1 for instruction: {{.*}} trunc <4 x i32> {{.*}} to <4 x i16>
+  store %T416 %st, %T416* %storeaddr
+; ASM: vstr
+  ret void
+}
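
As a concrete example, the widening multiply that @ups1632 above models could
come from C source like this (hypothetical sketch, not part of the test): the
two i16 inputs are sign-extended and multiplied into an i32 result, which NEON
performs with a single vmull.s16, so the sext is costed at 0 and the mul at 2.

  /* Hypothetical C source for the ups1632 pattern (illustration only). */
  void ups1632(const short a[4], const short b[4], int out[4]) {
    for (int i = 0; i < 4; ++i)
      out[i] = (int)a[i] * (int)b[i];    /* sext + sext + mul -> vmull.s16 */
  }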

Added: llvm/trunk/test/Transforms/LoopVectorize/ARM/sphinx.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/ARM/sphinx.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/ARM/sphinx.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/ARM/sphinx.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,165 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s  -loop-vectorize -mtriple=thumbv8-unknown-unknown -mcpu=cortex-a53 -S | FileCheck %s
+
+; This test is reduced from SPECFP 2006 482.sphinx3.
+; We expect vectorization with <2 x double> and <2 x float> ops; a C-level
+; sketch of the inner loop follows this file.
+; See https://bugs.llvm.org/show_bug.cgi?id=36280 for more details.
+
+
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+
+@a = external global i32
+@v = external global i32
+@mm = external global float**
+@vv = external global float**
+@ll = external global float*
+
+define i32 @test(float* nocapture readonly %x) {
+; CHECK-LABEL: @test(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[T:%.*]] = load i32, i32* @v, align 8
+; CHECK-NEXT:    [[T1:%.*]] = load i32, i32* @a, align 4
+; CHECK-NEXT:    br label [[OUTERLOOP:%.*]]
+; CHECK:       outerloop:
+; CHECK-NEXT:    [[T2:%.*]] = phi i32 [ [[V17:%.*]], [[OUTEREND:%.*]] ], [ [[T1]], [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[J_0136:%.*]] = phi i32 [ [[INC144:%.*]], [[OUTEREND]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT:    [[SCORE_1135:%.*]] = phi i32 [ [[CALL142:%.*]], [[OUTEREND]] ], [ -939524096, [[ENTRY]] ]
+; CHECK-NEXT:    [[T3:%.*]] = load float**, float*** @mm, align 4
+; CHECK-NEXT:    [[ARRAYIDX109:%.*]] = getelementptr inbounds float*, float** [[T3]], i32 [[T2]]
+; CHECK-NEXT:    [[T4:%.*]] = load float*, float** [[ARRAYIDX109]], align 4
+; CHECK-NEXT:    [[T5:%.*]] = load float**, float*** @vv, align 4
+; CHECK-NEXT:    [[ARRAYIDX111:%.*]] = getelementptr inbounds float*, float** [[T5]], i32 [[T2]]
+; CHECK-NEXT:    [[T6:%.*]] = load float*, float** [[ARRAYIDX111]], align 4
+; CHECK-NEXT:    [[T7:%.*]] = load float*, float** @ll, align 4
+; CHECK-NEXT:    [[ARRAYIDX113:%.*]] = getelementptr inbounds float, float* [[T7]], i32 [[T2]]
+; CHECK-NEXT:    [[T8:%.*]] = load float, float* [[ARRAYIDX113]], align 4
+; CHECK-NEXT:    [[CONV114:%.*]] = fpext float [[T8]] to double
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[T]], 2
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[T]], 2
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i32 [[T]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x double> zeroinitializer, double [[CONV114]], i32 0
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <2 x double> [ [[TMP0]], [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i32> undef, i32 [[INDEX]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT]], <2 x i32> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[INDUCTION:%.*]] = add <2 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i32 [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast float* [[TMP3]] to <2 x float>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x float>, <2 x float>* [[TMP4]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, float* [[T4]], i32 [[TMP1]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP5]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast float* [[TMP6]] to <2 x float>*
+; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <2 x float>, <2 x float>* [[TMP7]], align 4
+; CHECK-NEXT:    [[TMP8:%.*]] = fsub fast <2 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]]
+; CHECK-NEXT:    [[TMP9:%.*]] = fpext <2 x float> [[TMP8]] to <2 x double>
+; CHECK-NEXT:    [[TMP10:%.*]] = fmul fast <2 x double> [[TMP9]], [[TMP9]]
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds float, float* [[T6]], i32 [[TMP1]]
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP11]], i32 0
+; CHECK-NEXT:    [[TMP13:%.*]] = bitcast float* [[TMP12]] to <2 x float>*
+; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <2 x float>, <2 x float>* [[TMP13]], align 4
+; CHECK-NEXT:    [[TMP14:%.*]] = fpext <2 x float> [[WIDE_LOAD2]] to <2 x double>
+; CHECK-NEXT:    [[TMP15:%.*]] = fmul fast <2 x double> [[TMP10]], [[TMP14]]
+; CHECK-NEXT:    [[TMP16]] = fsub fast <2 x double> [[VEC_PHI]], [[TMP15]]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 2
+; CHECK-NEXT:    [[TMP17:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <2 x double> [[TMP16]], <2 x double> undef, <2 x i32> <i32 1, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = fadd fast <2 x double> [[TMP16]], [[RDX_SHUF]]
+; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <2 x double> [[BIN_RDX]], i32 0
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[T]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[OUTEREND]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[OUTERLOOP]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi double [ [[CONV114]], [[OUTERLOOP]] ], [ [[TMP18]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    br label [[INNERLOOP:%.*]]
+; CHECK:       innerloop:
+; CHECK-NEXT:    [[I_2132:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC129:%.*]], [[INNERLOOP]] ]
+; CHECK-NEXT:    [[DVAL1_4131:%.*]] = phi double [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SUB127:%.*]], [[INNERLOOP]] ]
+; CHECK-NEXT:    [[ARRAYIDX119:%.*]] = getelementptr inbounds float, float* [[X]], i32 [[I_2132]]
+; CHECK-NEXT:    [[T9:%.*]] = load float, float* [[ARRAYIDX119]], align 4
+; CHECK-NEXT:    [[ARRAYIDX120:%.*]] = getelementptr inbounds float, float* [[T4]], i32 [[I_2132]]
+; CHECK-NEXT:    [[T10:%.*]] = load float, float* [[ARRAYIDX120]], align 4
+; CHECK-NEXT:    [[SUB121:%.*]] = fsub fast float [[T9]], [[T10]]
+; CHECK-NEXT:    [[CONV122:%.*]] = fpext float [[SUB121]] to double
+; CHECK-NEXT:    [[MUL123:%.*]] = fmul fast double [[CONV122]], [[CONV122]]
+; CHECK-NEXT:    [[ARRAYIDX124:%.*]] = getelementptr inbounds float, float* [[T6]], i32 [[I_2132]]
+; CHECK-NEXT:    [[T11:%.*]] = load float, float* [[ARRAYIDX124]], align 4
+; CHECK-NEXT:    [[CONV125:%.*]] = fpext float [[T11]] to double
+; CHECK-NEXT:    [[MUL126:%.*]] = fmul fast double [[MUL123]], [[CONV125]]
+; CHECK-NEXT:    [[SUB127]] = fsub fast double [[DVAL1_4131]], [[MUL126]]
+; CHECK-NEXT:    [[INC129]] = add nuw nsw i32 [[I_2132]], 1
+; CHECK-NEXT:    [[EXITCOND143:%.*]] = icmp eq i32 [[INC129]], [[T]]
+; CHECK-NEXT:    br i1 [[EXITCOND143]], label [[OUTEREND]], label [[INNERLOOP]], !llvm.loop !2
+; CHECK:       outerend:
+; CHECK-NEXT:    [[SUB127_LCSSA:%.*]] = phi double [ [[SUB127]], [[INNERLOOP]] ], [ [[TMP18]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[CONV138:%.*]] = fptosi double [[SUB127_LCSSA]] to i32
+; CHECK-NEXT:    [[CALL142]] = add nuw nsw i32 [[SCORE_1135]], [[CONV138]]
+; CHECK-NEXT:    [[INC144]] = add nuw nsw i32 [[J_0136]], 1
+; CHECK-NEXT:    [[ARRAYIDX102:%.*]] = getelementptr inbounds i32, i32* @a, i32 [[INC144]]
+; CHECK-NEXT:    [[V17]] = load i32, i32* [[ARRAYIDX102]], align 4
+; CHECK-NEXT:    [[CMP103:%.*]] = icmp sgt i32 [[V17]], -1
+; CHECK-NEXT:    br i1 [[CMP103]], label [[OUTERLOOP]], label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret i32 [[CALL142]]
+;
+entry:
+  %t = load i32, i32* @v, align 8
+  %t1 = load i32, i32* @a, align 4
+  br label %outerloop
+
+outerloop:
+  %t2 = phi i32 [ %v17, %outerend ], [ %t1, %entry ]
+  %j.0136 = phi i32 [ %inc144, %outerend ], [ 0, %entry ]
+  %score.1135 = phi i32 [ %call142, %outerend ], [ -939524096, %entry ]
+  %t3 = load float**, float*** @mm, align 4
+  %arrayidx109 = getelementptr inbounds float*, float** %t3, i32 %t2
+  %t4 = load float*, float** %arrayidx109, align 4
+  %t5 = load float**, float*** @vv, align 4
+  %arrayidx111 = getelementptr inbounds float*, float** %t5, i32 %t2
+  %t6 = load float*, float** %arrayidx111, align 4
+  %t7 = load float*, float** @ll, align 4
+  %arrayidx113 = getelementptr inbounds float, float* %t7, i32 %t2
+  %t8 = load float, float* %arrayidx113, align 4
+  %conv114 = fpext float %t8 to double
+  br label %innerloop
+
+innerloop:
+  %i.2132 = phi i32 [ 0, %outerloop ], [ %inc129, %innerloop ]
+  %dval1.4131 = phi double [ %conv114, %outerloop ], [ %sub127, %innerloop ]
+  %arrayidx119 = getelementptr inbounds float, float* %x, i32 %i.2132
+  %t9 = load float, float* %arrayidx119, align 4
+  %arrayidx120 = getelementptr inbounds float, float* %t4, i32 %i.2132
+  %t10 = load float, float* %arrayidx120, align 4
+  %sub121 = fsub fast float %t9, %t10
+  %conv122 = fpext float %sub121 to double
+  %mul123 = fmul fast double %conv122, %conv122
+  %arrayidx124 = getelementptr inbounds float, float* %t6, i32 %i.2132
+  %t11 = load float, float* %arrayidx124, align 4
+  %conv125 = fpext float %t11 to double
+  %mul126 = fmul fast double %mul123, %conv125
+  %sub127 = fsub fast double %dval1.4131, %mul126
+  %inc129 = add nuw nsw i32 %i.2132, 1
+  %exitcond143 = icmp eq i32 %inc129, %t
+  br i1 %exitcond143, label %outerend, label %innerloop
+
+outerend:
+  %sub127.lcssa = phi double [ %sub127, %innerloop ]
+  %conv138 = fptosi double %sub127.lcssa to i32
+  %call142 = add nuw nsw i32 %score.1135, %conv138
+  %inc144 = add nuw nsw i32 %j.0136, 1
+  %arrayidx102 = getelementptr inbounds i32, i32* @a, i32 %inc144
+  %v17 = load i32, i32* %arrayidx102, align 4
+  %cmp103 = icmp sgt i32 %v17, -1
+  br i1 %cmp103, label %outerloop, label %exit
+
+exit:
+  ret i32 %call142
+}
+
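
The inner loop above corresponds roughly to the following C (a hypothetical
sketch in the spirit of the reduced benchmark): a float difference is squared,
scaled and accumulated into a double, so the vectorizer has to mix <2 x float>
arithmetic with <2 x double> fpext/fmul/fsub operations.

  /* Hypothetical C source for the inner loop of @test (illustration only). */
  double inner_loop(const float *x, const float *m, const float *v,
                    double dval, int t) {
    for (int i = 0; i < t; ++i) {
      double d = (double)(x[i] - m[i]);  /* fsub float, fpext to double */
      dval -= d * d * (double)v[i];      /* double fmul/fmul/fsub chain */
    }
    return dval;
  }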

Added: llvm/trunk/test/Transforms/LoopVectorize/ARM/vector_cast.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/ARM/vector_cast.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/ARM/vector_cast.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/ARM/vector_cast.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,37 @@
+; RUN: opt -loop-vectorize -tbaa -S -mattr=+neon < %s | FileCheck %s
+
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "armv7--linux-gnueabi"
+
+; This requires the loop vectorizer to create an interleaved access group
+; for the stores to the struct. Here we need to perform a bitcast from a vector
+; of pointers to a vector of i32s (a C-level sketch of the loop follows this file).
+
+%class.A = type { i8*, i32 }
+
+; CHECK-LABEL: test0
+define void @test0(%class.A* %StartPtr, %class.A* %APtr) {
+entry:
+  br label %for.body.i
+
+for.body.i:
+  %addr = phi %class.A* [ %StartPtr, %entry ], [ %incdec.ptr.i, %for.body.i ]
+  %Data.i.i = getelementptr inbounds %class.A, %class.A* %addr, i32 0, i32 0
+  store i8* null, i8** %Data.i.i, align 4, !tbaa !8
+  %Length.i.i = getelementptr inbounds %class.A, %class.A* %addr, i32 0, i32 1
+  store i32 0, i32* %Length.i.i, align 4, !tbaa !11
+  %incdec.ptr.i = getelementptr inbounds %class.A, %class.A* %addr, i32 1
+  %cmp.i = icmp eq %class.A* %incdec.ptr.i, %APtr
+  br i1 %cmp.i, label %exit, label %for.body.i
+
+exit:
+  ret void
+}
+
+!5 = !{!"any pointer", !6, i64 0}
+!6 = !{!"omnipotent char", !7, i64 0}
+!7 = !{!"Simple C/C++ TBAA"}
+!8 = !{!9, !5, i64 0}
+!9 = !{!"some struct", !5, i64 0, !10, i64 4}
+!10 = !{!"int", !6, i64 0}
+!11 = !{!9, !10, i64 4}
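
The access pattern described above looks roughly like this in C (hypothetical
sketch, not part of the test): each iteration stores a pointer-sized field and
an i32 field of the same struct, so the stores form one interleaved group and
the vector of pointer elements has to be bitcast to a vector of i32s to emit a
single wide store.

  /* Hypothetical C source for @test0 (illustration only). */
  struct A { char *Data; int Length; };

  void test0(struct A *StartPtr, struct A *APtr) {
    for (struct A *p = StartPtr; p != APtr; ++p) {
      p->Data = 0;                       /* i8* store: one group member */
      p->Length = 0;                     /* i32 store: the other member */
    }
  }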

Added: llvm/trunk/test/Transforms/LoopVectorize/ARM/width-detect.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/ARM/width-detect.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/ARM/width-detect.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/ARM/width-detect.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,52 @@
+; RUN: opt < %s  -loop-vectorize -mtriple=thumbv7-apple-ios3.0.0 -S | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32"
+target triple = "thumbv7-apple-ios3.0.0"
+
+;CHECK:foo_F32
+;CHECK: <4 x float>
+;CHECK:ret
+define float @foo_F32(float* nocapture %A, i32 %n) nounwind uwtable readonly ssp {
+  %1 = icmp sgt i32 %n, 0
+  br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph:                                           ; preds = %0, %.lr.ph
+  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
+  %prod.01 = phi float [ %4, %.lr.ph ], [ 0.000000e+00, %0 ]
+  %2 = getelementptr inbounds float, float* %A, i64 %indvars.iv
+  %3 = load float, float* %2, align 8
+  %4 = fmul fast float %prod.01, %3
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge:                                      ; preds = %.lr.ph, %0
+  %prod.0.lcssa = phi float [ 0.000000e+00, %0 ], [ %4, %.lr.ph ]
+  ret float %prod.0.lcssa
+}
+
+;CHECK:foo_I8
+;CHECK: xor <16 x i8>
+;CHECK:ret
+define signext i8 @foo_I8(i8* nocapture %A, i32 %n) nounwind uwtable readonly ssp {
+  %1 = icmp sgt i32 %n, 0
+  br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph:                                           ; preds = %0, %.lr.ph
+  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
+  %red.01 = phi i8 [ %4, %.lr.ph ], [ 0, %0 ]
+  %2 = getelementptr inbounds i8, i8* %A, i64 %indvars.iv
+  %3 = load i8, i8* %2, align 1
+  %4 = xor i8 %3, %red.01
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge:                                      ; preds = %.lr.ph, %0
+  %red.0.lcssa = phi i8 [ 0, %0 ], [ %4, %.lr.ph ]
+  ret i8 %red.0.lcssa
+}
+
+

Added: llvm/trunk/test/Transforms/LoopVectorize/Hexagon/lit.local.cfg
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/Hexagon/lit.local.cfg?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/Hexagon/lit.local.cfg (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/Hexagon/lit.local.cfg Tue Apr 16 21:52:47 2019
@@ -0,0 +1,2 @@
+if 'Hexagon' not in config.root.targets:
+    config.unsupported = True

Added: llvm/trunk/test/Transforms/LoopVectorize/Hexagon/minimum-vf.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/Hexagon/minimum-vf.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/Hexagon/minimum-vf.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/Hexagon/minimum-vf.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,173 @@
+; RUN: opt -march=hexagon -loop-vectorize -hexagon-autohvx -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s
+; REQUIRES: asserts
+
+; Check that TTI::getMinimumVF works: the MaxVF calculated from register
+; pressure is less than 64, so it should be overridden by the target's minimum
+; (a sketch of that override follows this file).
+; CHECK: LV: Overriding calculated MaxVF({{[0-9]+}}) with target's minimum: 64
+
+target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048"
+target triple = "hexagon"
+
+%s.0 = type { i8*, i32, i32, i32, i32 }
+
+@g0 = external dso_local local_unnamed_addr global %s.0**, align 4
+
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #0
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #0
+
+; Function Attrs: nounwind
+define hidden fastcc void @f0(i8* nocapture %a0, i32 %a1, i32 %a2, i32 %a3, i32 %a4, i8 zeroext %a5) unnamed_addr #1 {
+b0:
+  %v0 = alloca [4 x [9 x i16]], align 8
+  %v1 = bitcast [4 x [9 x i16]]* %v0 to i8*
+  call void @llvm.lifetime.start.p0i8(i64 72, i8* nonnull %v1) #2
+  %v2 = add i32 %a1, -2
+  %v3 = add i32 %a3, -9
+  %v4 = icmp ugt i32 %v2, %v3
+  %v5 = add i32 %a2, -2
+  %v6 = add i32 %a4, -9
+  %v7 = icmp ugt i32 %v5, %v6
+  %v8 = or i1 %v4, %v7
+  %v9 = load %s.0**, %s.0*** @g0, align 4, !tbaa !1
+  %v10 = zext i8 %a5 to i32
+  %v11 = getelementptr inbounds %s.0*, %s.0** %v9, i32 %v10
+  %v12 = load %s.0*, %s.0** %v11, align 4, !tbaa !1
+  %v13 = getelementptr inbounds %s.0, %s.0* %v12, i32 0, i32 0
+  %v14 = load i8*, i8** %v13, align 4, !tbaa !5
+  br i1 %v8, label %b1, label %b2
+
+b1:                                               ; preds = %b1, %b0
+  %v15 = phi i32 [ 0, %b0 ], [ %v119, %b1 ]
+  %v16 = add i32 %v5, %v15
+  %v17 = icmp slt i32 %v16, 0
+  %v18 = icmp slt i32 %v16, %a4
+  %v19 = select i1 %v18, i32 %v16, i32 %v3
+  %v20 = select i1 %v17, i32 0, i32 %v19
+  %v21 = mul i32 %v20, %a3
+  %v22 = add i32 97, %v21
+  %v23 = getelementptr inbounds i8, i8* %v14, i32 %v22
+  %v24 = load i8, i8* %v23, align 1, !tbaa !8
+  %v25 = zext i8 %v24 to i32
+  %v26 = add i32 101, %v21
+  %v27 = getelementptr inbounds i8, i8* %v14, i32 %v26
+  %v28 = load i8, i8* %v27, align 1, !tbaa !8
+  %v29 = zext i8 %v28 to i32
+  %v30 = mul nsw i32 %v29, -5
+  %v31 = add nsw i32 %v30, %v25
+  %v32 = add i32 106, %v21
+  %v33 = getelementptr inbounds i8, i8* %v14, i32 %v32
+  %v34 = load i8, i8* %v33, align 1, !tbaa !8
+  %v35 = zext i8 %v34 to i32
+  %v36 = mul nuw nsw i32 %v35, 20
+  %v37 = add nsw i32 %v36, %v31
+  %v38 = add i32 111, %v21
+  %v39 = getelementptr inbounds i8, i8* %v14, i32 %v38
+  %v40 = load i8, i8* %v39, align 1, !tbaa !8
+  %v41 = zext i8 %v40 to i32
+  %v42 = mul nuw nsw i32 %v41, 20
+  %v43 = add nsw i32 %v42, %v37
+  %v44 = add i32 116, %v21
+  %v45 = getelementptr inbounds i8, i8* %v14, i32 %v44
+  %v46 = load i8, i8* %v45, align 1, !tbaa !8
+  %v47 = zext i8 %v46 to i32
+  %v48 = mul nsw i32 %v47, -5
+  %v49 = add nsw i32 %v48, %v43
+  %v50 = add i32 120, %v21
+  %v51 = getelementptr inbounds i8, i8* %v14, i32 %v50
+  %v52 = load i8, i8* %v51, align 1, !tbaa !8
+  %v53 = zext i8 %v52 to i32
+  %v54 = add nsw i32 %v49, %v53
+  %v55 = trunc i32 %v54 to i16
+  %v56 = getelementptr inbounds [4 x [9 x i16]], [4 x [9 x i16]]* %v0, i32 0, i32 0, i32 %v15
+  store i16 %v55, i16* %v56, align 2, !tbaa !9
+  %v57 = mul nsw i32 %v35, -5
+  %v58 = add nsw i32 %v57, %v29
+  %v59 = add nsw i32 %v42, %v58
+  %v60 = mul nuw nsw i32 %v47, 20
+  %v61 = add nsw i32 %v60, %v59
+  %v62 = mul nsw i32 %v53, -5
+  %v63 = add nsw i32 %v62, %v61
+  %v64 = add i32 125, %v21
+  %v65 = getelementptr inbounds i8, i8* %v14, i32 %v64
+  %v66 = load i8, i8* %v65, align 1, !tbaa !8
+  %v67 = zext i8 %v66 to i32
+  %v68 = add nsw i32 %v63, %v67
+  %v69 = trunc i32 %v68 to i16
+  %v70 = getelementptr inbounds [4 x [9 x i16]], [4 x [9 x i16]]* %v0, i32 0, i32 1, i32 %v15
+  store i16 %v69, i16* %v70, align 2, !tbaa !9
+  %v71 = mul nsw i32 %v41, -5
+  %v72 = add nsw i32 %v71, %v35
+  %v73 = add nsw i32 %v60, %v72
+  %v74 = mul nuw nsw i32 %v53, 20
+  %v75 = add nsw i32 %v74, %v73
+  %v76 = mul nsw i32 %v67, -5
+  %v77 = add nsw i32 %v76, %v75
+  %v78 = add i32 130, %v21
+  %v79 = getelementptr inbounds i8, i8* %v14, i32 %v78
+  %v80 = load i8, i8* %v79, align 1, !tbaa !8
+  %v81 = zext i8 %v80 to i32
+  %v82 = add nsw i32 %v77, %v81
+  %v83 = trunc i32 %v82 to i16
+  %v84 = getelementptr inbounds [4 x [9 x i16]], [4 x [9 x i16]]* %v0, i32 0, i32 2, i32 %v15
+  store i16 %v83, i16* %v84, align 2, !tbaa !9
+  %v85 = add i32 92, %v21
+  %v86 = getelementptr inbounds i8, i8* %v14, i32 %v85
+  %v87 = load i8, i8* %v86, align 1, !tbaa !8
+  %v88 = zext i8 %v87 to i16
+  %v89 = add i32 135, %v21
+  %v90 = getelementptr inbounds i8, i8* %v14, i32 %v89
+  %v91 = load i8, i8* %v90, align 1, !tbaa !8
+  %v92 = zext i8 %v91 to i16
+  %v93 = mul nsw i16 %v92, -5
+  %v94 = add nsw i16 %v93, %v88
+  %v95 = add i32 140, %v21
+  %v96 = getelementptr inbounds i8, i8* %v14, i32 %v95
+  %v97 = load i8, i8* %v96, align 1, !tbaa !8
+  %v98 = zext i8 %v97 to i16
+  %v99 = mul nuw nsw i16 %v98, 20
+  %v100 = add nsw i16 %v99, %v94
+  %v101 = add i32 145, %v21
+  %v102 = getelementptr inbounds i8, i8* %v14, i32 %v101
+  %v103 = load i8, i8* %v102, align 1, !tbaa !8
+  %v104 = zext i8 %v103 to i16
+  %v105 = mul nuw nsw i16 %v104, 20
+  %v106 = add i16 %v105, %v100
+  %v107 = add i32 150, %v21
+  %v108 = getelementptr inbounds i8, i8* %v14, i32 %v107
+  %v109 = load i8, i8* %v108, align 1, !tbaa !8
+  %v110 = zext i8 %v109 to i16
+  %v111 = mul nsw i16 %v110, -5
+  %v112 = add i16 %v111, %v106
+  %v113 = add i32 154, %v21
+  %v114 = getelementptr inbounds i8, i8* %v14, i32 %v113
+  %v115 = load i8, i8* %v114, align 1, !tbaa !8
+  %v116 = zext i8 %v115 to i16
+  %v117 = add i16 %v112, %v116
+  %v118 = getelementptr inbounds [4 x [9 x i16]], [4 x [9 x i16]]* %v0, i32 0, i32 3, i32 %v15
+  store i16 %v117, i16* %v118, align 2, !tbaa !9
+  %v119 = add nuw nsw i32 %v15, 1
+  %v120 = icmp eq i32 %v119, 19
+  br i1 %v120, label %b2, label %b1
+
+b2:                                               ; preds = %b1, %b0
+  call void @llvm.lifetime.end.p0i8(i64 72, i8* nonnull %v1) #2
+  ret void
+}
+
+attributes #0 = { argmemonly nounwind }
+attributes #1 = { nounwind "target-cpu"="hexagonv60" "target-features"="+hvx-length64b,+hvxv60" }
+attributes #2 = { nounwind }
+
+!llvm.module.flags = !{!0}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{!2, !2, i64 0}
+!2 = !{!"any pointer", !3, i64 0}
+!3 = !{!"omnipotent char", !4, i64 0}
+!4 = !{!"Simple C/C++ TBAA"}
+!5 = !{!6, !2, i64 0}
+!6 = !{!"", !2, i64 0, !7, i64 4, !7, i64 8, !7, i64 12, !7, i64 16}
+!7 = !{!"int", !3, i64 0}
+!8 = !{!3, !3, i64 0}
+!9 = !{!10, !10, i64 0}
+!10 = !{!"short", !3, i64 0}
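
The CHECK line near the top of this file observes the clamping step in VF
selection. Conceptually it behaves like the sketch below (names and structure
are hypothetical, not the actual LoopVectorize code): whenever the MaxVF
computed from register pressure falls below the target's minimum, the minimum
wins.

  /* Hypothetical sketch of the MaxVF override the CHECK line observes. */
  unsigned clampMaxVF(unsigned CalculatedMaxVF, unsigned TargetMinimumVF) {
    if (TargetMinimumVF > CalculatedMaxVF) {
      /* "LV: Overriding calculated MaxVF(N) with target's minimum: 64" */
      return TargetMinimumVF;            /* 64 for HVX in this test */
    }
    return CalculatedMaxVF;
  }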

Added: llvm/trunk/test/Transforms/LoopVectorize/PowerPC/agg-interleave-a2.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/PowerPC/agg-interleave-a2.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/PowerPC/agg-interleave-a2.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/PowerPC/agg-interleave-a2.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,40 @@
+; RUN: opt -S -basicaa -loop-vectorize < %s | FileCheck %s
+target datalayout = "E-m:e-i64:64-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+; Function Attrs: nounwind
+define void @foo(double* noalias nocapture %a, double* noalias nocapture readonly %b, double* noalias nocapture readonly %c) #0 {
+entry:
+  br label %for.body
+
+; CHECK-LABEL: @foo
+; CHECK: fmul <4 x double> %{{[^,]+}}, <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>
+; CHECK-NEXT: fmul <4 x double> %{{[^,]+}}, <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>
+
+for.cond.cleanup:                                 ; preds = %for.body
+  ret void
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds double, double* %b, i64 %indvars.iv
+  %0 = load double, double* %arrayidx, align 8
+  %mul = fmul double %0, 2.000000e+00
+  %mul3 = fmul double %0, %mul
+  %arrayidx5 = getelementptr inbounds double, double* %c, i64 %indvars.iv
+  %1 = load double, double* %arrayidx5, align 8
+  %mul6 = fmul double %1, 3.000000e+00
+  %mul9 = fmul double %1, %mul6
+  %add = fadd double %mul3, %mul9
+  %mul12 = fmul double %0, 4.000000e+00
+  %mul15 = fmul double %mul12, %1
+  %add16 = fadd double %mul15, %add
+  %add17 = fadd double %add16, 1.000000e+00
+  %arrayidx19 = getelementptr inbounds double, double* %a, i64 %indvars.iv
+  store double %add17, double* %arrayidx19, align 8
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1600
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+attributes #0 = { nounwind "target-cpu"="a2q" }
+

Added: llvm/trunk/test/Transforms/LoopVectorize/PowerPC/large-loop-rdx.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/PowerPC/large-loop-rdx.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/PowerPC/large-loop-rdx.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/PowerPC/large-loop-rdx.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,75 @@
+; RUN: opt < %s -loop-vectorize -S | FileCheck %s
+
+; CHECK: fadd
+; CHECK-NEXT: fadd
+; CHECK-NEXT: fadd
+; CHECK-NEXT: fadd
+; CHECK-NEXT: fadd
+; CHECK-NEXT: fadd
+; CHECK-NEXT: fadd
+; CHECK-NEXT: fadd
+; CHECK-NEXT: fadd
+; CHECK-NEXT: fadd
+; CHECK-NEXT: fadd
+; CHECK-NEXT: fadd
+; CHECK-NEXT: =
+; CHECK-NOT: fadd
+; CHECK-SAME: >
+
+target datalayout = "e-m:e-i64:64-n32:64"
+target triple = "powerpc64le-ibm-linux-gnu"
+
+define void @QLA_F3_r_veq_norm2_V(float* noalias nocapture %r, [3 x { float, float }]* noalias nocapture readonly %a, i32 signext %n) #0 {
+entry:
+  %cmp24 = icmp sgt i32 %n, 0
+  br i1 %cmp24, label %for.cond1.preheader.preheader, label %for.end13
+
+for.cond1.preheader.preheader:                    ; preds = %entry
+  br label %for.cond1.preheader
+
+for.cond1.preheader:                              ; preds = %for.cond1.preheader.preheader, %for.cond1.preheader
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.cond1.preheader ], [ 0, %for.cond1.preheader.preheader ]
+  %sum.026 = phi double [ %add10.2, %for.cond1.preheader ], [ 0.000000e+00, %for.cond1.preheader.preheader ]
+  %arrayidx5.realp = getelementptr inbounds [3 x { float, float }], [3 x { float, float }]* %a, i64 %indvars.iv, i64 0, i32 0
+  %arrayidx5.real = load float, float* %arrayidx5.realp, align 8
+  %arrayidx5.imagp = getelementptr inbounds [3 x { float, float }], [3 x { float, float }]* %a, i64 %indvars.iv, i64 0, i32 1
+  %arrayidx5.imag = load float, float* %arrayidx5.imagp, align 8
+  %mul = fmul fast float %arrayidx5.real, %arrayidx5.real
+  %mul9 = fmul fast float %arrayidx5.imag, %arrayidx5.imag
+  %add = fadd fast float %mul9, %mul
+  %conv = fpext float %add to double
+  %add10 = fadd fast double %conv, %sum.026
+  %arrayidx5.realp.1 = getelementptr inbounds [3 x { float, float }], [3 x { float, float }]* %a, i64 %indvars.iv, i64 1, i32 0
+  %arrayidx5.real.1 = load float, float* %arrayidx5.realp.1, align 8
+  %arrayidx5.imagp.1 = getelementptr inbounds [3 x { float, float }], [3 x { float, float }]* %a, i64 %indvars.iv, i64 1, i32 1
+  %arrayidx5.imag.1 = load float, float* %arrayidx5.imagp.1, align 8
+  %mul.1 = fmul fast float %arrayidx5.real.1, %arrayidx5.real.1
+  %mul9.1 = fmul fast float %arrayidx5.imag.1, %arrayidx5.imag.1
+  %add.1 = fadd fast float %mul9.1, %mul.1
+  %conv.1 = fpext float %add.1 to double
+  %add10.1 = fadd fast double %conv.1, %add10
+  %arrayidx5.realp.2 = getelementptr inbounds [3 x { float, float }], [3 x { float, float }]* %a, i64 %indvars.iv, i64 2, i32 0
+  %arrayidx5.real.2 = load float, float* %arrayidx5.realp.2, align 8
+  %arrayidx5.imagp.2 = getelementptr inbounds [3 x { float, float }], [3 x { float, float }]* %a, i64 %indvars.iv, i64 2, i32 1
+  %arrayidx5.imag.2 = load float, float* %arrayidx5.imagp.2, align 8
+  %mul.2 = fmul fast float %arrayidx5.real.2, %arrayidx5.real.2
+  %mul9.2 = fmul fast float %arrayidx5.imag.2, %arrayidx5.imag.2
+  %add.2 = fadd fast float %mul9.2, %mul.2
+  %conv.2 = fpext float %add.2 to double
+  %add10.2 = fadd fast double %conv.2, %add10.1
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.cond.for.end13_crit_edge, label %for.cond1.preheader
+
+for.cond.for.end13_crit_edge:                     ; preds = %for.cond1.preheader
+  %add10.2.lcssa = phi double [ %add10.2, %for.cond1.preheader ]
+  %phitmp = fptrunc double %add10.2.lcssa to float
+  br label %for.end13
+
+for.end13:                                        ; preds = %for.cond.for.end13_crit_edge, %entry
+  %sum.0.lcssa = phi float [ %phitmp, %for.cond.for.end13_crit_edge ], [ 0.000000e+00, %entry ]
+  store float %sum.0.lcssa, float* %r, align 4
+  ret void
+}
+

Added: llvm/trunk/test/Transforms/LoopVectorize/PowerPC/lit.local.cfg
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/PowerPC/lit.local.cfg?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/PowerPC/lit.local.cfg (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/PowerPC/lit.local.cfg Tue Apr 16 21:52:47 2019
@@ -0,0 +1,3 @@
+if 'PowerPC' not in config.root.targets:
+    config.unsupported = True
+

Added: llvm/trunk/test/Transforms/LoopVectorize/PowerPC/pr30990.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/PowerPC/pr30990.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/PowerPC/pr30990.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/PowerPC/pr30990.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,140 @@
+; RUN: opt < %s -loop-vectorize -mcpu=pwr8 -mattr=+vsx -force-vector-interleave=1 -vectorizer-maximize-bandwidth=0 -S | FileCheck %s
+
+target triple = "powerpc64-unknown-linux-gnu"
+
+define signext i32 @foo(i8* readonly %ptr, i32 signext %l) {
+entry:
+  %idx.ext = sext i32 %l to i64
+  %add.ptr = getelementptr inbounds i8, i8* %ptr, i64 %idx.ext
+  %cmp7 = icmp sgt i32 %l, 0
+  br i1 %cmp7, label %while.body.preheader, label %while.end
+
+while.body.preheader:                             ; preds = %entry
+  br label %while.body
+
+while.body:                                       ; preds = %while.body.preheader, %while.body
+  %count.09 = phi i32 [ %add, %while.body ], [ 0, %while.body.preheader ]
+  %ptr.addr.08 = phi i8* [ %incdec.ptr, %while.body ], [ %ptr, %while.body.preheader ]
+  %0 = load i8, i8* %ptr.addr.08, align 1
+  %cmp1 = icmp slt i8 %0, -64
+  %cond = zext i1 %cmp1 to i32
+  %add = add nsw i32 %cond, %count.09
+  %incdec.ptr = getelementptr inbounds i8, i8* %ptr.addr.08, i64 1
+  %cmp = icmp ult i8* %incdec.ptr, %add.ptr
+  br i1 %cmp, label %while.body, label %while.end.loopexit
+
+while.end.loopexit:                               ; preds = %while.body
+  %add.lcssa = phi i32 [ %add, %while.body ]
+  br label %while.end
+
+while.end:                                        ; preds = %while.end.loopexit, %entry
+  %count.0.lcssa = phi i32 [ 0, %entry ], [ %add.lcssa, %while.end.loopexit ]
+  ret i32 %count.0.lcssa
+
+; CHECK: load <4 x i8>
+; CHECK: icmp slt <4 x i8>
+}
+
+
+define signext i16 @foo2(i8* readonly %ptr, i32 signext %l) {
+entry:
+  %idx.ext = sext i32 %l to i64 
+  %add.ptr = getelementptr inbounds i8, i8* %ptr, i64 %idx.ext
+  %cmp7 = icmp sgt i32 %l, 0
+  br i1 %cmp7, label %while.body.preheader, label %while.end
+
+while.body.preheader:                             ; preds = %entry
+  br label %while.body
+
+while.body:                                       ; preds = %while.body.preheader, %while.body
+  %count.09 = phi i16 [ %add, %while.body ], [ 0, %while.body.preheader ]
+  %ptr.addr.08 = phi i8* [ %incdec.ptr, %while.body ], [ %ptr, %while.body.preheader ]
+  %0 = load i8, i8* %ptr.addr.08, align 1
+  %cmp1 = icmp slt i8 %0, -64 
+  %cond = zext i1 %cmp1 to i16 
+  %add = add nsw i16 %cond, %count.09
+  %incdec.ptr = getelementptr inbounds i8, i8* %ptr.addr.08, i64 1
+  %cmp = icmp ult i8* %incdec.ptr, %add.ptr
+  br i1 %cmp, label %while.body, label %while.end.loopexit
+
+while.end.loopexit:                               ; preds = %while.body
+  %add.lcssa = phi i16 [ %add, %while.body ]
+  br label %while.end
+
+while.end:                                        ; preds = %while.end.loopexit, %entry
+  %count.0.lcssa = phi i16 [ 0, %entry ], [ %add.lcssa, %while.end.loopexit ]
+  ret i16 %count.0.lcssa
+
+; CHECK-LABEL: foo2
+; CHECK: load <8 x i8>
+; CHECK: icmp slt <8 x i8>
+}
+
+define signext i32 @foo3(i16* readonly %ptr, i32 signext %l) {
+entry:
+  %idx.ext = sext i32 %l to i64 
+  %add.ptr = getelementptr inbounds i16, i16* %ptr, i64 %idx.ext
+  %cmp7 = icmp sgt i32 %l, 0
+  br i1 %cmp7, label %while.body.preheader, label %while.end
+
+while.body.preheader:                             ; preds = %entry
+  br label %while.body
+
+while.body:                                       ; preds = %while.body.preheader, %while.body
+  %count.09 = phi i32 [ %add, %while.body ], [ 0, %while.body.preheader ]
+  %ptr.addr.16 = phi i16* [ %incdec.ptr, %while.body ], [ %ptr, %while.body.preheader ]
+  %0 = load i16, i16* %ptr.addr.16, align 1
+  %cmp1 = icmp slt i16 %0, -64 
+  %cond = zext i1 %cmp1 to i32 
+  %add = add nsw i32 %cond, %count.09
+  %incdec.ptr = getelementptr inbounds i16, i16* %ptr.addr.16, i64 1
+  %cmp = icmp ult i16* %incdec.ptr, %add.ptr
+  br i1 %cmp, label %while.body, label %while.end.loopexit
+
+while.end.loopexit:                               ; preds = %while.body
+  %add.lcssa = phi i32 [ %add, %while.body ]
+  br label %while.end
+
+while.end:                                        ; preds = %while.end.loopexit, %entry
+  %count.0.lcssa = phi i32 [ 0, %entry ], [ %add.lcssa, %while.end.loopexit ]
+  ret i32 %count.0.lcssa
+
+; CHECK-LABEL: foo3
+; CHECK: load <4 x i16>
+; CHECK: icmp slt <4 x i16>
+}
+
+define i64 @foo4(i16* readonly %ptr, i32 signext %l) {
+entry:
+  %idx.ext = sext i32 %l to i64 
+  %add.ptr = getelementptr inbounds i16, i16* %ptr, i64 %idx.ext
+  %cmp7 = icmp sgt i32 %l, 0
+  br i1 %cmp7, label %while.body.preheader, label %while.end
+
+while.body.preheader:                             ; preds = %entry
+  br label %while.body
+
+while.body:                                       ; preds = %while.body.preheader, %while.body
+  %count.09 = phi i64 [ %add, %while.body ], [ 0, %while.body.preheader ]
+  %ptr.addr.16 = phi i16* [ %incdec.ptr, %while.body ], [ %ptr, %while.body.preheader ]
+  %0 = load i16, i16* %ptr.addr.16, align 1
+  %cmp1 = icmp slt i16 %0, -64 
+  %cond = zext i1 %cmp1 to i64 
+  %add = add nsw i64 %cond, %count.09
+  %incdec.ptr = getelementptr inbounds i16, i16* %ptr.addr.16, i64 1
+  %cmp = icmp ult i16* %incdec.ptr, %add.ptr
+  br i1 %cmp, label %while.body, label %while.end.loopexit
+
+while.end.loopexit:                               ; preds = %while.body
+  %add.lcssa = phi i64 [ %add, %while.body ]
+  br label %while.end
+
+while.end:                                        ; preds = %while.end.loopexit, %entry
+  %count.0.lcssa = phi i64 [ 0, %entry ], [ %add.lcssa, %while.end.loopexit ]
+  ret i64 %count.0.lcssa
+
+; CHECK-LABEL: foo4
+; CHECK: load <2 x i16>
+; CHECK: icmp slt <2 x i16>
+}
+

Added: llvm/trunk/test/Transforms/LoopVectorize/PowerPC/small-loop-rdx.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/PowerPC/small-loop-rdx.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/PowerPC/small-loop-rdx.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/PowerPC/small-loop-rdx.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,49 @@
+; RUN: opt < %s -loop-vectorize -S | FileCheck %s
+
+; CHECK: vector.body:
+; CHECK: fadd
+; CHECK-NEXT: fadd
+; CHECK-NEXT: fadd
+; CHECK-NEXT: fadd
+; CHECK-NEXT: fadd
+; CHECK-NEXT: fadd
+; CHECK-NEXT: fadd
+; CHECK-NEXT: fadd
+; CHECK-NEXT: fadd
+; CHECK-NEXT: fadd
+; CHECK-NEXT: fadd
+; CHECK-NEXT: fadd
+; CHECK-NOT: fadd
+; CHECK: middle.block
+
+target datalayout = "e-m:e-i64:64-n32:64"
+target triple = "powerpc64le-ibm-linux-gnu"
+
+define void @test(double* nocapture readonly %arr, i32 signext %len) #0 {
+entry:
+  %cmp4 = icmp sgt i32 %len, 0
+  br i1 %cmp4, label %for.body.lr.ph, label %for.end
+
+for.body.lr.ph:                                   ; preds = %entry
+  %0 = add i32 %len, -1
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %for.body.lr.ph
+  %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ]
+  %redx.05 = phi double [ 0.000000e+00, %for.body.lr.ph ], [ %add, %for.body ]
+  %arrayidx = getelementptr inbounds double, double* %arr, i64 %indvars.iv
+  %1 = load double, double* %arrayidx, align 8
+  %add = fadd fast double %1, %redx.05
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %0
+  br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:                                 ; preds = %for.body
+  %add.lcssa = phi double [ %add, %for.body ]
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %entry
+  %redx.0.lcssa = phi double [ 0.000000e+00, %entry ], [ %add.lcssa, %for.end.loopexit ]
+  ret void
+}

Added: llvm/trunk/test/Transforms/LoopVectorize/PowerPC/stride-vectorization.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/PowerPC/stride-vectorization.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/PowerPC/stride-vectorization.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/PowerPC/stride-vectorization.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,36 @@
+; RUN: opt -S -basicaa -loop-vectorize < %s | FileCheck %s
+target datalayout = "E-m:e-i64:64-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+; Function Attrs: nounwind
+define void @foo(double* noalias nocapture %a, double* noalias nocapture readonly %b) #0 {
+entry:
+  br label %for.body
+
+; CHECK-LABEL: @foo
+; CHECK: <2 x double>
+
+for.cond.cleanup:                                 ; preds = %for.body
+  ret void
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %0 = shl nsw i64 %indvars.iv, 1
+  %odd.idx = add nsw i64 %0, 1
+
+  %arrayidx = getelementptr inbounds double, double* %b, i64 %0
+  %arrayidx.odd = getelementptr inbounds double, double* %b, i64 %odd.idx
+
+  %1 = load double, double* %arrayidx, align 8
+  %2 = load double, double* %arrayidx.odd, align 8
+
+  %add = fadd double %1, %2
+  %arrayidx2 = getelementptr inbounds double, double* %a, i64 %indvars.iv
+  store double %add, double* %arrayidx2, align 8
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1600
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+attributes #0 = { nounwind "target-cpu"="pwr8" }
+

Added: llvm/trunk/test/Transforms/LoopVectorize/PowerPC/vectorize-only-for-real.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/PowerPC/vectorize-only-for-real.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/PowerPC/vectorize-only-for-real.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/PowerPC/vectorize-only-for-real.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,62 @@
+; RUN: opt -S -loop-vectorize < %s | FileCheck %s
+target datalayout = "E-m:e-i64:64-n32:64"
+target triple = "powerpc64-bgq-linux"
+
+; Function Attrs: nounwind
+define zeroext i32 @test() #0 {
+; CHECK-LABEL: @test
+; CHECK-NOT: x i32>
+
+entry:
+  %a = alloca [1600 x i32], align 4
+  %c = alloca [1600 x i32], align 4
+  %0 = bitcast [1600 x i32]* %a to i8*
+  call void @llvm.lifetime.start(i64 6400, i8* %0) #3
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body
+  %1 = bitcast [1600 x i32]* %c to i8*
+  call void @llvm.lifetime.start(i64 6400, i8* %1) #3
+  %arraydecay = getelementptr inbounds [1600 x i32], [1600 x i32]* %a, i64 0, i64 0
+  %arraydecay1 = getelementptr inbounds [1600 x i32], [1600 x i32]* %c, i64 0, i64 0
+  %call = call signext i32 @bar(i32* %arraydecay, i32* %arraydecay1) #3
+  br label %for.body6
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv25 = phi i64 [ 0, %entry ], [ %indvars.iv.next26, %for.body ]
+  %arrayidx = getelementptr inbounds [1600 x i32], [1600 x i32]* %a, i64 0, i64 %indvars.iv25
+  %2 = trunc i64 %indvars.iv25 to i32
+  store i32 %2, i32* %arrayidx, align 4
+  %indvars.iv.next26 = add nuw nsw i64 %indvars.iv25, 1
+  %exitcond27 = icmp eq i64 %indvars.iv.next26, 1600
+  br i1 %exitcond27, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup5:                                ; preds = %for.body6
+  call void @llvm.lifetime.end(i64 6400, i8* nonnull %1) #3
+  call void @llvm.lifetime.end(i64 6400, i8* %0) #3
+  ret i32 %add
+
+for.body6:                                        ; preds = %for.body6, %for.cond.cleanup
+  %indvars.iv = phi i64 [ 0, %for.cond.cleanup ], [ %indvars.iv.next, %for.body6 ]
+  %s.022 = phi i32 [ 0, %for.cond.cleanup ], [ %add, %for.body6 ]
+  %arrayidx8 = getelementptr inbounds [1600 x i32], [1600 x i32]* %c, i64 0, i64 %indvars.iv
+  %3 = load i32, i32* %arrayidx8, align 4
+  %add = add i32 %3, %s.022
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1600
+  br i1 %exitcond, label %for.cond.cleanup5, label %for.body6
+}
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.start(i64, i8* nocapture) #1
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.end(i64, i8* nocapture) #1
+
+declare signext i32 @bar(i32*, i32*) #2
+
+attributes #0 = { nounwind "target-cpu"="a2q" "target-features"="+qpx,-altivec,-bpermd,-crypto,-direct-move,-extdiv,-power8-vector,-vsx" }
+attributes #1 = { argmemonly nounwind }
+attributes #2 = { "target-cpu"="a2q" "target-features"="+qpx,-altivec,-bpermd,-crypto,-direct-move,-extdiv,-power8-vector,-vsx" }
+attributes #3 = { nounwind }
+

Added: llvm/trunk/test/Transforms/LoopVectorize/PowerPC/vsx-tsvc-s173.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/PowerPC/vsx-tsvc-s173.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/PowerPC/vsx-tsvc-s173.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/PowerPC/vsx-tsvc-s173.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,51 @@
+; RUN: opt < %s -mcpu=pwr7 -mattr=+vsx -loop-vectorize -instcombine -S | FileCheck %s
+target datalayout = "E-m:e-i64:64-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+%struct.GlobalData = type { [32000 x float], [3 x i32], [4 x i8], [32000 x float], [5 x i32], [12 x i8], [32000 x float], [7 x i32], [4 x i8], [32000 x float], [11 x i32], [4 x i8], [32000 x float], [13 x i32], [12 x i8], [256 x [256 x float]], [17 x i32], [12 x i8], [256 x [256 x float]], [19 x i32], [4 x i8], [256 x [256 x float]], [23 x i32], [4 x i8], [256 x [256 x float]] }
+
+ at global_data = external global %struct.GlobalData, align 16
+ at ntimes = external hidden unnamed_addr global i32, align 4
+
+define signext i32 @s173() #0 {
+entry:
+  %0 = load i32, i32* @ntimes, align 4
+  %cmp21 = icmp sgt i32 %0, 0
+  br i1 %cmp21, label %for.cond1.preheader, label %for.end12
+
+for.cond1.preheader:                              ; preds = %for.end, %entry
+  %nl.022 = phi i32 [ %inc11, %for.end ], [ 0, %entry ]
+  br label %for.body3
+
+for.body3:                                        ; preds = %for.body3, %for.cond1.preheader
+  %indvars.iv = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next, %for.body3 ]
+  %arrayidx = getelementptr inbounds %struct.GlobalData, %struct.GlobalData* @global_data, i64 0, i32 0, i64 %indvars.iv
+  %1 = load float, float* %arrayidx, align 4
+  %arrayidx5 = getelementptr inbounds %struct.GlobalData, %struct.GlobalData* @global_data, i64 0, i32 3, i64 %indvars.iv
+  %2 = load float, float* %arrayidx5, align 4
+  %add = fadd float %1, %2
+  %3 = add nsw i64 %indvars.iv, 16000
+  %arrayidx8 = getelementptr inbounds %struct.GlobalData, %struct.GlobalData* @global_data, i64 0, i32 0, i64 %3
+  store float %add, float* %arrayidx8, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 16000
+  br i1 %exitcond, label %for.end, label %for.body3
+
+for.end:                                          ; preds = %for.body3
+  %inc11 = add nsw i32 %nl.022, 1
+  %4 = load i32, i32* @ntimes, align 4
+  %mul = mul nsw i32 %4, 10
+  %cmp = icmp slt i32 %inc11, %mul
+  br i1 %cmp, label %for.cond1.preheader, label %for.end12
+
+for.end12:                                        ; preds = %for.end, %entry
+  ret i32 0
+
+; CHECK-LABEL: @s173
+; CHECK: load <4 x float>, <4 x float>*
+; CHECK: add nsw i64 %index, 16000
+; CHECK: ret i32 0
+}
+
+attributes #0 = { nounwind }
+

Added: llvm/trunk/test/Transforms/LoopVectorize/SystemZ/addressing.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/SystemZ/addressing.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/SystemZ/addressing.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/SystemZ/addressing.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,72 @@
+; RUN: opt -S  -mtriple=s390x-unknown-linux -mcpu=z13 -loop-vectorize -dce \
+; RUN:   -instcombine -force-vector-width=2  < %s | FileCheck %s
+;
+; Test that the loop vectorizer does not generate vector addresses that must then
+; always be extracted.
+
+; Check that the address for a scalarized memory access is not extracted
+; from a vector register.
+define i32 @foo(i32* nocapture %A) {
+;CHECK-LABEL: @foo(
+;CHECK:  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+;CHECK:  %0 = shl nsw i64 %index, 2
+;CHECK:  %1 = shl i64 %index, 2
+;CHECK:  %2 = or i64 %1, 4
+;CHECK:  %3 = getelementptr inbounds i32, i32* %A, i64 %0
+;CHECK:  %4 = getelementptr inbounds i32, i32* %A, i64 %2
+;CHECK:  store i32 4, i32* %3, align 4
+;CHECK:  store i32 4, i32* %4, align 4
+
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %0 = shl nsw i64 %indvars.iv, 2
+  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %0
+  store i32 4, i32* %arrayidx, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 10000
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret i32 undef
+}
+
+
+; Check that a load of an address is scalarized.
+define i32 @foo1(i32* nocapture noalias %A, i32** nocapture %PtrPtr) {
+;CHECK-LABEL: @foo1(
+;CHECK:  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+;CHECK:  %0 = or i64 %index, 1
+;CHECK:  %1 = getelementptr inbounds i32*, i32** %PtrPtr, i64 %index
+;CHECK:  %2 = getelementptr inbounds i32*, i32** %PtrPtr, i64 %0
+;CHECK:  %3 = load i32*, i32** %1, align 8
+;CHECK:  %4 = load i32*, i32** %2, align 8
+;CHECK:  %5 = load i32, i32* %3, align 4
+;CHECK:  %6 = load i32, i32* %4, align 4
+;CHECK:  %7 = insertelement <2 x i32> undef, i32 %5, i32 0
+;CHECK:  %8 = insertelement <2 x i32> %7, i32 %6, i32 1
+;CHECK:  %9 = getelementptr inbounds i32, i32* %A, i64 %index
+;CHECK:  %10 = bitcast i32* %9 to <2 x i32>*
+;CHECK:  store <2 x i32> %8, <2 x i32>* %10, align 4
+
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %ptr = getelementptr inbounds i32*, i32** %PtrPtr, i64 %indvars.iv
+  %el = load i32*, i32** %ptr
+  %v = load i32, i32* %el
+  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+  store i32 %v, i32* %arrayidx, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 10000
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret i32 undef
+}
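For reference, the two kernels above roughly correspond to the following C loops (a hedged reconstruction from the IR; the function names are invented, and the trip count of 10000 comes from the exit comparisons):

  /* Sketch of @foo: the store address strides by 4 elements, so the address
     computation should stay scalar rather than be extracted from a vector
     of pointers. */
  void foo_sketch(int *A) {
    for (long i = 0; i < 10000; i++)
      A[4 * i] = 4;
  }

  /* Sketch of @foo1: the pointer itself is loaded from memory, so the
     dependent load is scalarized while the final store stays vectorizable. */
  void foo1_sketch(int *A, int **PtrPtr) {
    for (long i = 0; i < 10000; i++)
      A[i] = *PtrPtr[i];
  }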

Added: llvm/trunk/test/Transforms/LoopVectorize/SystemZ/branch-for-predicated-block.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/SystemZ/branch-for-predicated-block.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/SystemZ/branch-for-predicated-block.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/SystemZ/branch-for-predicated-block.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,38 @@
+; REQUIRES: asserts
+; RUN: opt -mtriple=s390x-unknown-linux -mcpu=z13 -loop-vectorize \
+; RUN:   -force-vector-width=2 -debug-only=loop-vectorize \
+; RUN:   -disable-output < %s 2>&1 | FileCheck %s
+
+; Check costs for branches inside a vectorized loop around predicated
+; blocks. Each such branch will be guarded with an extractelement from the
+; vector compare plus a test under mask instruction. This cost is modelled on
+; the extractelement of i1.
+
+define void @fun(i32* %arr, i64 %trip.count) {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i32, i32* %arr, i64 %indvars.iv
+  %l = load i32, i32* %arrayidx, align 4
+  %cmp55 = icmp sgt i32 %l, 0
+  br i1 %cmp55, label %if.then, label %for.inc
+
+if.then:
+  %sub = sub nsw i32 0, %l
+  store i32 %sub, i32* %arrayidx, align 4
+  br label %for.inc
+
+for.inc:
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %trip.count
+  br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:
+  ret void
+
+; CHECK: LV: Found an estimated cost of 5 for VF 2 For instruction:   br i1 %cmp55, label %if.then, label %for.inc
+; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction:   br label %for.inc
+; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction:   br i1 %exitcond, label %for.end.loopexit, label %for.body
+}
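Roughly, @fun above negates the positive elements of the array; a hedged C sketch (names invented) of the loop whose branch cost is being checked:

  /* Sketch of @fun: the if-body becomes a predicated block, and the branch
     guarding it is the instruction whose cost (5 for VF 2) is checked above. */
  void fun_sketch(int *arr, long trip_count) {
    for (long i = 0; i < trip_count; i++)
      if (arr[i] > 0)
        arr[i] = -arr[i];
  }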

Added: llvm/trunk/test/Transforms/LoopVectorize/SystemZ/lit.local.cfg
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/SystemZ/lit.local.cfg?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/SystemZ/lit.local.cfg (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/SystemZ/lit.local.cfg Tue Apr 16 21:52:47 2019
@@ -0,0 +1,2 @@
+if not 'SystemZ' in config.root.targets:
+    config.unsupported = True

Added: llvm/trunk/test/Transforms/LoopVectorize/SystemZ/load-scalarization-cost-0.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/SystemZ/load-scalarization-cost-0.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/SystemZ/load-scalarization-cost-0.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/SystemZ/load-scalarization-cost-0.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,27 @@
+; RUN: opt -mtriple=s390x-unknown-linux -mcpu=z13 -loop-vectorize \
+; RUN:   -force-vector-width=2 -debug-only=loop-vectorize \
+; RUN:   -disable-output < %s 2>&1 | FileCheck %s
+; REQUIRES: asserts
+;
+; Check that a scalarized load does not get operand scalarization costs added.
+
+define void @fun(i64* %data, i64 %n, i64 %s, double* %Src) {
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %mul = mul nsw i64 %iv, %s
+  %gep = getelementptr inbounds double, double* %Src, i64 %mul
+  %bct = bitcast double* %gep to i64*
+  %ld = load i64, i64* %bct
+  %iv.next = add nuw nsw i64 %iv, 1
+  %cmp110.us = icmp slt i64 %iv.next, %n
+  br i1 %cmp110.us, label %for.body, label %for.end
+
+for.end:
+  ret void
+
+; CHECK: LV: Found an estimated cost of 2 for VF 2 For instruction:   %mul = mul nsw i64 %iv, %s
+; CHECK: LV: Found an estimated cost of 2 for VF 2 For instruction:   %ld = load i64, i64* %bct
+}

Added: llvm/trunk/test/Transforms/LoopVectorize/SystemZ/load-scalarization-cost-1.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/SystemZ/load-scalarization-cost-1.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/SystemZ/load-scalarization-cost-1.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/SystemZ/load-scalarization-cost-1.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,28 @@
+; RUN: opt -mtriple=s390x-unknown-linux -mcpu=z13 -loop-vectorize \
+; RUN:   -force-vector-width=4 -debug-only=loop-vectorize \
+; RUN:   -enable-interleaved-mem-accesses=false -disable-output < %s 2>&1 \
+; RUN:   | FileCheck %s
+; REQUIRES: asserts
+;
+; Check that a scalarized load does not get a zero cost in a vectorized
+; loop. It can only be folded into the add operand in the scalar loop.
+
+define i32 @fun(i64* %data, i64 %n, i64 %s, i32* %Src) {
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %acc = phi i32 [ 0, %entry ], [ %acc_next, %for.body ]
+  %gep = getelementptr inbounds i32, i32* %Src, i64 %iv
+  %ld = load i32, i32* %gep
+  %acc_next = add i32 %acc, %ld
+  %iv.next = add nuw nsw i64 %iv, 2
+  %cmp110.us = icmp slt i64 %iv.next, %n
+  br i1 %cmp110.us, label %for.body, label %for.end
+
+for.end:
+  ret i32 %acc_next
+
+; CHECK: Found an estimated cost of 4 for VF 4 For instruction:   %ld = load i32, i32* %gep
+}
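A hedged C sketch of @fun above (names invented); the point is that the stride-2 load can only be folded into the add in the scalar loop, so it must keep a non-zero cost when vectorized:

  /* Sketch of @fun: a stride-2 reduction over Src. */
  int fun_sketch(long n, int *Src) {
    int acc = 0;
    for (long i = 0; i < n; i += 2)
      acc += Src[i];
    return acc;
  }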

Added: llvm/trunk/test/Transforms/LoopVectorize/SystemZ/load-store-scalarization-cost.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/SystemZ/load-store-scalarization-cost.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/SystemZ/load-store-scalarization-cost.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/SystemZ/load-store-scalarization-cost.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,33 @@
+; REQUIRES: asserts
+; RUN: opt -mtriple=s390x-unknown-linux -mcpu=z13 -loop-vectorize \
+; RUN:   -force-vector-width=4 -debug-only=loop-vectorize \
+; RUN:   -disable-output -enable-interleaved-mem-accesses=false < %s 2>&1 | \
+; RUN:   FileCheck %s
+;
+; Check that a scalarized load/store does not get a cost for inserts/
+; extracts, since z13 supports element load/store.
+
+define void @fun(i32* %data, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %tmp0 = getelementptr inbounds i32, i32* %data, i64 %i
+  %tmp1 = load i32, i32* %tmp0, align 4
+  %tmp2 = add i32 %tmp1, 1
+  store i32 %tmp2, i32* %tmp0, align 4
+  %i.next = add nuw nsw i64 %i, 2
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+
+; CHECK: LV: Scalarizing:  %tmp1 = load i32, i32* %tmp0, align 4
+; CHECK: LV: Scalarizing:  store i32 %tmp2, i32* %tmp0, align 4
+
+; CHECK: LV: Found an estimated cost of 4 for VF 4 For instruction:   %tmp1 = load i32, i32* %tmp0, align 4
+; CHECK: LV: Found an estimated cost of 4 for VF 4 For instruction:   store i32 %tmp2, i32* %tmp0, align 4
+}
+
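A hedged C sketch of @fun above (names invented): a stride-2 read-modify-write, where both the load and the store are scalarized but, on z13, need no insert/extract overhead thanks to element load/store instructions:

  /* Sketch of @fun: stride-2 in-place increment. */
  void fun_sketch(int *data, long n) {
    for (long i = 0; i < n; i += 2)
      data[i] += 1;
  }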

Added: llvm/trunk/test/Transforms/LoopVectorize/SystemZ/mem-interleaving-costs-02.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/SystemZ/mem-interleaving-costs-02.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/SystemZ/mem-interleaving-costs-02.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/SystemZ/mem-interleaving-costs-02.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,149 @@
+; REQUIRES: asserts
+; RUN: opt -mtriple=s390x-unknown-linux -mcpu=z13 -loop-vectorize \
+; RUN:   -debug-only=loop-vectorize,vectorutils -max-interleave-group-factor=64\
+; RUN:   -disable-output < %s 2>&1 | FileCheck %s
+;
+; Check that some cost estimations for interleave groups make sense.
+
+; This loop is loading four i16 values at indices [0, 1, 2, 3], with a stride
+; of 4. At VF=4, memory interleaving means loading 4 * 4 * 16 bits = 256 bits,
+; i.e. 2 vector registers. Each of the 4 vector values must then be constructed
+; from the two vector registers using one vperm each, which gives a cost of
+; 2 + 4 = 6.
+;
+; CHECK: LV: Checking a loop in "fun0"
+; CHECK: LV: Found an estimated cost of 6 for VF 4 For instruction:   %ld0 = load i16
+; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction:   %ld1 = load i16
+; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction:   %ld2 = load i16
+; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction:   %ld3 = load i16
+define void @fun0(i16 *%ptr, i16 *%dst) {
+entry:
+  br label %for.body
+
+for.body:
+  %ivptr = phi i16* [ %ptr.next, %for.body ], [ %ptr, %entry ]
+  %iv = phi i64 [ %inc, %for.body ], [ 0, %entry ]
+  %inc = add i64 %iv, 4
+  %ptr0 = getelementptr inbounds i16, i16* %ivptr, i64 0
+  %ld0 = load i16, i16* %ptr0
+  %ptr1 = getelementptr inbounds i16, i16* %ivptr, i64 1
+  %ld1 = load i16, i16* %ptr1
+  %ptr2 = getelementptr inbounds i16, i16* %ivptr, i64 2
+  %ld2 = load i16, i16* %ptr2
+  %ptr3 = getelementptr inbounds i16, i16* %ivptr, i64 3
+  %ld3 = load i16, i16* %ptr3
+  %a1 = add i16 %ld0, %ld1
+  %a2 = add i16 %a1, %ld2
+  %a3 = add i16 %a2, %ld3
+  %dstptr = getelementptr inbounds i16, i16* %dst, i64 %iv
+  store i16 %a3, i16* %dstptr
+  %ptr.next = getelementptr inbounds i16, i16* %ivptr, i64 4
+  %cmp = icmp eq i64 %inc, 100
+  br i1 %cmp, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+; This loop loads one i8 value with a stride of 3. At VF=16, this means loading
+; 3 vector registers, and then constructing the vector value with two vperms,
+; which gives a cost of 5.
+;
+; CHECK: LV: Checking a loop in "fun1"
+; CHECK: LV: Found an estimated cost of 5 for VF 16 For instruction:   %ld0 = load i8
+define void @fun1(i8 *%ptr, i8 *%dst) {
+entry:
+  br label %for.body
+
+for.body:
+  %ivptr = phi i8* [ %ptr.next, %for.body ], [ %ptr, %entry ]
+  %iv = phi i64 [ %inc, %for.body ], [ 0, %entry ]
+  %inc = add i64 %iv, 4
+  %ptr0 = getelementptr inbounds i8, i8* %ivptr, i64 0
+  %ld0 = load i8, i8* %ptr0
+  %dstptr = getelementptr inbounds i8, i8* %dst, i64 %iv
+  store i8 %ld0, i8* %dstptr
+  %ptr.next = getelementptr inbounds i8, i8* %ivptr, i64 3
+  %cmp = icmp eq i64 %inc, 100
+  br i1 %cmp, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+; This loop is loading 4 i8 values at indexes [0, 1, 2, 3], with a stride of
+; 32. At VF=2, this means loading 2 vector registers, and using 4 vperms to
+; produce the vector values, which gives a cost of 6.
+;
+; CHECK: LV: Checking a loop in "fun2"
+; CHECK: LV: Found an estimated cost of 6 for VF 2 For instruction:   %ld0 = load i8
+; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction:   %ld1 = load i8
+; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction:   %ld2 = load i8
+; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction:   %ld3 = load i8
+define void @fun2(i8 *%ptr, i8 *%dst) {
+entry:
+  br label %for.body
+
+for.body:
+  %ivptr = phi i8* [ %ptr.next, %for.body ], [ %ptr, %entry ]
+  %iv = phi i64 [ %inc, %for.body ], [ 0, %entry ]
+  %inc = add i64 %iv, 4
+  %ptr0 = getelementptr inbounds i8, i8* %ivptr, i64 0
+  %ld0 = load i8, i8* %ptr0
+  %ptr1 = getelementptr inbounds i8, i8* %ivptr, i64 1
+  %ld1 = load i8, i8* %ptr1
+  %ptr2 = getelementptr inbounds i8, i8* %ivptr, i64 2
+  %ld2 = load i8, i8* %ptr2
+  %ptr3 = getelementptr inbounds i8, i8* %ivptr, i64 3
+  %ld3 = load i8, i8* %ptr3
+  %a1 = add i8 %ld0, %ld1
+  %a2 = add i8 %a1, %ld2
+  %a3 = add i8 %a2, %ld3
+  %dstptr = getelementptr inbounds i8, i8* %dst, i64 %iv
+  store i8 %a3, i8* %dstptr
+  %ptr.next = getelementptr inbounds i8, i8* %ivptr, i64 32
+  %cmp = icmp eq i64 %inc, 100
+  br i1 %cmp, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+; This loop is loading 4 i8 values at indexes [0, 1, 2, 3], with a stride of
+; 30. At VF=2, this means loading 3 vector registers, and using 4 vperms to
+; produce the vector values, which gives a cost of 7. This is the same loop
+; as in fun2, except the stride makes the second iteration's values overlap a
+; vector register boundary.
+;
+; CHECK: LV: Checking a loop in "fun3"
+; CHECK: LV: Found an estimated cost of 7 for VF 2 For instruction:   %ld0 = load i8
+; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction:   %ld1 = load i8
+; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction:   %ld2 = load i8
+; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction:   %ld3 = load i8
+define void @fun3(i8 *%ptr, i8 *%dst) {
+entry:
+  br label %for.body
+
+for.body:
+  %ivptr = phi i8* [ %ptr.next, %for.body ], [ %ptr, %entry ]
+  %iv = phi i64 [ %inc, %for.body ], [ 0, %entry ]
+  %inc = add i64 %iv, 4
+  %ptr0 = getelementptr inbounds i8, i8* %ivptr, i64 0
+  %ld0 = load i8, i8* %ptr0
+  %ptr1 = getelementptr inbounds i8, i8* %ivptr, i64 1
+  %ld1 = load i8, i8* %ptr1
+  %ptr2 = getelementptr inbounds i8, i8* %ivptr, i64 2
+  %ld2 = load i8, i8* %ptr2
+  %ptr3 = getelementptr inbounds i8, i8* %ivptr, i64 3
+  %ld3 = load i8, i8* %ptr3
+  %a1 = add i8 %ld0, %ld1
+  %a2 = add i8 %a1, %ld2
+  %a3 = add i8 %a2, %ld3
+  %dstptr = getelementptr inbounds i8, i8* %dst, i64 %iv
+  store i8 %a3, i8* %dstptr
+  %ptr.next = getelementptr inbounds i8, i8* %ivptr, i64 30
+  %cmp = icmp eq i64 %inc, 100
+  br i1 %cmp, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
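For orientation, @fun0 above is roughly the following C loop (a hedged reconstruction; names invented). @fun1 through @fun3 only vary the number of loaded fields and the stride, which changes how many vector registers the interleave group spans:

  /* Sketch of @fun0: four adjacent i16 fields are read per iteration with a
     stride of 4 elements, forming one interleave group of 4 loads. */
  void fun0_sketch(short *ptr, short *dst) {
    for (long i = 0; i != 100; i += 4, ptr += 4)
      dst[i] = ptr[0] + ptr[1] + ptr[2] + ptr[3];
  }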

Added: llvm/trunk/test/Transforms/LoopVectorize/SystemZ/mem-interleaving-costs.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/SystemZ/mem-interleaving-costs.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/SystemZ/mem-interleaving-costs.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/SystemZ/mem-interleaving-costs.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,70 @@
+; REQUIRES: asserts
+; RUN: opt -mtriple=s390x-unknown-linux -mcpu=z13 -loop-vectorize \
+; RUN:   -force-vector-width=4 -debug-only=loop-vectorize,vectorutils \
+; RUN:   -disable-output < %s 2>&1 | FileCheck %s
+;
+; Check that the loop vectorizer performs memory interleaving with accurate
+; cost estimations.
+
+
+; Simple case where just the load is interleaved, because the store group
+; would have gaps.
+define void @fun0(i32* %data, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %tmp0 = getelementptr inbounds i32, i32* %data, i64 %i
+  %tmp1 = load i32, i32* %tmp0, align 4
+  %tmp2 = add i32 %tmp1, 1
+  store i32 %tmp2, i32* %tmp0, align 4
+  %i.next = add nuw nsw i64 %i, 2
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+
+; CHECK: LV: Creating an interleave group with:  %tmp1 = load i32, i32* %tmp0, align 4
+; CHECK: LV: Found an estimated cost of 3 for VF 4 For instruction:   %tmp1 = load i32, i32* %tmp0, align 4
+;        (vl; vl; vperm)
+}
+
+; Interleaving of both load and stores.
+define void @fun1(i32* %data, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %tmp0 = getelementptr inbounds i32, i32* %data, i64 %i
+  %tmp1 = load i32, i32* %tmp0, align 4
+  %i_1  = add i64 %i, 1
+  %tmp2 = getelementptr inbounds i32, i32* %data, i64 %i_1
+  %tmp3 = load i32, i32* %tmp2, align 4
+  store i32 %tmp1, i32* %tmp2, align 4
+  store i32 %tmp3, i32* %tmp0, align 4
+  %i.next = add nuw nsw i64 %i, 2
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+
+; CHECK: LV: Creating an interleave group with:  store i32 %tmp3, i32* %tmp0, align 4
+; CHECK: LV: Inserted:  store i32 %tmp1, i32* %tmp2, align 4
+; CHECK:     into the interleave group with  store i32 %tmp3, i32* %tmp0, align 4
+; CHECK: LV: Creating an interleave group with:  %tmp3 = load i32, i32* %tmp2, align 4
+; CHECK: LV: Inserted:  %tmp1 = load i32, i32* %tmp0, align 4
+; CHECK:     into the interleave group with  %tmp3 = load i32, i32* %tmp2, align 4
+
+; CHECK: LV: Found an estimated cost of 4 for VF 4 For instruction:   %tmp1 = load i32, i32* %tmp0, align 4
+; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction:   %tmp3 = load i32, i32* %tmp2, align 4
+;            (vl; vl; vperm, vpkg)
+
+; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction:   store i32 %tmp1, i32* %tmp2, align 4
+; CHECK: LV: Found an estimated cost of 4 for VF 4 For instruction:   store i32 %tmp3, i32* %tmp0, align 4
+;            (vmrlf; vmrhf; vst; vst)
+}
+
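A hedged C sketch of @fun1 above (names invented): adjacent elements are swapped, so both the two loads and the two stores form full interleave groups, while @fun0 only interleaves the load because its stride-2 store group would have a gap:

  /* Sketch of @fun1: swap each even/odd pair in place. */
  void fun1_sketch(int *data, long n) {
    for (long i = 0; i < n; i += 2) {
      int even = data[i];
      int odd  = data[i + 1];
      data[i]     = odd;
      data[i + 1] = even;
    }
  }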

Added: llvm/trunk/test/Transforms/LoopVectorize/SystemZ/pr38110.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/SystemZ/pr38110.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/SystemZ/pr38110.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/SystemZ/pr38110.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,50 @@
+; RUN: opt -passes='loop-vectorize' -mcpu=z13 -force-vector-width=2 -S < %s | FileCheck %s
+;
+; Forcing VF=2 to trigger vector code gen
+;
+; This is a test case to exercise more cases in truncateToMinimalBitWidths().
+; Test passes if vector code is generated w/o hitting llvm_unreachable().
+;
+; Performing a minimal check on the output to ensure the loop is actually
+; vectorized.
+;
+; CHECK: vector.body
+
+target datalayout = "E-m:e-i1:8:16-i8:8:16-i64:64-f128:64-v128:64-a:8:16-n32:64"
+target triple = "s390x-ibm-linux"
+
+define void @test(i32 zeroext %width, i8* nocapture %row, i16 zeroext %src, i16* nocapture readonly %dst) {
+entry:
+  %cmp10 = icmp eq i32 %width, 0
+  br i1 %cmp10, label %for.end, label %for.body.lr.ph
+
+for.body.lr.ph:                                   ; preds = %entry
+  %conv1 = zext i16 %src to i32
+  br label %for.body
+
+for.body:                                         ; preds = %for.inc, %for.body.lr.ph
+  %i.012 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.inc ]
+  %sp.011 = phi i8* [ %row, %for.body.lr.ph ], [ %incdec.ptr, %for.inc ]
+  %0 = load i8, i8* %sp.011, align 1
+  %conv = zext i8 %0 to i32
+  %cmp2 = icmp eq i32 %conv, %conv1
+  br i1 %cmp2, label %if.then, label %for.inc
+
+if.then:                                          ; preds = %for.body
+  %1 = load i16, i16* %dst, align 2
+  %conv4 = trunc i16 %1 to i8
+  store i8 %conv4, i8* %sp.011, align 1
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body, %if.then
+  %inc = add nuw i32 %i.012, 1
+  %incdec.ptr = getelementptr inbounds i8, i8* %sp.011, i64 1
+  %exitcond = icmp eq i32 %inc, %width
+  br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:                                 ; preds = %for.inc
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %entry
+  ret void
+}

Added: llvm/trunk/test/Transforms/LoopVectorize/X86/already-vectorized.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/X86/already-vectorized.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/X86/already-vectorized.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/X86/already-vectorized.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,46 @@
+; RUN: opt < %s -disable-loop-unrolling -debug-only=loop-vectorize -O3 -S 2>&1 | FileCheck %s
+; REQUIRES: asserts
+; We want to make sure that we don't even try to vectorize loops again.
+; The vectorizer used to mark only the un-vectorized loop as already vectorized,
+; thus trying to vectorize the vectorized loop again.
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+ at a = external global [255 x i32]
+
+; Function Attrs: nounwind readonly uwtable
+define i32 @vect() {
+; CHECK: LV: Checking a loop in "vect"
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+; We need to make sure we did vectorize the loop
+; CHECK: LV: Found a loop: for.body
+; CHECK: LV: We can vectorize this loop!
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %red.05 = phi i32 [ 0, %entry ], [ %add, %for.body ]
+  %arrayidx = getelementptr inbounds [255 x i32], [255 x i32]* @a, i64 0, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %add = add nsw i32 %0, %red.05
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 255
+  br i1 %exitcond, label %for.end, label %for.body
+
+; If it did, we have two loops:
+; CHECK: vector.body:
+; CHECK: br {{.*}} label %vector.body, !llvm.loop [[vect:![0-9]+]]
+; CHECK: for.body:
+; CHECK: br {{.*}} label %for.body, !llvm.loop [[scalar:![0-9]+]]
+
+for.end:                                          ; preds = %for.body
+  ret i32 %add
+}
+
+; Now, we check for the Hint metadata
+; CHECK: [[vect]] = distinct !{[[vect]], [[width:![0-9]+]]}
+; CHECK: [[width]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[scalar]] = distinct !{[[scalar]], [[runtime_unroll:![0-9]+]], [[width]]}
+; CHECK: [[runtime_unroll]] = !{!"llvm.loop.unroll.runtime.disable"}
+

Added: llvm/trunk/test/Transforms/LoopVectorize/X86/assume.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/X86/assume.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/X86/assume.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/X86/assume.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,100 @@
+; RUN: opt < %s  -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 -S | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Function Attrs: nounwind uwtable
+define void @test1(float* noalias nocapture %a, float* noalias nocapture readonly %b) #0 {
+entry:
+  br label %for.body
+
+; CHECK-LABEL: @test1
+; CHECK: vector.body:
+; CHECK: @llvm.assume
+; CHECK: @llvm.assume
+; CHECK: @llvm.assume
+; CHECK: @llvm.assume
+; CHECK: @llvm.assume
+; CHECK: @llvm.assume
+; CHECK: @llvm.assume
+; CHECK: @llvm.assume
+; CHECK: for.body:
+; CHECK: ret void
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %b, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %cmp1 = fcmp ogt float %0, 1.000000e+02
+  tail call void @llvm.assume(i1 %cmp1)
+  %add = fadd float %0, 1.000000e+00
+  %arrayidx5 = getelementptr inbounds float, float* %a, i64 %indvars.iv
+  store float %add, float* %arrayidx5, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv, 1599
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+; Function Attrs: nounwind
+declare void @llvm.assume(i1) #1
+
+attributes #0 = { nounwind uwtable }
+attributes #1 = { nounwind }
+
+%struct.data = type { float*, float* }
+
+; Function Attrs: nounwind uwtable
+define void @test2(%struct.data* nocapture readonly %d) #0 {
+entry:
+  %b = getelementptr inbounds %struct.data, %struct.data* %d, i64 0, i32 1
+  %0 = load float*, float** %b, align 8
+  %ptrint = ptrtoint float* %0 to i64
+  %maskedptr = and i64 %ptrint, 31
+  %maskcond = icmp eq i64 %maskedptr, 0
+  %a = getelementptr inbounds %struct.data, %struct.data* %d, i64 0, i32 0
+  %1 = load float*, float** %a, align 8
+  %ptrint2 = ptrtoint float* %1 to i64
+  %maskedptr3 = and i64 %ptrint2, 31
+  %maskcond4 = icmp eq i64 %maskedptr3, 0
+  br label %for.body
+
+; CHECK-LABEL: @test2
+; CHECK: vector.body:
+; CHECK: @llvm.assume
+; CHECK: @llvm.assume
+; CHECK: @llvm.assume
+; CHECK: @llvm.assume
+; CHECK: @llvm.assume
+; CHECK: @llvm.assume
+; CHECK: @llvm.assume
+; CHECK: @llvm.assume
+; CHECK: @llvm.assume
+; CHECK: @llvm.assume
+; CHECK: @llvm.assume
+; CHECK: @llvm.assume
+; CHECK: @llvm.assume
+; CHECK: @llvm.assume
+; CHECK: @llvm.assume
+; CHECK: @llvm.assume
+; CHECK: for.body:
+; CHECK: ret void
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  tail call void @llvm.assume(i1 %maskcond)
+  %arrayidx = getelementptr inbounds float, float* %0, i64 %indvars.iv
+  %2 = load float, float* %arrayidx, align 4
+  %add = fadd float %2, 1.000000e+00
+  tail call void @llvm.assume(i1 %maskcond4)
+  %arrayidx5 = getelementptr inbounds float, float* %1, i64 %indvars.iv
+  store float %add, float* %arrayidx5, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv, 1599
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+

Added: llvm/trunk/test/Transforms/LoopVectorize/X86/avx1.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/X86/avx1.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/X86/avx1.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/X86/avx1.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,52 @@
+; RUN: opt < %s  -loop-vectorize -mattr=avx,+slow-unaligned-mem-32 -S | FileCheck %s --check-prefix=SLOWMEM32 --check-prefix=CHECK
+; RUN: opt < %s  -loop-vectorize -mattr=avx,-slow-unaligned-mem-32 -S | FileCheck %s --check-prefix=FASTMEM32 --check-prefix=CHECK
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+; CHECK-LABEL: @read_mod_write_single_ptr(
+; CHECK: load <8 x float>
+; CHECK: ret i32
+define i32 @read_mod_write_single_ptr(float* nocapture %a, i32 %n) nounwind uwtable ssp {
+  %1 = icmp sgt i32 %n, 0
+  br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph:                                           ; preds = %0, %.lr.ph
+  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
+  %2 = getelementptr inbounds float, float* %a, i64 %indvars.iv
+  %3 = load float, float* %2, align 4
+  %4 = fmul float %3, 3.000000e+00
+  store float %4, float* %2, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge:                                      ; preds = %.lr.ph, %0
+  ret i32 undef
+}
+
+
+; CHECK-LABEL: @read_mod_i64(
+; SLOWMEM32: load <2 x i64>
+; FASTMEM32: load <4 x i64>
+; CHECK: ret i32
+define i32 @read_mod_i64(i64* nocapture %a, i32 %n) nounwind uwtable ssp {
+  %1 = icmp sgt i32 %n, 0
+  br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph:                                           ; preds = %0, %.lr.ph
+  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
+  %2 = getelementptr inbounds i64, i64* %a, i64 %indvars.iv
+  %3 = load i64, i64* %2, align 4
+  %4 = add i64 %3, 3
+  store i64 %4, i64* %2, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge:                                      ; preds = %.lr.ph, %0
+  ret i32 undef
+}
+

Added: llvm/trunk/test/Transforms/LoopVectorize/X86/avx512.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/X86/avx512.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/X86/avx512.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/X86/avx512.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,112 @@
+; RUN: opt -mattr=+avx512f --loop-vectorize -S < %s | llc -mattr=+avx512f | FileCheck %s
+; RUN: opt -mattr=+avx512vl,+prefer-256-bit --loop-vectorize -S < %s | llc -mattr=+avx512f | FileCheck %s --check-prefix=CHECK-PREFER-AVX256
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.9.0"
+
+; Verify that we generate 512-bit wide vectors for a basic integer memset
+; loop.
+
+; CHECK-LABEL: f:
+; CHECK: vmovdqu64 %zmm{{.}},
+; CHECK-NOT: %ymm
+
+; Verify that we don't generate 512-bit wide vectors when the subtarget feature says not to
+
+; CHECK-PREFER-AVX256-LABEL: f:
+; CHECK-PREFER-AVX256: vmovdqu %ymm{{.}},
+; CHECK-PREFER-AVX256-NOT: %zmm
+
+define void @f(i32* %a, i32 %n) {
+entry:
+  %cmp4 = icmp sgt i32 %n, 0
+  br i1 %cmp4, label %for.body.preheader, label %for.end
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+  store i32 %n, i32* %arrayidx, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:                                 ; preds = %for.body
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %entry
+  ret void
+}
+
+; Verify that the "prefer-vector-width=256" attribute prevents the use of 512-bit
+; vectors
+
+; CHECK-LABEL: g:
+; CHECK: vmovdqu %ymm{{.}},
+; CHECK-NOT: %zmm
+
+; CHECK-PREFER-AVX256-LABEL: g:
+; CHECK-PREFER-AVX256: vmovdqu %ymm{{.}},
+; CHECK-PREFER-AVX256-NOT: %zmm
+
+define void @g(i32* %a, i32 %n) "prefer-vector-width"="256" {
+entry:
+  %cmp4 = icmp sgt i32 %n, 0
+  br i1 %cmp4, label %for.body.preheader, label %for.end
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+  store i32 %n, i32* %arrayidx, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:                                 ; preds = %for.body
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %entry
+  ret void
+}
+
+; Verify that the "prefer-vector-width=512" attribute override the subtarget
+; vectors
+
+; CHECK-LABEL: h:
+; CHECK: vmovdqu64 %zmm{{.}},
+; CHECK-NOT: %ymm
+
+; CHECK-PREFER-AVX256-LABEL: h:
+; CHECK-PREFER-AVX256: vmovdqu64 %zmm{{.}},
+; CHECK-PREFER-AVX256-NOT: %ymm
+
+define void @h(i32* %a, i32 %n) "prefer-vector-width"="512" {
+entry:
+  %cmp4 = icmp sgt i32 %n, 0
+  br i1 %cmp4, label %for.body.preheader, label %for.end
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+  store i32 %n, i32* %arrayidx, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:                                 ; preds = %for.body
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %entry
+  ret void
+}
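As a hedged aside (an assumption about the front end, not part of this test): the "prefer-vector-width" function attributes exercised above are what clang typically attaches when compiling a loop like the sketch below with -mprefer-vector-width=256 (or =512), while the loop body itself is shared by @f, @g and @h:

  /* Sketch of the memset-style loop in @f, @g and @h: store n into every
     element of a. Only the function attributes differ between the tests. */
  void memset_like(int *a, int n) {
    for (int i = 0; i < n; i++)
      a[i] = n;
  }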

Added: llvm/trunk/test/Transforms/LoopVectorize/X86/consecutive-ptr-cg-bug.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/X86/consecutive-ptr-cg-bug.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/X86/consecutive-ptr-cg-bug.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/X86/consecutive-ptr-cg-bug.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,108 @@
+; RUN: opt -loop-vectorize -S < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128-ni:1"
+target triple = "x86_64-unknown-linux-gnu"
+
+; PR34965/D39346
+
+; LV retains the original scalar loop intact as the remainder loop. However,
+; after this transformation, analysis information concerning the remainder
+; loop may differ from the original scalar loop. This test is an example of
+; that behaviour, where values inside the remainder loop which SCEV could
+; originally analyze now require flow-sensitive analysis currently not
+; supported in SCEV. In particular, during LV code generation, after turning
+; the original scalar loop into the remainder loop, LV expected
+; Legal->isConsecutivePtr() to be consistent and return the same output as
+; during legal/cost model phases (original scalar loop). Unfortunately, that
+; condition was not satisfied because of the aforementioned SCEV limitation.
+; After D39346, LV code generation doesn't rely on Legal->isConsecutivePtr(),
+; i.e., SCEV. This test verifies that LV is able to handle the described cases.
+;
+; TODO: The SCEV limitation described before may affect plans to further
+; optimize the remainder loop of this particular test case. One tentative
+; solution is to detect the problematic IVs in LV (%7 and %8) and perform an
+; in-place IV optimization by replacing:
+;   %8 = phi i32 [ %.ph2, %.outer ], [ %7, %6 ]
+; with
+;   %8 = sub i32 %7, 1.
+
+
+; Verify that the store is vectorized as a stride-1 memory access.
+
+; CHECK-LABEL: @test_01(
+; CHECK-NOT: vector.body:
+
+; This test was originally vectorized, but now SCEV is smart enough to prove
+; that its trip count is 1, so it gets ignored by the vectorizer.
+; Function Attrs: uwtable
+define void @test_01() {
+  br label %.outer
+
+; <label>:1:                                      ; preds = %2
+  ret void
+
+; <label>:2:                                      ; preds = %._crit_edge.loopexit
+  %3 = add nsw i32 %.ph, -2
+  br i1 undef, label %1, label %.outer
+
+.outer:                                           ; preds = %2, %0
+  %.ph = phi i32 [ %3, %2 ], [ 336, %0 ]
+  %.ph2 = phi i32 [ 62, %2 ], [ 110, %0 ]
+  %4 = and i32 %.ph, 30
+  %5 = add i32 %.ph2, 1
+  br label %6
+
+; <label>:6:                                      ; preds = %6, %.outer
+  %7 = phi i32 [ %5, %.outer ], [ %13, %6 ]
+  %8 = phi i32 [ %.ph2, %.outer ], [ %7, %6 ]
+  %9 = add i32 %8, 2
+  %10 = zext i32 %9 to i64
+  %11 = getelementptr inbounds i32, i32 addrspace(1)* undef, i64 %10
+  %12 = ashr i32 undef, %4
+  store i32 %12, i32 addrspace(1)* %11, align 4
+  %13 = add i32 %7, 1
+  %14 = icmp sgt i32 %13, 61
+  br i1 %14, label %._crit_edge.loopexit, label %6
+
+._crit_edge.loopexit:                             ; preds = %._crit_edge.loopexit, %6
+  br i1 undef, label %2, label %._crit_edge.loopexit
+}
+
+; After the trip count is increased, the test gets vectorized.
+; CHECK-LABEL: @test_02(
+; CHECK: vector.body:
+; CHECK: store <4 x i32>
+
+; Function Attrs: uwtable
+define void @test_02() {
+  br label %.outer
+
+; <label>:1:                                      ; preds = %2
+  ret void
+
+; <label>:2:                                      ; preds = %._crit_edge.loopexit
+  %3 = add nsw i32 %.ph, -2
+  br i1 undef, label %1, label %.outer
+
+.outer:                                           ; preds = %2, %0
+  %.ph = phi i32 [ %3, %2 ], [ 336, %0 ]
+  %.ph2 = phi i32 [ 62, %2 ], [ 110, %0 ]
+  %4 = and i32 %.ph, 30
+  %5 = add i32 %.ph2, 1
+  br label %6
+
+; <label>:6:                                      ; preds = %6, %.outer
+  %7 = phi i32 [ %5, %.outer ], [ %13, %6 ]
+  %8 = phi i32 [ %.ph2, %.outer ], [ %7, %6 ]
+  %9 = add i32 %8, 2
+  %10 = zext i32 %9 to i64
+  %11 = getelementptr inbounds i32, i32 addrspace(1)* undef, i64 %10
+  %12 = ashr i32 undef, %4
+  store i32 %12, i32 addrspace(1)* %11, align 4
+  %13 = add i32 %7, 1
+  %14 = icmp sgt i32 %13, 610
+  br i1 %14, label %._crit_edge.loopexit, label %6
+
+._crit_edge.loopexit:                             ; preds = %._crit_edge.loopexit, %6
+  br i1 undef, label %2, label %._crit_edge.loopexit
+}

Added: llvm/trunk/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,67 @@
+; REQUIRES: asserts
+; RUN: opt < %s -loop-vectorize -instcombine -S -debug-only=loop-vectorize -disable-output -print-after=instcombine 2>&1 | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; CHECK-LABEL: PR31671
+;
+; Check a pointer that has one consecutive-like use and another
+; non-consecutive-like use. In the test case below, %tmp3 is the
+; pointer operand of an interleaved load, making it consecutive-like. However,
+; it is also the pointer operand of a non-interleaved store that will become a
+; scatter operation. %tmp3 (and the induction variable) should not be marked
+; uniform-after-vectorization.
+;
+; CHECK:       LV: Found uniform instruction: %tmp0 = getelementptr inbounds %data, %data* %d, i64 0, i32 3, i64 %i
+; CHECK-NOT:   LV: Found uniform instruction: %tmp3 = getelementptr inbounds %data, %data* %d, i64 0, i32 0, i64 %i
+; CHECK-NOT:   LV: Found uniform instruction: %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+; CHECK-NOT:   LV: Found uniform instruction: %i.next = add nuw nsw i64 %i, 5
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x float> undef, float %x, i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x float> [[BROADCAST_SPLATINSERT]], <16 x float> undef, <16 x i32> zeroinitializer
+; CHECK-NEXT:    br label %vector.body
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <16 x i64> [ <i64 0, i64 5, i64 10, i64 15, i64 20, i64 25, i64 30, i64 35, i64 40, i64 45, i64 50, i64 55, i64 60, i64 65, i64 70, i64 75>, %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %vector.body ]
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 5
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds %data, %data* %d, i64 0, i32 3, i64 [[OFFSET_IDX]]
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[TMP0]] to <80 x float>*
+; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <80 x float>, <80 x float>* [[TMP1]], align 4
+; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <80 x float> [[WIDE_VEC]], <80 x float> undef, <16 x i32> <i32 0, i32 5, i32 10, i32 15, i32 20, i32 25, i32 30, i32 35, i32 40, i32 45, i32 50, i32 55, i32 60, i32 65, i32 70, i32 75>
+; CHECK-NEXT:    [[TMP2:%.*]] = fmul <16 x float> [[BROADCAST_SPLAT]], [[STRIDED_VEC]]
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds %data, %data* %d, i64 0, i32 0, <16 x i64> [[VEC_IND]]
+; CHECK-NEXT:    [[BC:%.*]] = bitcast <16 x float*> [[TMP3]] to <16 x <80 x float>*>
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <16 x <80 x float>*> [[BC]], i32 0
+; CHECK-NEXT:    [[WIDE_VEC1:%.*]] = load <80 x float>, <80 x float>* [[TMP4]], align 4
+; CHECK-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <80 x float> [[WIDE_VEC1]], <80 x float> undef, <16 x i32> <i32 0, i32 5, i32 10, i32 15, i32 20, i32 25, i32 30, i32 35, i32 40, i32 45, i32 50, i32 55, i32 60, i32 65, i32 70, i32 75>
+; CHECK-NEXT:    [[TMP5:%.*]] = fadd <16 x float> [[STRIDED_VEC2]], [[TMP2]]
+; CHECK-NEXT:    call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP5]], <16 x float*> [[TMP3]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 16
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <16 x i64> [[VEC_IND]], <i64 80, i64 80, i64 80, i64 80, i64 80, i64 80, i64 80, i64 80, i64 80, i64 80, i64 80, i64 80, i64 80, i64 80, i64 80, i64 80>
+; CHECK:         br i1 {{.*}}, label %middle.block, label %vector.body
+
+%data = type { [32000 x float], [3 x i32], [4 x i8], [32000 x float] }
+
+define void @PR31671(float %x, %data* %d) #0 {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+  %tmp0 = getelementptr inbounds %data, %data* %d, i64 0, i32 3, i64 %i
+  %tmp1 = load float, float* %tmp0, align 4
+  %tmp2 = fmul float %x, %tmp1
+  %tmp3 = getelementptr inbounds %data, %data* %d, i64 0, i32 0, i64 %i
+  %tmp4 = load float, float* %tmp3, align 4
+  %tmp5 = fadd float %tmp4, %tmp2
+  store float %tmp5, float* %tmp3, align 4
+  %i.next = add nuw nsw i64 %i, 5
+  %cond = icmp slt i64 %i.next, 32000
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
+
+attributes #0 = { "target-cpu"="knl" }
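A hedged C sketch of @PR31671 above (struct and names invented to mirror the %data layout): the same address &d->f0[i] is used both by the interleaved load and by the scattered store, which is why it must not be treated as uniform:

  struct data_t {
    float f0[32000];
    int   pad0[3];
    char  pad1[4];
    float f3[32000];
  };

  /* Sketch of @PR31671: stride-5 update of f0 using f3. */
  void pr31671_sketch(float x, struct data_t *d) {
    for (long i = 0; i < 32000; i += 5)
      d->f0[i] += x * d->f3[i];
  }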

Added: llvm/trunk/test/Transforms/LoopVectorize/X86/constant-fold.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/X86/constant-fold.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/X86/constant-fold.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/X86/constant-fold.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,53 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -loop-vectorize -S -mtriple=x86_64-- -o - %s | FileCheck %s
+
+; Testcase that verifies that we don't get a faulty bitcast that casts between
+; different sizes.
+
+%rec8 = type { i16 }
+
+ at a = global [1 x %rec8] zeroinitializer
+ at b = global [2 x i16*] zeroinitializer
+
+
+define void @f1() {
+; CHECK-LABEL: @f1(
+; CHECK-NEXT:  bb1:
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = trunc i32 [[INDEX]] to i16
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i16> undef, i16 [[OFFSET_IDX]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i16> [[BROADCAST_SPLATINSERT]], <2 x i16> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[INDUCTION:%.*]] = add <2 x i16> [[BROADCAST_SPLAT]], <i16 0, i16 1>
+; CHECK-NEXT:    [[TMP0:%.*]] = add i16 [[OFFSET_IDX]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = sext i16 [[TMP0]] to i64
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr [2 x i16*], [2 x i16*]* @b, i16 0, i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i16*, i16** [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i16** [[TMP3]] to <2 x i16*>*
+; CHECK-NEXT:    store <2 x i16*> <i16* getelementptr inbounds (%rec8, %rec8* extractelement (<2 x %rec8*> getelementptr ([1 x %rec8], [1 x %rec8]* @a, <2 x i16> zeroinitializer, <2 x i64> zeroinitializer), i32 0), i32 0, i32 0), i16* getelementptr inbounds (%rec8, %rec8* extractelement (<2 x %rec8*> getelementptr ([1 x %rec8], [1 x %rec8]* @a, <2 x i16> zeroinitializer, <2 x i64> zeroinitializer), i32 1), i32 0, i32 0)>, <2 x i16*>* [[TMP4]], align 8
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 2
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], 2
+; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
+; CHECK:       middle.block:
+
+bb1:
+  br label %bb2
+
+bb2:
+  %c.1.0 = phi i16 [ 0, %bb1 ], [ %_tmp9, %bb2 ]
+  %_tmp1 = zext i16 0 to i64
+  %_tmp2 = getelementptr [1 x %rec8], [1 x %rec8]* @a, i16 0, i64 %_tmp1
+  %_tmp4 = bitcast %rec8* %_tmp2 to i16*
+  %_tmp6 = sext i16 %c.1.0 to i64
+  %_tmp7 = getelementptr [2 x i16*], [2 x i16*]* @b, i16 0, i64 %_tmp6
+  store i16* %_tmp4, i16** %_tmp7
+  %_tmp9 = add nsw i16 %c.1.0, 1
+  %_tmp11 = icmp slt i16 %_tmp9, 2
+  br i1 %_tmp11, label %bb2, label %bb3
+
+bb3:
+  ret void
+}
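A hedged C sketch of @f1 above (names invented): the stored value is the address of a 16-bit field of a global aggregate, which the vectorizer has to materialize as a constant vector of pointers without introducing a size-changing bitcast:

  struct rec8 { short f; };
  extern struct rec8 a[1];
  extern short *b[2];

  /* Sketch of @f1: both slots of b end up pointing at a[0].f. */
  void f1_sketch(void) {
    for (short c = 0; c < 2; c++)
      b[c] = &a[0].f;
  }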

Added: llvm/trunk/test/Transforms/LoopVectorize/X86/constant-vector-operand.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/X86/constant-vector-operand.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/X86/constant-vector-operand.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/X86/constant-vector-operand.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,30 @@
+; RUN: opt -mtriple=x86_64-apple-darwin -mcpu=core2 -loop-vectorize -dce -instcombine -S < %s | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+ at B = common global [1024 x i32] zeroinitializer, align 16
+ at A = common global [1024 x i32] zeroinitializer, align 16
+
+; We used to not vectorize this loop because the shift was deemed too expensive.
+; Now that we differentiate shift cost based on the operand value kind, we will
+; vectorize this loop.
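+;
+; In C terms, the loop is roughly (an illustrative sketch matching the globals above):
+;
+;   for (int i = 0; i < 1024; ++i)
+;     A[i] = B[i] >> 3;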
+; CHECK: ashr <4 x i32>
+define void @f() {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds [1024 x i32], [1024 x i32]* @B, i64 0, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %shl = ashr i32 %0, 3
+  %arrayidx2 = getelementptr inbounds [1024 x i32], [1024 x i32]* @A, i64 0, i64 %indvars.iv
+  store i32 %shl, i32* %arrayidx2, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}

Added: llvm/trunk/test/Transforms/LoopVectorize/X86/conversion-cost.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/X86/conversion-cost.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/X86/conversion-cost.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/X86/conversion-cost.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,47 @@
+; RUN: opt < %s  -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+;CHECK-LABEL: @conversion_cost1(
+;CHECK: store <32 x i8>
+;CHECK: ret
+define i32 @conversion_cost1(i32 %n, i8* nocapture %A, float* nocapture %B) nounwind uwtable ssp {
+  %1 = icmp sgt i32 %n, 3
+  br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph:                                           ; preds = %0, %.lr.ph
+  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 3, %0 ]
+  %2 = trunc i64 %indvars.iv to i8
+  %3 = getelementptr inbounds i8, i8* %A, i64 %indvars.iv
+  store i8 %2, i8* %3, align 1
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge:                                      ; preds = %.lr.ph, %0
+  ret i32 undef
+}
+
+;CHECK-LABEL: @conversion_cost2(
+;CHECK: <2 x float>
+;CHECK: ret
+define i32 @conversion_cost2(i32 %n, i8* nocapture %A, float* nocapture %B) nounwind uwtable ssp {
+  %1 = icmp sgt i32 %n, 9
+  br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph:                                           ; preds = %0, %.lr.ph
+  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 9, %0 ]
+  %add = add nsw i64 %indvars.iv, 3
+  %tofp = sitofp i64 %add to float
+  %gep = getelementptr inbounds float, float* %B, i64 %indvars.iv
+  store float %tofp, float* %gep, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge:                                      ; preds = %.lr.ph, %0
+  ret i32 undef
+}

Added: llvm/trunk/test/Transforms/LoopVectorize/X86/cost-model.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/X86/cost-model.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/X86/cost-model.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/X86/cost-model.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,82 @@
+; RUN: opt < %s  -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+ at c = common global [2048 x i32] zeroinitializer, align 16
+ at b = common global [2048 x i32] zeroinitializer, align 16
+ at d = common global [2048 x i32] zeroinitializer, align 16
+ at a = common global [2048 x i32] zeroinitializer, align 16
+
+; The program below gathers and scatters data. We had better not vectorize it.
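+; In C terms, roughly (a sketch; the loads of c[2*i] and d[i] feed the indices):
+;
+;   for (int i = 0; i < 256; ++i)
+;     a[d[i]] = b[c[2 * i]];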
+;CHECK-LABEL: @cost_model_1(
+;CHECK-NOT: <2 x i32>
+;CHECK-NOT: <4 x i32>
+;CHECK-NOT: <8 x i32>
+;CHECK: ret void
+define void @cost_model_1() nounwind uwtable noinline ssp {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %0 = shl nsw i64 %indvars.iv, 1
+  %arrayidx = getelementptr inbounds [2048 x i32], [2048 x i32]* @c, i64 0, i64 %0
+  %1 = load i32, i32* %arrayidx, align 8
+  %idxprom1 = sext i32 %1 to i64
+  %arrayidx2 = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 %idxprom1
+  %2 = load i32, i32* %arrayidx2, align 4
+  %arrayidx4 = getelementptr inbounds [2048 x i32], [2048 x i32]* @d, i64 0, i64 %indvars.iv
+  %3 = load i32, i32* %arrayidx4, align 4
+  %idxprom5 = sext i32 %3 to i64
+  %arrayidx6 = getelementptr inbounds [2048 x i32], [2048 x i32]* @a, i64 0, i64 %idxprom5
+  store i32 %2, i32* %arrayidx6, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 256
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+; This function uses a stride that is generally too big to benefit from vectorization without
+; really good support for a gather load. We were not computing an accurate cost for the 
+; vectorization and subsequent scalarization of the pointer induction variables.
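+;
+; A C sketch of the loop (illustrative; note the stride of 32 elements):
+;
+;   float s = 0.0f;
+;   for (long i = 0; i < n; i += 32)
+;     s += a[i] + b[i];
+;   return s;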
+
+define float @PR27826(float* nocapture readonly %a, float* nocapture readonly %b, i32 %n) {
+; CHECK-LABEL: @PR27826(
+; CHECK-NOT:   <4 x float> 
+; CHECK-NOT:   <8 x float> 
+; CHECK:       ret float %s.0.lcssa
+
+entry:
+  %cmp = icmp sgt i32 %n, 0
+  br i1 %cmp, label %preheader, label %for.end
+
+preheader:
+  %t0 = sext i32 %n to i64
+  br label %for
+
+for:
+  %indvars.iv = phi i64 [ 0, %preheader ], [ %indvars.iv.next, %for ]
+  %s.02 = phi float [ 0.0, %preheader ], [ %add4, %for ]
+  %arrayidx = getelementptr inbounds float, float* %a, i64 %indvars.iv
+  %t1 = load float, float* %arrayidx, align 4
+  %arrayidx3 = getelementptr inbounds float, float* %b, i64 %indvars.iv
+  %t2 = load float, float* %arrayidx3, align 4
+  %add = fadd fast float %t1, %s.02
+  %add4 = fadd fast float %add, %t2
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 32
+  %cmp1 = icmp slt i64 %indvars.iv.next, %t0
+  br i1 %cmp1, label %for, label %loopexit
+
+loopexit:
+  %add4.lcssa = phi float [ %add4, %for ]
+  br label %for.end
+
+for.end:
+  %s.0.lcssa = phi float [ 0.0, %entry ], [ %add4.lcssa, %loopexit ]
+  ret float %s.0.lcssa
+}
+

Added: llvm/trunk/test/Transforms/LoopVectorize/X86/float-induction-x86.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/X86/float-induction-x86.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/X86/float-induction-x86.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/X86/float-induction-x86.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,149 @@
+; RUN: opt < %s  -O3 -simplifycfg -keep-loops=false -mcpu=core-avx2 -mtriple=x86_64-unknown-linux-gnu -S | FileCheck --check-prefix AUTO_VEC %s
+
+; This test checks auto-vectorization with an FP induction variable.
+; The FP operation is not "fast" and requires the "fast-math" function attribute.
+
+;void fp_iv_loop1(float * __restrict__ A, int N) {
+;  float x = 1.0;
+;  for (int i=0; i < N; ++i) {
+;    A[i] = x;
+;    x += 0.5;
+;  }
+;}
+
+
+; AUTO_VEC-LABEL: @fp_iv_loop1(
+; AUTO_VEC: vector.body
+; AUTO_VEC: store <8 x float>
+
+define void @fp_iv_loop1(float* noalias nocapture %A, i32 %N) #0 {
+entry:
+  %cmp4 = icmp sgt i32 %N, 0
+  br i1 %cmp4, label %for.body.preheader, label %for.end
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
+  %x.06 = phi float [ %conv1, %for.body ], [ 1.000000e+00, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds float, float* %A, i64 %indvars.iv
+  store float %x.06, float* %arrayidx, align 4
+  %conv1 = fadd float %x.06, 5.000000e-01
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %N
+  br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:                                 ; preds = %for.body
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %entry
+  ret void
+}
+
+; The same as the previous loop: the FP operation is not fast, but the function
+; attribute differs. Vectorization should be rejected.
+;void fp_iv_loop2(float * __restrict__ A, int N) {
+;  float x = 1.0;
+;  for (int i=0; i < N; ++i) {
+;    A[i] = x;
+;    x += 0.5;
+;  }
+;}
+
+; AUTO_VEC-LABEL: @fp_iv_loop2(
+; AUTO_VEC-NOT: vector.body
+; AUTO_VEC-NOT: store <{{.*}} x float>
+
+define void @fp_iv_loop2(float* noalias nocapture %A, i32 %N) #1 {
+entry:
+  %cmp4 = icmp sgt i32 %N, 0
+  br i1 %cmp4, label %for.body.preheader, label %for.end
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
+  %x.06 = phi float [ %conv1, %for.body ], [ 1.000000e+00, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds float, float* %A, i64 %indvars.iv
+  store float %x.06, float* %arrayidx, align 4
+  %conv1 = fadd float %x.06, 5.000000e-01
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %N
+  br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:                                 ; preds = %for.body
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %entry
+  ret void
+}
+
+; AUTO_VEC-LABEL: @external_use_with_fast_math(
+; AUTO_VEC-NEXT:  entry:
+; AUTO_VEC-NEXT:    [[TMP0:%.*]] = icmp sgt i64 %n, 1
+; AUTO_VEC-NEXT:    [[SMAX:%.*]] = select i1 [[TMP0]], i64 %n, i64 1
+; AUTO_VEC:         br i1 {{.*}}, label %for.body, label %vector.ph
+; AUTO_VEC:       vector.ph:
+; AUTO_VEC-NEXT:    [[N_VEC:%.*]] = and i64 [[SMAX]], 9223372036854775792
+; AUTO_VEC:         br label %vector.body
+; AUTO_VEC:       middle.block:
+; AUTO_VEC:         [[TMP11:%.*]] = add nsw i64 [[N_VEC]], -1
+; AUTO_VEC-NEXT:    [[CAST_CMO:%.*]] = sitofp i64 [[TMP11]] to double
+; AUTO_VEC-NEXT:    [[TMP12:%.*]] = fmul fast double [[CAST_CMO]], 3.000000e+00
+; AUTO_VEC-NEXT:    br i1 {{.*}}, label %for.end, label %for.body
+; AUTO_VEC:       for.end:
+; AUTO_VEC-NEXT:    [[J_LCSSA:%.*]] = phi double [ [[TMP12]], %middle.block ], [ %j, %for.body ]
+; AUTO_VEC-NEXT:    ret double [[J_LCSSA]]
+;
+define double @external_use_with_fast_math(double* %a, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [%i.next, %for.body]
+  %j = phi double [ 0.0, %entry ], [ %j.next, %for.body ]
+  %tmp0 = getelementptr double, double* %a, i64 %i
+  store double %j, double* %tmp0
+  %i.next = add i64 %i, 1
+  %j.next = fadd fast double %j, 3.0
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  %tmp1 = phi double [ %j, %for.body ]
+  ret double %tmp1
+}
+
+; AUTO_VEC-LABEL: @external_use_without_fast_math(
+; AUTO_VEC:       for.body:
+; AUTO_VEC:         [[J:%.*]] = phi double [ 0.000000e+00, %entry ], [ [[J_NEXT:%.*]], %for.body ]
+; AUTO_VEC:         [[J_NEXT]] = fadd double [[J]], 3.000000e+00
+; AUTO_VEC:         br i1 {{.*}}, label %for.body, label %for.end
+; AUTO_VEC:       for.end:
+; AUTO_VEC-NEXT:    ret double [[J]]
+;
+define double @external_use_without_fast_math(double* %a, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [%i.next, %for.body]
+  %j = phi double [ 0.0, %entry ], [ %j.next, %for.body ]
+  %tmp0 = getelementptr double, double* %a, i64 %i
+  store double %j, double* %tmp0
+  %i.next = add i64 %i, 1
+  %j.next = fadd double %j, 3.0
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  %tmp1 = phi double [ %j, %for.body ]
+  ret double %tmp1
+}
+
+attributes #0 = { "no-nans-fp-math"="true" }
+attributes #1 = { "no-nans-fp-math"="false" }

Added: llvm/trunk/test/Transforms/LoopVectorize/X86/force-ifcvt.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/X86/force-ifcvt.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/X86/force-ifcvt.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/X86/force-ifcvt.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,42 @@
+; RUN: opt -loop-vectorize -S < %s | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Function Attrs: norecurse nounwind uwtable
+define void @Test(i32* nocapture %res, i32* nocapture readnone %c, i32* nocapture readonly %d, i32* nocapture readonly %p) #0 {
+entry:
+  br label %for.body
+
+; CHECK-LABEL: @Test
+; CHECK: <4 x i32>
+
+for.body:                                         ; preds = %cond.end, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %cond.end ]
+  %arrayidx = getelementptr inbounds i32, i32* %p, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4, !llvm.access.group !1
+  %cmp1 = icmp eq i32 %0, 0
+  %arrayidx3 = getelementptr inbounds i32, i32* %res, i64 %indvars.iv
+  %1 = load i32, i32* %arrayidx3, align 4, !llvm.access.group !1
+  br i1 %cmp1, label %cond.end, label %cond.false
+
+cond.false:                                       ; preds = %for.body
+  %arrayidx7 = getelementptr inbounds i32, i32* %d, i64 %indvars.iv
+  %2 = load i32, i32* %arrayidx7, align 4, !llvm.access.group !1
+  %add = add nsw i32 %2, %1
+  br label %cond.end
+
+cond.end:                                         ; preds = %for.body, %cond.false
+  %cond = phi i32 [ %add, %cond.false ], [ %1, %for.body ]
+  store i32 %cond, i32* %arrayidx3, align 4, !llvm.access.group !1
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 16
+  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:                                          ; preds = %cond.end
+  ret void
+}
+
+attributes #0 = { norecurse nounwind uwtable "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" }
+
+!0 = distinct !{!0, !{!"llvm.loop.parallel_accesses", !1}}
+!1 = distinct !{}

Added: llvm/trunk/test/Transforms/LoopVectorize/X86/fp32_to_uint32-cost-model.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/X86/fp32_to_uint32-cost-model.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/X86/fp32_to_uint32-cost-model.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/X86/fp32_to_uint32-cost-model.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,39 @@
+; RUN: opt < %s -mcpu=core-avx2 -loop-vectorize -S | llc -mcpu=core-avx2 | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx"
+
+ at float_array = common global [10000 x float] zeroinitializer, align 16
+ at unsigned_array = common global [10000 x i32] zeroinitializer, align 16
+
+; If we need to scalarize the fptoui and then use inserts to build up the
+; vector again, then there is certainly no value in going 256-bit wide.
+; CHECK-NOT: vinserti128
+
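+; In C, the loop is roughly (a sketch; N is the function argument below):
+;
+;   for (int i = 0; i < N; ++i)
+;     unsigned_array[i] = (unsigned int)float_array[i];
+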
+define void @convert(i32 %N) {
+entry:
+  %0 = icmp eq i32 %N, 0
+  br i1 %0, label %for.end, label %for.body.preheader
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds [10000 x float], [10000 x float]* @float_array, i64 0, i64 %indvars.iv
+  %1 = load float, float* %arrayidx, align 4
+  %conv = fptoui float %1 to i32
+  %arrayidx2 = getelementptr inbounds [10000 x i32], [10000 x i32]* @unsigned_array, i64 0, i64 %indvars.iv
+  store i32 %conv, i32* %arrayidx2, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %N
+  br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:                                 ; preds = %for.body
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %entry
+  ret void
+}
+

Added: llvm/trunk/test/Transforms/LoopVectorize/X86/fp64_to_uint32-cost-model.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/X86/fp64_to_uint32-cost-model.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/X86/fp64_to_uint32-cost-model.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/X86/fp64_to_uint32-cost-model.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,40 @@
+; RUN: opt < %s -mcpu=core-avx2 -loop-vectorize -S | llc -mcpu=core-avx2 | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx"
+
+ at n = global i32 10000, align 4
+ at double_array = common global [10000 x double] zeroinitializer, align 16
+ at unsigned_array = common global [10000 x i32] zeroinitializer, align 16
+
+; If we need to scalarize the fptoui and then use inserts to build up the
+; vector again, then there is certainly no value in going 256-bit wide.
+; CHECK-NOT: vpinsrd
+
+define void @convert() {
+entry:
+  %0 = load i32, i32* @n, align 4
+  %cmp4 = icmp eq i32 %0, 0
+  br i1 %cmp4, label %for.end, label %for.body.preheader
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds [10000 x double], [10000 x double]* @double_array, i64 0, i64 %indvars.iv
+  %1 = load double, double* %arrayidx, align 8
+  %conv = fptoui double %1 to i32
+  %arrayidx2 = getelementptr inbounds [10000 x i32], [10000 x i32]* @unsigned_array, i64 0, i64 %indvars.iv
+  store i32 %conv, i32* %arrayidx2, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %2 = trunc i64 %indvars.iv.next to i32
+  %cmp = icmp ult i32 %2, %0
+  br i1 %cmp, label %for.body, label %for.end.loopexit
+
+for.end.loopexit:                                 ; preds = %for.body
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %entry
+  ret void
+}

Added: llvm/trunk/test/Transforms/LoopVectorize/X86/fp_to_sint8-cost-model.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/X86/fp_to_sint8-cost-model.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/X86/fp_to_sint8-cost-model.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/X86/fp_to_sint8-cost-model.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,25 @@
+; RUN: opt < %s  -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx -S -debug-only=loop-vectorize 2>&1 | FileCheck %s
+; REQUIRES: asserts
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+
+; CHECK: cost of 7 for VF 8 For instruction:   %conv = fptosi float %tmp to i8
+define void @float_to_sint8_cost(i8* noalias nocapture %a, float* noalias nocapture readonly %b) nounwind {
+entry:
+  br label %for.body
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %b, i64 %indvars.iv
+  %tmp = load float, float* %arrayidx, align 4
+  %conv = fptosi float %tmp to i8
+  %arrayidx2 = getelementptr inbounds i8, i8* %a, i64 %indvars.iv
+  store i8 %conv, i8* %arrayidx2, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 256
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}

Added: llvm/trunk/test/Transforms/LoopVectorize/X86/funclet.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/X86/funclet.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/X86/funclet.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/X86/funclet.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,45 @@
+; RUN: opt -S -loop-vectorize < %s | FileCheck %s
+target datalayout = "e-m:x-p:32:32-i64:64-f80:32-n8:16:32-a:0:32-S32"
+target triple = "i686-pc-windows-msvc18.0.0"
+
+define void @test1() #0 personality i32 (...)* @__CxxFrameHandler3 {
+entry:
+  invoke void @_CxxThrowException(i8* null, i8* null)
+          to label %unreachable unwind label %catch.dispatch
+
+catch.dispatch:                                   ; preds = %entry
+  %0 = catchswitch within none [label %catch] unwind to caller
+
+catch:                                            ; preds = %catch.dispatch
+  %1 = catchpad within %0 [i8* null, i32 64, i8* null]
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body
+  catchret from %1 to label %try.cont
+
+for.body:                                         ; preds = %for.body, %catch
+  %i.07 = phi i32 [ 0, %catch ], [ %inc, %for.body ]
+  %call = call double @floor(double 1.0) #1 [ "funclet"(token %1) ]
+  %inc = add nuw nsw i32 %i.07, 1
+  %exitcond = icmp eq i32 %inc, 1024
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+
+try.cont:                                         ; preds = %for.cond.cleanup
+  ret void
+
+unreachable:                                      ; preds = %entry
+  unreachable
+}
+
+; CHECK-LABEL: define void @test1(
+; CHECK: %[[cpad:.*]] = catchpad within {{.*}} [i8* null, i32 64, i8* null]
+; CHECK: call <16 x double> @llvm.floor.v16f64(<16 x double> {{.*}}) [ "funclet"(token %[[cpad]]) ]
+
+declare x86_stdcallcc void @_CxxThrowException(i8*, i8*)
+
+declare i32 @__CxxFrameHandler3(...)
+
+declare double @floor(double) #1
+
+attributes #0 = { "target-features"="+sse2" }
+attributes #1 = { nounwind readnone }

Added: llvm/trunk/test/Transforms/LoopVectorize/X86/gather-cost.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/X86/gather-cost.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/X86/gather-cost.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/X86/gather-cost.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,86 @@
+; RUN: opt -loop-vectorize -mtriple=x86_64-apple-macosx -S -mcpu=corei7-avx -enable-interleaved-mem-accesses=false < %s | FileCheck %s
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+ at kernel = global [512 x float] zeroinitializer, align 16
+ at kernel2 = global [512 x float] zeroinitializer, align 16
+ at kernel3 = global [512 x float] zeroinitializer, align 16
+ at kernel4 = global [512 x float] zeroinitializer, align 16
+ at src_data = global [1536 x float] zeroinitializer, align 16
+ at r_ = global i8 0, align 1
+ at g_ = global i8 0, align 1
+ at b_ = global i8 0, align 1
+
+; We don't want to vectorize most loops containing gathers because they are
+; expensive. This function represents a point where vectorization starts to
+; become beneficial.
+; Make sure we are conservative and don't vectorize it.
+; CHECK-NOT: x float>
+
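+; Roughly, in C (a sketch; r_, g_ and b_ are the i8 globals above):
+;
+;   for (unsigned long v = 0; v < size; ++v) {
+;     unsigned long m = 3 * (v + offset);
+;     float k = kernel[v] * kernel2[v] * kernel3[v] * kernel4[v];
+;     r += src_data[m]     * k;
+;     g += src_data[m + 1] * k;
+;     b += src_data[m + 2] * k;
+;   }
+;   r_ = (unsigned char)r;  g_ = (unsigned char)g;  b_ = (unsigned char)b;
+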
+define void @_Z4testmm(i64 %size, i64 %offset) {
+entry:
+  %cmp53 = icmp eq i64 %size, 0
+  br i1 %cmp53, label %for.end, label %for.body.lr.ph
+
+for.body.lr.ph:
+  br label %for.body
+
+for.body:
+  %r.057 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add10, %for.body ]
+  %g.056 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add20, %for.body ]
+  %v.055 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
+  %b.054 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add30, %for.body ]
+  %add = add i64 %v.055, %offset
+  %mul = mul i64 %add, 3
+  %arrayidx = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i64 0, i64 %mul
+  %0 = load float, float* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds [512 x float], [512 x float]* @kernel, i64 0, i64 %v.055
+  %1 = load float, float* %arrayidx2, align 4
+  %mul3 = fmul fast float %0, %1
+  %arrayidx4 = getelementptr inbounds [512 x float], [512 x float]* @kernel2, i64 0, i64 %v.055
+  %2 = load float, float* %arrayidx4, align 4
+  %mul5 = fmul fast float %mul3, %2
+  %arrayidx6 = getelementptr inbounds [512 x float], [512 x float]* @kernel3, i64 0, i64 %v.055
+  %3 = load float, float* %arrayidx6, align 4
+  %mul7 = fmul fast float %mul5, %3
+  %arrayidx8 = getelementptr inbounds [512 x float], [512 x float]* @kernel4, i64 0, i64 %v.055
+  %4 = load float, float* %arrayidx8, align 4
+  %mul9 = fmul fast float %mul7, %4
+  %add10 = fadd fast float %r.057, %mul9
+  %arrayidx.sum = add i64 %mul, 1
+  %arrayidx11 = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i64 0, i64 %arrayidx.sum
+  %5 = load float, float* %arrayidx11, align 4
+  %mul13 = fmul fast float %1, %5
+  %mul15 = fmul fast float %2, %mul13
+  %mul17 = fmul fast float %3, %mul15
+  %mul19 = fmul fast float %4, %mul17
+  %add20 = fadd fast float %g.056, %mul19
+  %arrayidx.sum52 = add i64 %mul, 2
+  %arrayidx21 = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i64 0, i64 %arrayidx.sum52
+  %6 = load float, float* %arrayidx21, align 4
+  %mul23 = fmul fast float %1, %6
+  %mul25 = fmul fast float %2, %mul23
+  %mul27 = fmul fast float %3, %mul25
+  %mul29 = fmul fast float %4, %mul27
+  %add30 = fadd fast float %b.054, %mul29
+  %inc = add i64 %v.055, 1
+  %exitcond = icmp ne i64 %inc, %size
+  br i1 %exitcond, label %for.body, label %for.cond.for.end_crit_edge
+
+for.cond.for.end_crit_edge:
+  %add30.lcssa = phi float [ %add30, %for.body ]
+  %add20.lcssa = phi float [ %add20, %for.body ]
+  %add10.lcssa = phi float [ %add10, %for.body ]
+  %phitmp = fptoui float %add10.lcssa to i8
+  %phitmp60 = fptoui float %add20.lcssa to i8
+  %phitmp61 = fptoui float %add30.lcssa to i8
+  br label %for.end
+
+for.end:
+  %r.0.lcssa = phi i8 [ %phitmp, %for.cond.for.end_crit_edge ], [ 0, %entry ]
+  %g.0.lcssa = phi i8 [ %phitmp60, %for.cond.for.end_crit_edge ], [ 0, %entry ]
+  %b.0.lcssa = phi i8 [ %phitmp61, %for.cond.for.end_crit_edge ], [ 0, %entry ]
+  store i8 %r.0.lcssa, i8* @r_, align 1
+  store i8 %g.0.lcssa, i8* @g_, align 1
+  store i8 %b.0.lcssa, i8* @b_, align 1
+  ret void
+}

Added: llvm/trunk/test/Transforms/LoopVectorize/X86/gather-vs-interleave.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/X86/gather-vs-interleave.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/X86/gather-vs-interleave.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/X86/gather-vs-interleave.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,41 @@
+; RUN: opt -loop-vectorize -S -mattr=avx512f  < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; This test checks that "gather" operation is choosen since it's cost is better
+; than interleaving pattern.
+;
+;unsigned long A[SIZE];
+;unsigned long B[SIZE];
+;
+;void foo() {
+;  for (int i=0; i<N; i+=8) {
+;    B[i] = A[i] + 5;
+;  }
+;}
+
+ at A = global [10240 x i64] zeroinitializer, align 16
+ at B = global [10240 x i64] zeroinitializer, align 16
+
+
+; CHECK-LABEL: strided_load_i64
+; CHECK: masked.gather
+define void @strided_load_i64() {
+  br label %1
+
+; <label>:1:                                      ; preds = %0, %1
+  %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ]
+  %2 = getelementptr inbounds [10240 x i64], [10240 x i64]* @A, i64 0, i64 %indvars.iv
+  %3 = load i64, i64* %2, align 16
+  %4 = add i64 %3, 5
+  %5 = getelementptr inbounds [10240 x i64], [10240 x i64]* @B, i64 0, i64 %indvars.iv
+  store i64 %4, i64* %5, align 16
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 8
+  %6 = icmp slt i64 %indvars.iv.next, 1024
+  br i1 %6, label %1, label %7
+
+; <label>:7:                                      ; preds = %1
+  ret void
+}
+

Added: llvm/trunk/test/Transforms/LoopVectorize/X86/gather_scatter.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/X86/gather_scatter.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/X86/gather_scatter.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/X86/gather_scatter.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,1754 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s  -O3 -mcpu=knl -S | FileCheck %s -check-prefix=AVX512
+; RUN: opt < %s -O3 -mcpu=knl -force-vector-width=2 -S | FileCheck %s -check-prefix=FVW2
+
+; With a forced vector width (-force-vector-width), it is sometimes more
+; profitable to generate scalarized and predicated stores instead of a masked
+; scatter.
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-pc_linux"
+
+; The source code:
+;
+;void foo1(float * __restrict__ in, float * __restrict__ out, int * __restrict__ trigger, int * __restrict__ index) {
+;
+;  for (int i=0; i < SIZE; ++i) {
+;    if (trigger[i] > 0) {
+;      out[i] = in[index[i]] + (float) 0.5;
+;    }
+;  }
+;}
+
+; Function Attrs: nounwind uwtable
+define void @foo1(float* noalias %in, float* noalias %out, i32* noalias %trigger, i32* noalias %index) {
+; AVX512-LABEL: @foo1(
+; AVX512-NEXT:  entry:
+; AVX512-NEXT:    br label [[VECTOR_BODY:%.*]]
+; AVX512:       vector.body:
+; AVX512-NEXT:    [[INDEX6:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT_3:%.*]], [[VECTOR_BODY]] ]
+; AVX512-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], i64 [[INDEX6]]
+; AVX512-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <16 x i32>*
+; AVX512-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i32>, <16 x i32>* [[TMP1]], align 4
+; AVX512-NEXT:    [[TMP2:%.*]] = icmp sgt <16 x i32> [[WIDE_LOAD]], zeroinitializer
+; AVX512-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[INDEX:%.*]], i64 [[INDEX6]]
+; AVX512-NEXT:    [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <16 x i32>*
+; AVX512-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP4]], i32 4, <16 x i1> [[TMP2]], <16 x i32> undef)
+; AVX512-NEXT:    [[TMP5:%.*]] = sext <16 x i32> [[WIDE_MASKED_LOAD]] to <16 x i64>
+; AVX512-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[IN:%.*]], <16 x i64> [[TMP5]]
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP6]], i32 4, <16 x i1> [[TMP2]], <16 x float> undef)
+; AVX512-NEXT:    [[TMP7:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, float* [[OUT:%.*]], i64 [[INDEX6]]
+; AVX512-NEXT:    [[TMP9:%.*]] = bitcast float* [[TMP8]] to <16 x float>*
+; AVX512-NEXT:    call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP7]], <16 x float>* [[TMP9]], i32 4, <16 x i1> [[TMP2]])
+; AVX512-NEXT:    [[INDEX_NEXT:%.*]] = or i64 [[INDEX6]], 16
+; AVX512-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDEX_NEXT]]
+; AVX512-NEXT:    [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <16 x i32>*
+; AVX512-NEXT:    [[WIDE_LOAD_1:%.*]] = load <16 x i32>, <16 x i32>* [[TMP11]], align 4
+; AVX512-NEXT:    [[TMP12:%.*]] = icmp sgt <16 x i32> [[WIDE_LOAD_1]], zeroinitializer
+; AVX512-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[INDEX]], i64 [[INDEX_NEXT]]
+; AVX512-NEXT:    [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <16 x i32>*
+; AVX512-NEXT:    [[WIDE_MASKED_LOAD_1:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* nonnull [[TMP14]], i32 4, <16 x i1> [[TMP12]], <16 x i32> undef)
+; AVX512-NEXT:    [[TMP15:%.*]] = sext <16 x i32> [[WIDE_MASKED_LOAD_1]] to <16 x i64>
+; AVX512-NEXT:    [[TMP16:%.*]] = getelementptr inbounds float, float* [[IN]], <16 x i64> [[TMP15]]
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER_1:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP16]], i32 4, <16 x i1> [[TMP12]], <16 x float> undef)
+; AVX512-NEXT:    [[TMP17:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER_1]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT:    [[TMP18:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[INDEX_NEXT]]
+; AVX512-NEXT:    [[TMP19:%.*]] = bitcast float* [[TMP18]] to <16 x float>*
+; AVX512-NEXT:    call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP17]], <16 x float>* [[TMP19]], i32 4, <16 x i1> [[TMP12]])
+; AVX512-NEXT:    [[INDEX_NEXT_1:%.*]] = or i64 [[INDEX6]], 32
+; AVX512-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDEX_NEXT_1]]
+; AVX512-NEXT:    [[TMP21:%.*]] = bitcast i32* [[TMP20]] to <16 x i32>*
+; AVX512-NEXT:    [[WIDE_LOAD_2:%.*]] = load <16 x i32>, <16 x i32>* [[TMP21]], align 4
+; AVX512-NEXT:    [[TMP22:%.*]] = icmp sgt <16 x i32> [[WIDE_LOAD_2]], zeroinitializer
+; AVX512-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[INDEX]], i64 [[INDEX_NEXT_1]]
+; AVX512-NEXT:    [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <16 x i32>*
+; AVX512-NEXT:    [[WIDE_MASKED_LOAD_2:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* nonnull [[TMP24]], i32 4, <16 x i1> [[TMP22]], <16 x i32> undef)
+; AVX512-NEXT:    [[TMP25:%.*]] = sext <16 x i32> [[WIDE_MASKED_LOAD_2]] to <16 x i64>
+; AVX512-NEXT:    [[TMP26:%.*]] = getelementptr inbounds float, float* [[IN]], <16 x i64> [[TMP25]]
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER_2:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP26]], i32 4, <16 x i1> [[TMP22]], <16 x float> undef)
+; AVX512-NEXT:    [[TMP27:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER_2]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT:    [[TMP28:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[INDEX_NEXT_1]]
+; AVX512-NEXT:    [[TMP29:%.*]] = bitcast float* [[TMP28]] to <16 x float>*
+; AVX512-NEXT:    call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP27]], <16 x float>* [[TMP29]], i32 4, <16 x i1> [[TMP22]])
+; AVX512-NEXT:    [[INDEX_NEXT_2:%.*]] = or i64 [[INDEX6]], 48
+; AVX512-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDEX_NEXT_2]]
+; AVX512-NEXT:    [[TMP31:%.*]] = bitcast i32* [[TMP30]] to <16 x i32>*
+; AVX512-NEXT:    [[WIDE_LOAD_3:%.*]] = load <16 x i32>, <16 x i32>* [[TMP31]], align 4
+; AVX512-NEXT:    [[TMP32:%.*]] = icmp sgt <16 x i32> [[WIDE_LOAD_3]], zeroinitializer
+; AVX512-NEXT:    [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[INDEX]], i64 [[INDEX_NEXT_2]]
+; AVX512-NEXT:    [[TMP34:%.*]] = bitcast i32* [[TMP33]] to <16 x i32>*
+; AVX512-NEXT:    [[WIDE_MASKED_LOAD_3:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* nonnull [[TMP34]], i32 4, <16 x i1> [[TMP32]], <16 x i32> undef)
+; AVX512-NEXT:    [[TMP35:%.*]] = sext <16 x i32> [[WIDE_MASKED_LOAD_3]] to <16 x i64>
+; AVX512-NEXT:    [[TMP36:%.*]] = getelementptr inbounds float, float* [[IN]], <16 x i64> [[TMP35]]
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER_3:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP36]], i32 4, <16 x i1> [[TMP32]], <16 x float> undef)
+; AVX512-NEXT:    [[TMP37:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER_3]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT:    [[TMP38:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[INDEX_NEXT_2]]
+; AVX512-NEXT:    [[TMP39:%.*]] = bitcast float* [[TMP38]] to <16 x float>*
+; AVX512-NEXT:    call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP37]], <16 x float>* [[TMP39]], i32 4, <16 x i1> [[TMP32]])
+; AVX512-NEXT:    [[INDEX_NEXT_3]] = add nuw nsw i64 [[INDEX6]], 64
+; AVX512-NEXT:    [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT_3]], 4096
+; AVX512-NEXT:    br i1 [[TMP40]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
+; AVX512:       for.end:
+; AVX512-NEXT:    ret void
+;
+; FVW2-LABEL: @foo1(
+; FVW2-NEXT:  entry:
+; FVW2-NEXT:    br label [[VECTOR_BODY:%.*]]
+; FVW2:       vector.body:
+; FVW2-NEXT:    [[INDEX6:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT_3:%.*]], [[VECTOR_BODY]] ]
+; FVW2-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], i64 [[INDEX6]]
+; FVW2-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <2 x i32>*
+; FVW2-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 4
+; FVW2-NEXT:    [[TMP2:%.*]] = icmp sgt <2 x i32> [[WIDE_LOAD]], zeroinitializer
+; FVW2-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[INDEX:%.*]], i64 [[INDEX6]]
+; FVW2-NEXT:    [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <2 x i32>*
+; FVW2-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* [[TMP4]], i32 4, <2 x i1> [[TMP2]], <2 x i32> undef)
+; FVW2-NEXT:    [[TMP5:%.*]] = sext <2 x i32> [[WIDE_MASKED_LOAD]] to <2 x i64>
+; FVW2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[IN:%.*]], <2 x i64> [[TMP5]]
+; FVW2-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP6]], i32 4, <2 x i1> [[TMP2]], <2 x float> undef)
+; FVW2-NEXT:    [[TMP7:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER]], <float 5.000000e-01, float 5.000000e-01>
+; FVW2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, float* [[OUT:%.*]], i64 [[INDEX6]]
+; FVW2-NEXT:    [[TMP9:%.*]] = bitcast float* [[TMP8]] to <2 x float>*
+; FVW2-NEXT:    call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> [[TMP7]], <2 x float>* [[TMP9]], i32 4, <2 x i1> [[TMP2]])
+; FVW2-NEXT:    [[INDEX_NEXT:%.*]] = or i64 [[INDEX6]], 2
+; FVW2-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDEX_NEXT]]
+; FVW2-NEXT:    [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <2 x i32>*
+; FVW2-NEXT:    [[WIDE_LOAD_1:%.*]] = load <2 x i32>, <2 x i32>* [[TMP11]], align 4
+; FVW2-NEXT:    [[TMP12:%.*]] = icmp sgt <2 x i32> [[WIDE_LOAD_1]], zeroinitializer
+; FVW2-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[INDEX]], i64 [[INDEX_NEXT]]
+; FVW2-NEXT:    [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <2 x i32>*
+; FVW2-NEXT:    [[WIDE_MASKED_LOAD_1:%.*]] = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* nonnull [[TMP14]], i32 4, <2 x i1> [[TMP12]], <2 x i32> undef)
+; FVW2-NEXT:    [[TMP15:%.*]] = sext <2 x i32> [[WIDE_MASKED_LOAD_1]] to <2 x i64>
+; FVW2-NEXT:    [[TMP16:%.*]] = getelementptr inbounds float, float* [[IN]], <2 x i64> [[TMP15]]
+; FVW2-NEXT:    [[WIDE_MASKED_GATHER_1:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP16]], i32 4, <2 x i1> [[TMP12]], <2 x float> undef)
+; FVW2-NEXT:    [[TMP17:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER_1]], <float 5.000000e-01, float 5.000000e-01>
+; FVW2-NEXT:    [[TMP18:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[INDEX_NEXT]]
+; FVW2-NEXT:    [[TMP19:%.*]] = bitcast float* [[TMP18]] to <2 x float>*
+; FVW2-NEXT:    call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> [[TMP17]], <2 x float>* [[TMP19]], i32 4, <2 x i1> [[TMP12]])
+; FVW2-NEXT:    [[INDEX_NEXT_1:%.*]] = or i64 [[INDEX6]], 4
+; FVW2-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDEX_NEXT_1]]
+; FVW2-NEXT:    [[TMP21:%.*]] = bitcast i32* [[TMP20]] to <2 x i32>*
+; FVW2-NEXT:    [[WIDE_LOAD_2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP21]], align 4
+; FVW2-NEXT:    [[TMP22:%.*]] = icmp sgt <2 x i32> [[WIDE_LOAD_2]], zeroinitializer
+; FVW2-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[INDEX]], i64 [[INDEX_NEXT_1]]
+; FVW2-NEXT:    [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <2 x i32>*
+; FVW2-NEXT:    [[WIDE_MASKED_LOAD_2:%.*]] = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* nonnull [[TMP24]], i32 4, <2 x i1> [[TMP22]], <2 x i32> undef)
+; FVW2-NEXT:    [[TMP25:%.*]] = sext <2 x i32> [[WIDE_MASKED_LOAD_2]] to <2 x i64>
+; FVW2-NEXT:    [[TMP26:%.*]] = getelementptr inbounds float, float* [[IN]], <2 x i64> [[TMP25]]
+; FVW2-NEXT:    [[WIDE_MASKED_GATHER_2:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP26]], i32 4, <2 x i1> [[TMP22]], <2 x float> undef)
+; FVW2-NEXT:    [[TMP27:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER_2]], <float 5.000000e-01, float 5.000000e-01>
+; FVW2-NEXT:    [[TMP28:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[INDEX_NEXT_1]]
+; FVW2-NEXT:    [[TMP29:%.*]] = bitcast float* [[TMP28]] to <2 x float>*
+; FVW2-NEXT:    call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> [[TMP27]], <2 x float>* [[TMP29]], i32 4, <2 x i1> [[TMP22]])
+; FVW2-NEXT:    [[INDEX_NEXT_2:%.*]] = or i64 [[INDEX6]], 6
+; FVW2-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDEX_NEXT_2]]
+; FVW2-NEXT:    [[TMP31:%.*]] = bitcast i32* [[TMP30]] to <2 x i32>*
+; FVW2-NEXT:    [[WIDE_LOAD_3:%.*]] = load <2 x i32>, <2 x i32>* [[TMP31]], align 4
+; FVW2-NEXT:    [[TMP32:%.*]] = icmp sgt <2 x i32> [[WIDE_LOAD_3]], zeroinitializer
+; FVW2-NEXT:    [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[INDEX]], i64 [[INDEX_NEXT_2]]
+; FVW2-NEXT:    [[TMP34:%.*]] = bitcast i32* [[TMP33]] to <2 x i32>*
+; FVW2-NEXT:    [[WIDE_MASKED_LOAD_3:%.*]] = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* nonnull [[TMP34]], i32 4, <2 x i1> [[TMP32]], <2 x i32> undef)
+; FVW2-NEXT:    [[TMP35:%.*]] = sext <2 x i32> [[WIDE_MASKED_LOAD_3]] to <2 x i64>
+; FVW2-NEXT:    [[TMP36:%.*]] = getelementptr inbounds float, float* [[IN]], <2 x i64> [[TMP35]]
+; FVW2-NEXT:    [[WIDE_MASKED_GATHER_3:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP36]], i32 4, <2 x i1> [[TMP32]], <2 x float> undef)
+; FVW2-NEXT:    [[TMP37:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER_3]], <float 5.000000e-01, float 5.000000e-01>
+; FVW2-NEXT:    [[TMP38:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[INDEX_NEXT_2]]
+; FVW2-NEXT:    [[TMP39:%.*]] = bitcast float* [[TMP38]] to <2 x float>*
+; FVW2-NEXT:    call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> [[TMP37]], <2 x float>* [[TMP39]], i32 4, <2 x i1> [[TMP32]])
+; FVW2-NEXT:    [[INDEX_NEXT_3]] = add nuw nsw i64 [[INDEX6]], 8
+; FVW2-NEXT:    [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT_3]], 4096
+; FVW2-NEXT:    br i1 [[TMP40]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
+; FVW2:       for.end:
+; FVW2-NEXT:    ret void
+;
+entry:
+  %in.addr = alloca float*, align 8
+  %out.addr = alloca float*, align 8
+  %trigger.addr = alloca i32*, align 8
+  %index.addr = alloca i32*, align 8
+  %i = alloca i32, align 4
+  store float* %in, float** %in.addr, align 8
+  store float* %out, float** %out.addr, align 8
+  store i32* %trigger, i32** %trigger.addr, align 8
+  store i32* %index, i32** %index.addr, align 8
+  store i32 0, i32* %i, align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %0 = load i32, i32* %i, align 4
+  %cmp = icmp slt i32 %0, 4096
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %1 = load i32, i32* %i, align 4
+  %idxprom = sext i32 %1 to i64
+  %2 = load i32*, i32** %trigger.addr, align 8
+  %arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom
+  %3 = load i32, i32* %arrayidx, align 4
+  %cmp1 = icmp sgt i32 %3, 0
+  br i1 %cmp1, label %if.then, label %if.end
+
+if.then:                                          ; preds = %for.body
+  %4 = load i32, i32* %i, align 4
+  %idxprom2 = sext i32 %4 to i64
+  %5 = load i32*, i32** %index.addr, align 8
+  %arrayidx3 = getelementptr inbounds i32, i32* %5, i64 %idxprom2
+  %6 = load i32, i32* %arrayidx3, align 4
+  %idxprom4 = sext i32 %6 to i64
+  %7 = load float*, float** %in.addr, align 8
+  %arrayidx5 = getelementptr inbounds float, float* %7, i64 %idxprom4
+  %8 = load float, float* %arrayidx5, align 4
+  %add = fadd float %8, 5.000000e-01
+  %9 = load i32, i32* %i, align 4
+  %idxprom6 = sext i32 %9 to i64
+  %10 = load float*, float** %out.addr, align 8
+  %arrayidx7 = getelementptr inbounds float, float* %10, i64 %idxprom6
+  store float %add, float* %arrayidx7, align 4
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %for.body
+  br label %for.inc
+
+for.inc:                                          ; preds = %if.end
+  %11 = load i32, i32* %i, align 4
+  %inc = add nsw i32 %11, 1
+  store i32 %inc, i32* %i, align 4
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}
+
+; The source code
+;void foo2 (In * __restrict__ in, float * __restrict__ out, int * __restrict__ trigger) {
+;
+;  for (int i=0; i<SIZE; i += 16) {
+;    if (trigger[i] > 0) {
+;      out[i] = in[i].b + (float) 0.5;
+;    }
+;  }
+;}
+
+%struct.In = type { float, float }
+
+define void @foo2(%struct.In* noalias %in, float* noalias %out, i32* noalias %trigger, i32* noalias %index) #0 {
+; AVX512-LABEL: @foo2(
+; AVX512-NEXT:  entry:
+; AVX512-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], <16 x i64> <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112, i64 128, i64 144, i64 160, i64 176, i64 192, i64 208, i64 224, i64 240>
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP0]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT:    [[TMP1:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer
+; AVX512-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], %struct.In* [[IN:%.*]], <16 x i64> <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112, i64 128, i64 144, i64 160, i64 176, i64 192, i64 208, i64 224, i64 240>, i32 1
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER7:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP2]], i32 4, <16 x i1> [[TMP1]], <16 x float> undef)
+; AVX512-NEXT:    [[TMP3:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[OUT:%.*]], <16 x i64> <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112, i64 128, i64 144, i64 160, i64 176, i64 192, i64 208, i64 224, i64 240>
+; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP3]], <16 x float*> [[TMP4]], i32 4, <16 x i1> [[TMP1]])
+; AVX512-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 256, i64 272, i64 288, i64 304, i64 320, i64 336, i64 352, i64 368, i64 384, i64 400, i64 416, i64 432, i64 448, i64 464, i64 480, i64 496>
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER_1:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP5]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT:    [[TMP6:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_1]], zeroinitializer
+; AVX512-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 256, i64 272, i64 288, i64 304, i64 320, i64 336, i64 352, i64 368, i64 384, i64 400, i64 416, i64 432, i64 448, i64 464, i64 480, i64 496>, i32 1
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER7_1:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP7]], i32 4, <16 x i1> [[TMP6]], <16 x float> undef)
+; AVX512-NEXT:    [[TMP8:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_1]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT:    [[TMP9:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> <i64 256, i64 272, i64 288, i64 304, i64 320, i64 336, i64 352, i64 368, i64 384, i64 400, i64 416, i64 432, i64 448, i64 464, i64 480, i64 496>
+; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP8]], <16 x float*> [[TMP9]], i32 4, <16 x i1> [[TMP6]])
+; AVX512-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 512, i64 528, i64 544, i64 560, i64 576, i64 592, i64 608, i64 624, i64 640, i64 656, i64 672, i64 688, i64 704, i64 720, i64 736, i64 752>
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER_2:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP10]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT:    [[TMP11:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_2]], zeroinitializer
+; AVX512-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 512, i64 528, i64 544, i64 560, i64 576, i64 592, i64 608, i64 624, i64 640, i64 656, i64 672, i64 688, i64 704, i64 720, i64 736, i64 752>, i32 1
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER7_2:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP12]], i32 4, <16 x i1> [[TMP11]], <16 x float> undef)
+; AVX512-NEXT:    [[TMP13:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_2]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT:    [[TMP14:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> <i64 512, i64 528, i64 544, i64 560, i64 576, i64 592, i64 608, i64 624, i64 640, i64 656, i64 672, i64 688, i64 704, i64 720, i64 736, i64 752>
+; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP13]], <16 x float*> [[TMP14]], i32 4, <16 x i1> [[TMP11]])
+; AVX512-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 768, i64 784, i64 800, i64 816, i64 832, i64 848, i64 864, i64 880, i64 896, i64 912, i64 928, i64 944, i64 960, i64 976, i64 992, i64 1008>
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER_3:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP15]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT:    [[TMP16:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_3]], zeroinitializer
+; AVX512-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 768, i64 784, i64 800, i64 816, i64 832, i64 848, i64 864, i64 880, i64 896, i64 912, i64 928, i64 944, i64 960, i64 976, i64 992, i64 1008>, i32 1
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER7_3:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP17]], i32 4, <16 x i1> [[TMP16]], <16 x float> undef)
+; AVX512-NEXT:    [[TMP18:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_3]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT:    [[TMP19:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> <i64 768, i64 784, i64 800, i64 816, i64 832, i64 848, i64 864, i64 880, i64 896, i64 912, i64 928, i64 944, i64 960, i64 976, i64 992, i64 1008>
+; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP18]], <16 x float*> [[TMP19]], i32 4, <16 x i1> [[TMP16]])
+; AVX512-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 1024, i64 1040, i64 1056, i64 1072, i64 1088, i64 1104, i64 1120, i64 1136, i64 1152, i64 1168, i64 1184, i64 1200, i64 1216, i64 1232, i64 1248, i64 1264>
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER_4:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP20]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT:    [[TMP21:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_4]], zeroinitializer
+; AVX512-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 1024, i64 1040, i64 1056, i64 1072, i64 1088, i64 1104, i64 1120, i64 1136, i64 1152, i64 1168, i64 1184, i64 1200, i64 1216, i64 1232, i64 1248, i64 1264>, i32 1
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER7_4:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP22]], i32 4, <16 x i1> [[TMP21]], <16 x float> undef)
+; AVX512-NEXT:    [[TMP23:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_4]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT:    [[TMP24:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> <i64 1024, i64 1040, i64 1056, i64 1072, i64 1088, i64 1104, i64 1120, i64 1136, i64 1152, i64 1168, i64 1184, i64 1200, i64 1216, i64 1232, i64 1248, i64 1264>
+; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP23]], <16 x float*> [[TMP24]], i32 4, <16 x i1> [[TMP21]])
+; AVX512-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 1280, i64 1296, i64 1312, i64 1328, i64 1344, i64 1360, i64 1376, i64 1392, i64 1408, i64 1424, i64 1440, i64 1456, i64 1472, i64 1488, i64 1504, i64 1520>
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER_5:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP25]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT:    [[TMP26:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_5]], zeroinitializer
+; AVX512-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 1280, i64 1296, i64 1312, i64 1328, i64 1344, i64 1360, i64 1376, i64 1392, i64 1408, i64 1424, i64 1440, i64 1456, i64 1472, i64 1488, i64 1504, i64 1520>, i32 1
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER7_5:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP27]], i32 4, <16 x i1> [[TMP26]], <16 x float> undef)
+; AVX512-NEXT:    [[TMP28:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_5]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT:    [[TMP29:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> <i64 1280, i64 1296, i64 1312, i64 1328, i64 1344, i64 1360, i64 1376, i64 1392, i64 1408, i64 1424, i64 1440, i64 1456, i64 1472, i64 1488, i64 1504, i64 1520>
+; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP28]], <16 x float*> [[TMP29]], i32 4, <16 x i1> [[TMP26]])
+; AVX512-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 1536, i64 1552, i64 1568, i64 1584, i64 1600, i64 1616, i64 1632, i64 1648, i64 1664, i64 1680, i64 1696, i64 1712, i64 1728, i64 1744, i64 1760, i64 1776>
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER_6:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP30]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT:    [[TMP31:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_6]], zeroinitializer
+; AVX512-NEXT:    [[TMP32:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 1536, i64 1552, i64 1568, i64 1584, i64 1600, i64 1616, i64 1632, i64 1648, i64 1664, i64 1680, i64 1696, i64 1712, i64 1728, i64 1744, i64 1760, i64 1776>, i32 1
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER7_6:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP32]], i32 4, <16 x i1> [[TMP31]], <16 x float> undef)
+; AVX512-NEXT:    [[TMP33:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_6]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT:    [[TMP34:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> <i64 1536, i64 1552, i64 1568, i64 1584, i64 1600, i64 1616, i64 1632, i64 1648, i64 1664, i64 1680, i64 1696, i64 1712, i64 1728, i64 1744, i64 1760, i64 1776>
+; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP33]], <16 x float*> [[TMP34]], i32 4, <16 x i1> [[TMP31]])
+; AVX512-NEXT:    [[TMP35:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 1792, i64 1808, i64 1824, i64 1840, i64 1856, i64 1872, i64 1888, i64 1904, i64 1920, i64 1936, i64 1952, i64 1968, i64 1984, i64 2000, i64 2016, i64 2032>
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER_7:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP35]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT:    [[TMP36:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_7]], zeroinitializer
+; AVX512-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 1792, i64 1808, i64 1824, i64 1840, i64 1856, i64 1872, i64 1888, i64 1904, i64 1920, i64 1936, i64 1952, i64 1968, i64 1984, i64 2000, i64 2016, i64 2032>, i32 1
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER7_7:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP37]], i32 4, <16 x i1> [[TMP36]], <16 x float> undef)
+; AVX512-NEXT:    [[TMP38:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_7]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT:    [[TMP39:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> <i64 1792, i64 1808, i64 1824, i64 1840, i64 1856, i64 1872, i64 1888, i64 1904, i64 1920, i64 1936, i64 1952, i64 1968, i64 1984, i64 2000, i64 2016, i64 2032>
+; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP38]], <16 x float*> [[TMP39]], i32 4, <16 x i1> [[TMP36]])
+; AVX512-NEXT:    [[TMP40:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 2048, i64 2064, i64 2080, i64 2096, i64 2112, i64 2128, i64 2144, i64 2160, i64 2176, i64 2192, i64 2208, i64 2224, i64 2240, i64 2256, i64 2272, i64 2288>
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER_8:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP40]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT:    [[TMP41:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_8]], zeroinitializer
+; AVX512-NEXT:    [[TMP42:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 2048, i64 2064, i64 2080, i64 2096, i64 2112, i64 2128, i64 2144, i64 2160, i64 2176, i64 2192, i64 2208, i64 2224, i64 2240, i64 2256, i64 2272, i64 2288>, i32 1
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER7_8:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP42]], i32 4, <16 x i1> [[TMP41]], <16 x float> undef)
+; AVX512-NEXT:    [[TMP43:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_8]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT:    [[TMP44:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> <i64 2048, i64 2064, i64 2080, i64 2096, i64 2112, i64 2128, i64 2144, i64 2160, i64 2176, i64 2192, i64 2208, i64 2224, i64 2240, i64 2256, i64 2272, i64 2288>
+; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP43]], <16 x float*> [[TMP44]], i32 4, <16 x i1> [[TMP41]])
+; AVX512-NEXT:    [[TMP45:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 2304, i64 2320, i64 2336, i64 2352, i64 2368, i64 2384, i64 2400, i64 2416, i64 2432, i64 2448, i64 2464, i64 2480, i64 2496, i64 2512, i64 2528, i64 2544>
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER_9:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP45]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT:    [[TMP46:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_9]], zeroinitializer
+; AVX512-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 2304, i64 2320, i64 2336, i64 2352, i64 2368, i64 2384, i64 2400, i64 2416, i64 2432, i64 2448, i64 2464, i64 2480, i64 2496, i64 2512, i64 2528, i64 2544>, i32 1
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER7_9:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP47]], i32 4, <16 x i1> [[TMP46]], <16 x float> undef)
+; AVX512-NEXT:    [[TMP48:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_9]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT:    [[TMP49:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> <i64 2304, i64 2320, i64 2336, i64 2352, i64 2368, i64 2384, i64 2400, i64 2416, i64 2432, i64 2448, i64 2464, i64 2480, i64 2496, i64 2512, i64 2528, i64 2544>
+; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP48]], <16 x float*> [[TMP49]], i32 4, <16 x i1> [[TMP46]])
+; AVX512-NEXT:    [[TMP50:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 2560, i64 2576, i64 2592, i64 2608, i64 2624, i64 2640, i64 2656, i64 2672, i64 2688, i64 2704, i64 2720, i64 2736, i64 2752, i64 2768, i64 2784, i64 2800>
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER_10:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP50]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT:    [[TMP51:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_10]], zeroinitializer
+; AVX512-NEXT:    [[TMP52:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 2560, i64 2576, i64 2592, i64 2608, i64 2624, i64 2640, i64 2656, i64 2672, i64 2688, i64 2704, i64 2720, i64 2736, i64 2752, i64 2768, i64 2784, i64 2800>, i32 1
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER7_10:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP52]], i32 4, <16 x i1> [[TMP51]], <16 x float> undef)
+; AVX512-NEXT:    [[TMP53:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_10]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT:    [[TMP54:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> <i64 2560, i64 2576, i64 2592, i64 2608, i64 2624, i64 2640, i64 2656, i64 2672, i64 2688, i64 2704, i64 2720, i64 2736, i64 2752, i64 2768, i64 2784, i64 2800>
+; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP53]], <16 x float*> [[TMP54]], i32 4, <16 x i1> [[TMP51]])
+; AVX512-NEXT:    [[TMP55:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 2816, i64 2832, i64 2848, i64 2864, i64 2880, i64 2896, i64 2912, i64 2928, i64 2944, i64 2960, i64 2976, i64 2992, i64 3008, i64 3024, i64 3040, i64 3056>
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER_11:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP55]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT:    [[TMP56:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_11]], zeroinitializer
+; AVX512-NEXT:    [[TMP57:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 2816, i64 2832, i64 2848, i64 2864, i64 2880, i64 2896, i64 2912, i64 2928, i64 2944, i64 2960, i64 2976, i64 2992, i64 3008, i64 3024, i64 3040, i64 3056>, i32 1
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER7_11:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP57]], i32 4, <16 x i1> [[TMP56]], <16 x float> undef)
+; AVX512-NEXT:    [[TMP58:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_11]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT:    [[TMP59:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> <i64 2816, i64 2832, i64 2848, i64 2864, i64 2880, i64 2896, i64 2912, i64 2928, i64 2944, i64 2960, i64 2976, i64 2992, i64 3008, i64 3024, i64 3040, i64 3056>
+; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP58]], <16 x float*> [[TMP59]], i32 4, <16 x i1> [[TMP56]])
+; AVX512-NEXT:    [[TMP60:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 3072, i64 3088, i64 3104, i64 3120, i64 3136, i64 3152, i64 3168, i64 3184, i64 3200, i64 3216, i64 3232, i64 3248, i64 3264, i64 3280, i64 3296, i64 3312>
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER_12:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP60]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT:    [[TMP61:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_12]], zeroinitializer
+; AVX512-NEXT:    [[TMP62:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 3072, i64 3088, i64 3104, i64 3120, i64 3136, i64 3152, i64 3168, i64 3184, i64 3200, i64 3216, i64 3232, i64 3248, i64 3264, i64 3280, i64 3296, i64 3312>, i32 1
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER7_12:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP62]], i32 4, <16 x i1> [[TMP61]], <16 x float> undef)
+; AVX512-NEXT:    [[TMP63:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_12]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT:    [[TMP64:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> <i64 3072, i64 3088, i64 3104, i64 3120, i64 3136, i64 3152, i64 3168, i64 3184, i64 3200, i64 3216, i64 3232, i64 3248, i64 3264, i64 3280, i64 3296, i64 3312>
+; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP63]], <16 x float*> [[TMP64]], i32 4, <16 x i1> [[TMP61]])
+; AVX512-NEXT:    [[TMP65:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 3328, i64 3344, i64 3360, i64 3376, i64 3392, i64 3408, i64 3424, i64 3440, i64 3456, i64 3472, i64 3488, i64 3504, i64 3520, i64 3536, i64 3552, i64 3568>
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER_13:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP65]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT:    [[TMP66:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_13]], zeroinitializer
+; AVX512-NEXT:    [[TMP67:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 3328, i64 3344, i64 3360, i64 3376, i64 3392, i64 3408, i64 3424, i64 3440, i64 3456, i64 3472, i64 3488, i64 3504, i64 3520, i64 3536, i64 3552, i64 3568>, i32 1
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER7_13:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP67]], i32 4, <16 x i1> [[TMP66]], <16 x float> undef)
+; AVX512-NEXT:    [[TMP68:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_13]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT:    [[TMP69:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> <i64 3328, i64 3344, i64 3360, i64 3376, i64 3392, i64 3408, i64 3424, i64 3440, i64 3456, i64 3472, i64 3488, i64 3504, i64 3520, i64 3536, i64 3552, i64 3568>
+; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP68]], <16 x float*> [[TMP69]], i32 4, <16 x i1> [[TMP66]])
+; AVX512-NEXT:    [[TMP70:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 3584, i64 3600, i64 3616, i64 3632, i64 3648, i64 3664, i64 3680, i64 3696, i64 3712, i64 3728, i64 3744, i64 3760, i64 3776, i64 3792, i64 3808, i64 3824>
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER_14:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP70]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT:    [[TMP71:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_14]], zeroinitializer
+; AVX512-NEXT:    [[TMP72:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 3584, i64 3600, i64 3616, i64 3632, i64 3648, i64 3664, i64 3680, i64 3696, i64 3712, i64 3728, i64 3744, i64 3760, i64 3776, i64 3792, i64 3808, i64 3824>, i32 1
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER7_14:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP72]], i32 4, <16 x i1> [[TMP71]], <16 x float> undef)
+; AVX512-NEXT:    [[TMP73:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_14]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT:    [[TMP74:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> <i64 3584, i64 3600, i64 3616, i64 3632, i64 3648, i64 3664, i64 3680, i64 3696, i64 3712, i64 3728, i64 3744, i64 3760, i64 3776, i64 3792, i64 3808, i64 3824>
+; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP73]], <16 x float*> [[TMP74]], i32 4, <16 x i1> [[TMP71]])
+; AVX512-NEXT:    [[TMP75:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 3840, i64 3856, i64 3872, i64 3888, i64 3904, i64 3920, i64 3936, i64 3952, i64 3968, i64 3984, i64 4000, i64 4016, i64 4032, i64 4048, i64 4064, i64 4080>
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER_15:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP75]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT:    [[TMP76:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_15]], zeroinitializer
+; AVX512-NEXT:    [[TMP77:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 3840, i64 3856, i64 3872, i64 3888, i64 3904, i64 3920, i64 3936, i64 3952, i64 3968, i64 3984, i64 4000, i64 4016, i64 4032, i64 4048, i64 4064, i64 4080>, i32 1
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER7_15:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP77]], i32 4, <16 x i1> [[TMP76]], <16 x float> undef)
+; AVX512-NEXT:    [[TMP78:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_15]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT:    [[TMP79:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> <i64 3840, i64 3856, i64 3872, i64 3888, i64 3904, i64 3920, i64 3936, i64 3952, i64 3968, i64 3984, i64 4000, i64 4016, i64 4032, i64 4048, i64 4064, i64 4080>
+; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP78]], <16 x float*> [[TMP79]], i32 4, <16 x i1> [[TMP76]])
+; AVX512-NEXT:    ret void
+;
+; FVW2-LABEL: @foo2(
+; FVW2-NEXT:  entry:
+; FVW2-NEXT:    br label [[VECTOR_BODY:%.*]]
+; FVW2:       vector.body:
+; FVW2-NEXT:    [[INDEX6:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE30:%.*]] ]
+; FVW2-NEXT:    [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 16>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE30]] ]
+; FVW2-NEXT:    [[STEP_ADD:%.*]] = add <2 x i64> [[VEC_IND]], <i64 32, i64 32>
+; FVW2-NEXT:    [[STEP_ADD7:%.*]] = add <2 x i64> [[VEC_IND]], <i64 64, i64 64>
+; FVW2-NEXT:    [[STEP_ADD8:%.*]] = add <2 x i64> [[VEC_IND]], <i64 96, i64 96>
+; FVW2-NEXT:    [[OFFSET_IDX:%.*]] = shl i64 [[INDEX6]], 4
+; FVW2-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], <2 x i64> [[VEC_IND]]
+; FVW2-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <2 x i64> [[STEP_ADD]]
+; FVW2-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <2 x i64> [[STEP_ADD7]]
+; FVW2-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <2 x i64> [[STEP_ADD8]]
+; FVW2-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP0]], i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef)
+; FVW2-NEXT:    [[WIDE_MASKED_GATHER10:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP1]], i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef)
+; FVW2-NEXT:    [[WIDE_MASKED_GATHER11:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP2]], i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef)
+; FVW2-NEXT:    [[WIDE_MASKED_GATHER12:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP3]], i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef)
+; FVW2-NEXT:    [[TMP4:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer
+; FVW2-NEXT:    [[TMP5:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER10]], zeroinitializer
+; FVW2-NEXT:    [[TMP6:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER11]], zeroinitializer
+; FVW2-NEXT:    [[TMP7:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER12]], zeroinitializer
+; FVW2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], %struct.In* [[IN:%.*]], <2 x i64> [[VEC_IND]], i32 1
+; FVW2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <2 x i64> [[STEP_ADD]], i32 1
+; FVW2-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <2 x i64> [[STEP_ADD7]], i32 1
+; FVW2-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <2 x i64> [[STEP_ADD8]], i32 1
+; FVW2-NEXT:    [[WIDE_MASKED_GATHER13:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP8]], i32 4, <2 x i1> [[TMP4]], <2 x float> undef)
+; FVW2-NEXT:    [[WIDE_MASKED_GATHER14:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP9]], i32 4, <2 x i1> [[TMP5]], <2 x float> undef)
+; FVW2-NEXT:    [[WIDE_MASKED_GATHER15:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP10]], i32 4, <2 x i1> [[TMP6]], <2 x float> undef)
+; FVW2-NEXT:    [[WIDE_MASKED_GATHER16:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP11]], i32 4, <2 x i1> [[TMP7]], <2 x float> undef)
+; FVW2-NEXT:    [[TMP12:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER13]], <float 5.000000e-01, float 5.000000e-01>
+; FVW2-NEXT:    [[TMP13:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER14]], <float 5.000000e-01, float 5.000000e-01>
+; FVW2-NEXT:    [[TMP14:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER15]], <float 5.000000e-01, float 5.000000e-01>
+; FVW2-NEXT:    [[TMP15:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER16]], <float 5.000000e-01, float 5.000000e-01>
+; FVW2-NEXT:    [[TMP16:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0
+; FVW2-NEXT:    br i1 [[TMP16]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; FVW2:       pred.store.if:
+; FVW2-NEXT:    [[TMP17:%.*]] = getelementptr inbounds float, float* [[OUT:%.*]], i64 [[OFFSET_IDX]]
+; FVW2-NEXT:    [[TMP18:%.*]] = extractelement <2 x float> [[TMP12]], i32 0
+; FVW2-NEXT:    store float [[TMP18]], float* [[TMP17]], align 4
+; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE]]
+; FVW2:       pred.store.continue:
+; FVW2-NEXT:    [[TMP19:%.*]] = extractelement <2 x i1> [[TMP4]], i32 1
+; FVW2-NEXT:    br i1 [[TMP19]], label [[PRED_STORE_IF17:%.*]], label [[PRED_STORE_CONTINUE18:%.*]]
+; FVW2:       pred.store.if17:
+; FVW2-NEXT:    [[TMP20:%.*]] = or i64 [[OFFSET_IDX]], 16
+; FVW2-NEXT:    [[TMP21:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP20]]
+; FVW2-NEXT:    [[TMP22:%.*]] = extractelement <2 x float> [[TMP12]], i32 1
+; FVW2-NEXT:    store float [[TMP22]], float* [[TMP21]], align 4
+; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE18]]
+; FVW2:       pred.store.continue18:
+; FVW2-NEXT:    [[TMP23:%.*]] = extractelement <2 x i1> [[TMP5]], i32 0
+; FVW2-NEXT:    br i1 [[TMP23]], label [[PRED_STORE_IF19:%.*]], label [[PRED_STORE_CONTINUE20:%.*]]
+; FVW2:       pred.store.if19:
+; FVW2-NEXT:    [[TMP24:%.*]] = or i64 [[OFFSET_IDX]], 32
+; FVW2-NEXT:    [[TMP25:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP24]]
+; FVW2-NEXT:    [[TMP26:%.*]] = extractelement <2 x float> [[TMP13]], i32 0
+; FVW2-NEXT:    store float [[TMP26]], float* [[TMP25]], align 4
+; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE20]]
+; FVW2:       pred.store.continue20:
+; FVW2-NEXT:    [[TMP27:%.*]] = extractelement <2 x i1> [[TMP5]], i32 1
+; FVW2-NEXT:    br i1 [[TMP27]], label [[PRED_STORE_IF21:%.*]], label [[PRED_STORE_CONTINUE22:%.*]]
+; FVW2:       pred.store.if21:
+; FVW2-NEXT:    [[TMP28:%.*]] = or i64 [[OFFSET_IDX]], 48
+; FVW2-NEXT:    [[TMP29:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP28]]
+; FVW2-NEXT:    [[TMP30:%.*]] = extractelement <2 x float> [[TMP13]], i32 1
+; FVW2-NEXT:    store float [[TMP30]], float* [[TMP29]], align 4
+; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE22]]
+; FVW2:       pred.store.continue22:
+; FVW2-NEXT:    [[TMP31:%.*]] = extractelement <2 x i1> [[TMP6]], i32 0
+; FVW2-NEXT:    br i1 [[TMP31]], label [[PRED_STORE_IF23:%.*]], label [[PRED_STORE_CONTINUE24:%.*]]
+; FVW2:       pred.store.if23:
+; FVW2-NEXT:    [[TMP32:%.*]] = or i64 [[OFFSET_IDX]], 64
+; FVW2-NEXT:    [[TMP33:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP32]]
+; FVW2-NEXT:    [[TMP34:%.*]] = extractelement <2 x float> [[TMP14]], i32 0
+; FVW2-NEXT:    store float [[TMP34]], float* [[TMP33]], align 4
+; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE24]]
+; FVW2:       pred.store.continue24:
+; FVW2-NEXT:    [[TMP35:%.*]] = extractelement <2 x i1> [[TMP6]], i32 1
+; FVW2-NEXT:    br i1 [[TMP35]], label [[PRED_STORE_IF25:%.*]], label [[PRED_STORE_CONTINUE26:%.*]]
+; FVW2:       pred.store.if25:
+; FVW2-NEXT:    [[TMP36:%.*]] = or i64 [[OFFSET_IDX]], 80
+; FVW2-NEXT:    [[TMP37:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP36]]
+; FVW2-NEXT:    [[TMP38:%.*]] = extractelement <2 x float> [[TMP14]], i32 1
+; FVW2-NEXT:    store float [[TMP38]], float* [[TMP37]], align 4
+; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE26]]
+; FVW2:       pred.store.continue26:
+; FVW2-NEXT:    [[TMP39:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0
+; FVW2-NEXT:    br i1 [[TMP39]], label [[PRED_STORE_IF27:%.*]], label [[PRED_STORE_CONTINUE28:%.*]]
+; FVW2:       pred.store.if27:
+; FVW2-NEXT:    [[TMP40:%.*]] = or i64 [[OFFSET_IDX]], 96
+; FVW2-NEXT:    [[TMP41:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP40]]
+; FVW2-NEXT:    [[TMP42:%.*]] = extractelement <2 x float> [[TMP15]], i32 0
+; FVW2-NEXT:    store float [[TMP42]], float* [[TMP41]], align 4
+; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE28]]
+; FVW2:       pred.store.continue28:
+; FVW2-NEXT:    [[TMP43:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1
+; FVW2-NEXT:    br i1 [[TMP43]], label [[PRED_STORE_IF29:%.*]], label [[PRED_STORE_CONTINUE30]]
+; FVW2:       pred.store.if29:
+; FVW2-NEXT:    [[TMP44:%.*]] = or i64 [[OFFSET_IDX]], 112
+; FVW2-NEXT:    [[TMP45:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP44]]
+; FVW2-NEXT:    [[TMP46:%.*]] = extractelement <2 x float> [[TMP15]], i32 1
+; FVW2-NEXT:    store float [[TMP46]], float* [[TMP45]], align 4
+; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE30]]
+; FVW2:       pred.store.continue30:
+; FVW2-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX6]], 8
+; FVW2-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], <i64 128, i64 128>
+; FVW2-NEXT:    [[TMP47:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
+; FVW2-NEXT:    br i1 [[TMP47]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop !2
+; FVW2:       for.end:
+; FVW2-NEXT:    ret void
+;
+entry:
+  %in.addr = alloca %struct.In*, align 8
+  %out.addr = alloca float*, align 8
+  %trigger.addr = alloca i32*, align 8
+  %index.addr = alloca i32*, align 8
+  %i = alloca i32, align 4
+  store %struct.In* %in, %struct.In** %in.addr, align 8
+  store float* %out, float** %out.addr, align 8
+  store i32* %trigger, i32** %trigger.addr, align 8
+  store i32* %index, i32** %index.addr, align 8
+  store i32 0, i32* %i, align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %0 = load i32, i32* %i, align 4
+  %cmp = icmp slt i32 %0, 4096
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %1 = load i32, i32* %i, align 4
+  %idxprom = sext i32 %1 to i64
+  %2 = load i32*, i32** %trigger.addr, align 8
+  %arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom
+  %3 = load i32, i32* %arrayidx, align 4
+  %cmp1 = icmp sgt i32 %3, 0
+  br i1 %cmp1, label %if.then, label %if.end
+
+if.then:                                          ; preds = %for.body
+  %4 = load i32, i32* %i, align 4
+  %idxprom2 = sext i32 %4 to i64
+  %5 = load %struct.In*, %struct.In** %in.addr, align 8
+  %arrayidx3 = getelementptr inbounds %struct.In, %struct.In* %5, i64 %idxprom2
+  %b = getelementptr inbounds %struct.In, %struct.In* %arrayidx3, i32 0, i32 1
+  %6 = load float, float* %b, align 4
+  %add = fadd float %6, 5.000000e-01
+  %7 = load i32, i32* %i, align 4
+  %idxprom4 = sext i32 %7 to i64
+  %8 = load float*, float** %out.addr, align 8
+  %arrayidx5 = getelementptr inbounds float, float* %8, i64 %idxprom4
+  store float %add, float* %arrayidx5, align 4
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %for.body
+  br label %for.inc
+
+for.inc:                                          ; preds = %if.end
+  %9 = load i32, i32* %i, align 4
+  %inc = add nsw i32 %9, 16
+  store i32 %inc, i32* %i, align 4
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}
+
+; The source code
+;struct Out {
+;  float a;
+;  float b;
+;};
+;void foo3 (In * __restrict__ in, Out * __restrict__ out, int * __restrict__ trigger) {
+;
+;  for (int i=0; i<SIZE; i += 16) {
+;    if (trigger[i] > 0) {
+;      out[i].b = in[i].b + (float) 0.5;
+;    }
+;  }
+;}
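+;
+; A rough picture of what the AVX512 checks below expect (illustrative sketch
+; only, using the names from the source comment above): each 16-wide vector
+; iteration handles the strided indices i, i+16, ..., i+240 with masked
+; gathers/scatters over constant index vectors, roughly equivalent to
+;
+;   for (int lane = 0; lane < 16; ++lane) {
+;     int i = chunk_base + 16 * lane;   // chunk_base advances by 256 per chunk
+;     if (trigger[i] > 0)
+;       out[i].b = in[i].b + 0.5f;
+;   }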
+
+%struct.Out = type { float, float }
+
+define void @foo3(%struct.In* noalias %in, %struct.Out* noalias %out, i32* noalias %trigger) {
+; AVX512-LABEL: @foo3(
+; AVX512-NEXT:  entry:
+; AVX512-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], <16 x i64> <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112, i64 128, i64 144, i64 160, i64 176, i64 192, i64 208, i64 224, i64 240>
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP0]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT:    [[TMP1:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer
+; AVX512-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], %struct.In* [[IN:%.*]], <16 x i64> <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112, i64 128, i64 144, i64 160, i64 176, i64 192, i64 208, i64 224, i64 240>, i32 1
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER6:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP2]], i32 4, <16 x i1> [[TMP1]], <16 x float> undef)
+; AVX512-NEXT:    [[TMP3:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_OUT:%.*]], %struct.Out* [[OUT:%.*]], <16 x i64> <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112, i64 128, i64 144, i64 160, i64 176, i64 192, i64 208, i64 224, i64 240>, i32 1
+; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP3]], <16 x float*> [[TMP4]], i32 4, <16 x i1> [[TMP1]])
+; AVX512-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 256, i64 272, i64 288, i64 304, i64 320, i64 336, i64 352, i64 368, i64 384, i64 400, i64 416, i64 432, i64 448, i64 464, i64 480, i64 496>
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER_1:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP5]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT:    [[TMP6:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_1]], zeroinitializer
+; AVX512-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 256, i64 272, i64 288, i64 304, i64 320, i64 336, i64 352, i64 368, i64 384, i64 400, i64 416, i64 432, i64 448, i64 464, i64 480, i64 496>, i32 1
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER6_1:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP7]], i32 4, <16 x i1> [[TMP6]], <16 x float> undef)
+; AVX512-NEXT:    [[TMP8:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_1]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> <i64 256, i64 272, i64 288, i64 304, i64 320, i64 336, i64 352, i64 368, i64 384, i64 400, i64 416, i64 432, i64 448, i64 464, i64 480, i64 496>, i32 1
+; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP8]], <16 x float*> [[TMP9]], i32 4, <16 x i1> [[TMP6]])
+; AVX512-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 512, i64 528, i64 544, i64 560, i64 576, i64 592, i64 608, i64 624, i64 640, i64 656, i64 672, i64 688, i64 704, i64 720, i64 736, i64 752>
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER_2:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP10]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT:    [[TMP11:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_2]], zeroinitializer
+; AVX512-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 512, i64 528, i64 544, i64 560, i64 576, i64 592, i64 608, i64 624, i64 640, i64 656, i64 672, i64 688, i64 704, i64 720, i64 736, i64 752>, i32 1
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER6_2:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP12]], i32 4, <16 x i1> [[TMP11]], <16 x float> undef)
+; AVX512-NEXT:    [[TMP13:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_2]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> <i64 512, i64 528, i64 544, i64 560, i64 576, i64 592, i64 608, i64 624, i64 640, i64 656, i64 672, i64 688, i64 704, i64 720, i64 736, i64 752>, i32 1
+; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP13]], <16 x float*> [[TMP14]], i32 4, <16 x i1> [[TMP11]])
+; AVX512-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 768, i64 784, i64 800, i64 816, i64 832, i64 848, i64 864, i64 880, i64 896, i64 912, i64 928, i64 944, i64 960, i64 976, i64 992, i64 1008>
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER_3:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP15]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT:    [[TMP16:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_3]], zeroinitializer
+; AVX512-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 768, i64 784, i64 800, i64 816, i64 832, i64 848, i64 864, i64 880, i64 896, i64 912, i64 928, i64 944, i64 960, i64 976, i64 992, i64 1008>, i32 1
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER6_3:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP17]], i32 4, <16 x i1> [[TMP16]], <16 x float> undef)
+; AVX512-NEXT:    [[TMP18:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_3]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> <i64 768, i64 784, i64 800, i64 816, i64 832, i64 848, i64 864, i64 880, i64 896, i64 912, i64 928, i64 944, i64 960, i64 976, i64 992, i64 1008>, i32 1
+; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP18]], <16 x float*> [[TMP19]], i32 4, <16 x i1> [[TMP16]])
+; AVX512-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 1024, i64 1040, i64 1056, i64 1072, i64 1088, i64 1104, i64 1120, i64 1136, i64 1152, i64 1168, i64 1184, i64 1200, i64 1216, i64 1232, i64 1248, i64 1264>
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER_4:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP20]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT:    [[TMP21:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_4]], zeroinitializer
+; AVX512-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 1024, i64 1040, i64 1056, i64 1072, i64 1088, i64 1104, i64 1120, i64 1136, i64 1152, i64 1168, i64 1184, i64 1200, i64 1216, i64 1232, i64 1248, i64 1264>, i32 1
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER6_4:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP22]], i32 4, <16 x i1> [[TMP21]], <16 x float> undef)
+; AVX512-NEXT:    [[TMP23:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_4]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> <i64 1024, i64 1040, i64 1056, i64 1072, i64 1088, i64 1104, i64 1120, i64 1136, i64 1152, i64 1168, i64 1184, i64 1200, i64 1216, i64 1232, i64 1248, i64 1264>, i32 1
+; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP23]], <16 x float*> [[TMP24]], i32 4, <16 x i1> [[TMP21]])
+; AVX512-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 1280, i64 1296, i64 1312, i64 1328, i64 1344, i64 1360, i64 1376, i64 1392, i64 1408, i64 1424, i64 1440, i64 1456, i64 1472, i64 1488, i64 1504, i64 1520>
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER_5:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP25]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT:    [[TMP26:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_5]], zeroinitializer
+; AVX512-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 1280, i64 1296, i64 1312, i64 1328, i64 1344, i64 1360, i64 1376, i64 1392, i64 1408, i64 1424, i64 1440, i64 1456, i64 1472, i64 1488, i64 1504, i64 1520>, i32 1
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER6_5:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP27]], i32 4, <16 x i1> [[TMP26]], <16 x float> undef)
+; AVX512-NEXT:    [[TMP28:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_5]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> <i64 1280, i64 1296, i64 1312, i64 1328, i64 1344, i64 1360, i64 1376, i64 1392, i64 1408, i64 1424, i64 1440, i64 1456, i64 1472, i64 1488, i64 1504, i64 1520>, i32 1
+; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP28]], <16 x float*> [[TMP29]], i32 4, <16 x i1> [[TMP26]])
+; AVX512-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 1536, i64 1552, i64 1568, i64 1584, i64 1600, i64 1616, i64 1632, i64 1648, i64 1664, i64 1680, i64 1696, i64 1712, i64 1728, i64 1744, i64 1760, i64 1776>
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER_6:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP30]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT:    [[TMP31:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_6]], zeroinitializer
+; AVX512-NEXT:    [[TMP32:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 1536, i64 1552, i64 1568, i64 1584, i64 1600, i64 1616, i64 1632, i64 1648, i64 1664, i64 1680, i64 1696, i64 1712, i64 1728, i64 1744, i64 1760, i64 1776>, i32 1
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER6_6:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP32]], i32 4, <16 x i1> [[TMP31]], <16 x float> undef)
+; AVX512-NEXT:    [[TMP33:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_6]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT:    [[TMP34:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> <i64 1536, i64 1552, i64 1568, i64 1584, i64 1600, i64 1616, i64 1632, i64 1648, i64 1664, i64 1680, i64 1696, i64 1712, i64 1728, i64 1744, i64 1760, i64 1776>, i32 1
+; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP33]], <16 x float*> [[TMP34]], i32 4, <16 x i1> [[TMP31]])
+; AVX512-NEXT:    [[TMP35:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 1792, i64 1808, i64 1824, i64 1840, i64 1856, i64 1872, i64 1888, i64 1904, i64 1920, i64 1936, i64 1952, i64 1968, i64 1984, i64 2000, i64 2016, i64 2032>
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER_7:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP35]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT:    [[TMP36:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_7]], zeroinitializer
+; AVX512-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 1792, i64 1808, i64 1824, i64 1840, i64 1856, i64 1872, i64 1888, i64 1904, i64 1920, i64 1936, i64 1952, i64 1968, i64 1984, i64 2000, i64 2016, i64 2032>, i32 1
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER6_7:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP37]], i32 4, <16 x i1> [[TMP36]], <16 x float> undef)
+; AVX512-NEXT:    [[TMP38:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_7]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT:    [[TMP39:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> <i64 1792, i64 1808, i64 1824, i64 1840, i64 1856, i64 1872, i64 1888, i64 1904, i64 1920, i64 1936, i64 1952, i64 1968, i64 1984, i64 2000, i64 2016, i64 2032>, i32 1
+; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP38]], <16 x float*> [[TMP39]], i32 4, <16 x i1> [[TMP36]])
+; AVX512-NEXT:    [[TMP40:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 2048, i64 2064, i64 2080, i64 2096, i64 2112, i64 2128, i64 2144, i64 2160, i64 2176, i64 2192, i64 2208, i64 2224, i64 2240, i64 2256, i64 2272, i64 2288>
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER_8:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP40]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT:    [[TMP41:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_8]], zeroinitializer
+; AVX512-NEXT:    [[TMP42:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 2048, i64 2064, i64 2080, i64 2096, i64 2112, i64 2128, i64 2144, i64 2160, i64 2176, i64 2192, i64 2208, i64 2224, i64 2240, i64 2256, i64 2272, i64 2288>, i32 1
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER6_8:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP42]], i32 4, <16 x i1> [[TMP41]], <16 x float> undef)
+; AVX512-NEXT:    [[TMP43:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_8]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT:    [[TMP44:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> <i64 2048, i64 2064, i64 2080, i64 2096, i64 2112, i64 2128, i64 2144, i64 2160, i64 2176, i64 2192, i64 2208, i64 2224, i64 2240, i64 2256, i64 2272, i64 2288>, i32 1
+; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP43]], <16 x float*> [[TMP44]], i32 4, <16 x i1> [[TMP41]])
+; AVX512-NEXT:    [[TMP45:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 2304, i64 2320, i64 2336, i64 2352, i64 2368, i64 2384, i64 2400, i64 2416, i64 2432, i64 2448, i64 2464, i64 2480, i64 2496, i64 2512, i64 2528, i64 2544>
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER_9:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP45]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT:    [[TMP46:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_9]], zeroinitializer
+; AVX512-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 2304, i64 2320, i64 2336, i64 2352, i64 2368, i64 2384, i64 2400, i64 2416, i64 2432, i64 2448, i64 2464, i64 2480, i64 2496, i64 2512, i64 2528, i64 2544>, i32 1
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER6_9:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP47]], i32 4, <16 x i1> [[TMP46]], <16 x float> undef)
+; AVX512-NEXT:    [[TMP48:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_9]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT:    [[TMP49:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> <i64 2304, i64 2320, i64 2336, i64 2352, i64 2368, i64 2384, i64 2400, i64 2416, i64 2432, i64 2448, i64 2464, i64 2480, i64 2496, i64 2512, i64 2528, i64 2544>, i32 1
+; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP48]], <16 x float*> [[TMP49]], i32 4, <16 x i1> [[TMP46]])
+; AVX512-NEXT:    [[TMP50:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 2560, i64 2576, i64 2592, i64 2608, i64 2624, i64 2640, i64 2656, i64 2672, i64 2688, i64 2704, i64 2720, i64 2736, i64 2752, i64 2768, i64 2784, i64 2800>
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER_10:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP50]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT:    [[TMP51:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_10]], zeroinitializer
+; AVX512-NEXT:    [[TMP52:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 2560, i64 2576, i64 2592, i64 2608, i64 2624, i64 2640, i64 2656, i64 2672, i64 2688, i64 2704, i64 2720, i64 2736, i64 2752, i64 2768, i64 2784, i64 2800>, i32 1
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER6_10:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP52]], i32 4, <16 x i1> [[TMP51]], <16 x float> undef)
+; AVX512-NEXT:    [[TMP53:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_10]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT:    [[TMP54:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> <i64 2560, i64 2576, i64 2592, i64 2608, i64 2624, i64 2640, i64 2656, i64 2672, i64 2688, i64 2704, i64 2720, i64 2736, i64 2752, i64 2768, i64 2784, i64 2800>, i32 1
+; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP53]], <16 x float*> [[TMP54]], i32 4, <16 x i1> [[TMP51]])
+; AVX512-NEXT:    [[TMP55:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 2816, i64 2832, i64 2848, i64 2864, i64 2880, i64 2896, i64 2912, i64 2928, i64 2944, i64 2960, i64 2976, i64 2992, i64 3008, i64 3024, i64 3040, i64 3056>
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER_11:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP55]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT:    [[TMP56:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_11]], zeroinitializer
+; AVX512-NEXT:    [[TMP57:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 2816, i64 2832, i64 2848, i64 2864, i64 2880, i64 2896, i64 2912, i64 2928, i64 2944, i64 2960, i64 2976, i64 2992, i64 3008, i64 3024, i64 3040, i64 3056>, i32 1
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER6_11:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP57]], i32 4, <16 x i1> [[TMP56]], <16 x float> undef)
+; AVX512-NEXT:    [[TMP58:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_11]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT:    [[TMP59:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> <i64 2816, i64 2832, i64 2848, i64 2864, i64 2880, i64 2896, i64 2912, i64 2928, i64 2944, i64 2960, i64 2976, i64 2992, i64 3008, i64 3024, i64 3040, i64 3056>, i32 1
+; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP58]], <16 x float*> [[TMP59]], i32 4, <16 x i1> [[TMP56]])
+; AVX512-NEXT:    [[TMP60:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 3072, i64 3088, i64 3104, i64 3120, i64 3136, i64 3152, i64 3168, i64 3184, i64 3200, i64 3216, i64 3232, i64 3248, i64 3264, i64 3280, i64 3296, i64 3312>
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER_12:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP60]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT:    [[TMP61:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_12]], zeroinitializer
+; AVX512-NEXT:    [[TMP62:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 3072, i64 3088, i64 3104, i64 3120, i64 3136, i64 3152, i64 3168, i64 3184, i64 3200, i64 3216, i64 3232, i64 3248, i64 3264, i64 3280, i64 3296, i64 3312>, i32 1
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER6_12:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP62]], i32 4, <16 x i1> [[TMP61]], <16 x float> undef)
+; AVX512-NEXT:    [[TMP63:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_12]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT:    [[TMP64:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> <i64 3072, i64 3088, i64 3104, i64 3120, i64 3136, i64 3152, i64 3168, i64 3184, i64 3200, i64 3216, i64 3232, i64 3248, i64 3264, i64 3280, i64 3296, i64 3312>, i32 1
+; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP63]], <16 x float*> [[TMP64]], i32 4, <16 x i1> [[TMP61]])
+; AVX512-NEXT:    [[TMP65:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 3328, i64 3344, i64 3360, i64 3376, i64 3392, i64 3408, i64 3424, i64 3440, i64 3456, i64 3472, i64 3488, i64 3504, i64 3520, i64 3536, i64 3552, i64 3568>
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER_13:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP65]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT:    [[TMP66:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_13]], zeroinitializer
+; AVX512-NEXT:    [[TMP67:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 3328, i64 3344, i64 3360, i64 3376, i64 3392, i64 3408, i64 3424, i64 3440, i64 3456, i64 3472, i64 3488, i64 3504, i64 3520, i64 3536, i64 3552, i64 3568>, i32 1
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER6_13:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP67]], i32 4, <16 x i1> [[TMP66]], <16 x float> undef)
+; AVX512-NEXT:    [[TMP68:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_13]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT:    [[TMP69:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> <i64 3328, i64 3344, i64 3360, i64 3376, i64 3392, i64 3408, i64 3424, i64 3440, i64 3456, i64 3472, i64 3488, i64 3504, i64 3520, i64 3536, i64 3552, i64 3568>, i32 1
+; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP68]], <16 x float*> [[TMP69]], i32 4, <16 x i1> [[TMP66]])
+; AVX512-NEXT:    [[TMP70:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 3584, i64 3600, i64 3616, i64 3632, i64 3648, i64 3664, i64 3680, i64 3696, i64 3712, i64 3728, i64 3744, i64 3760, i64 3776, i64 3792, i64 3808, i64 3824>
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER_14:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP70]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT:    [[TMP71:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_14]], zeroinitializer
+; AVX512-NEXT:    [[TMP72:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 3584, i64 3600, i64 3616, i64 3632, i64 3648, i64 3664, i64 3680, i64 3696, i64 3712, i64 3728, i64 3744, i64 3760, i64 3776, i64 3792, i64 3808, i64 3824>, i32 1
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER6_14:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP72]], i32 4, <16 x i1> [[TMP71]], <16 x float> undef)
+; AVX512-NEXT:    [[TMP73:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_14]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT:    [[TMP74:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> <i64 3584, i64 3600, i64 3616, i64 3632, i64 3648, i64 3664, i64 3680, i64 3696, i64 3712, i64 3728, i64 3744, i64 3760, i64 3776, i64 3792, i64 3808, i64 3824>, i32 1
+; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP73]], <16 x float*> [[TMP74]], i32 4, <16 x i1> [[TMP71]])
+; AVX512-NEXT:    [[TMP75:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 3840, i64 3856, i64 3872, i64 3888, i64 3904, i64 3920, i64 3936, i64 3952, i64 3968, i64 3984, i64 4000, i64 4016, i64 4032, i64 4048, i64 4064, i64 4080>
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER_15:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP75]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT:    [[TMP76:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_15]], zeroinitializer
+; AVX512-NEXT:    [[TMP77:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 3840, i64 3856, i64 3872, i64 3888, i64 3904, i64 3920, i64 3936, i64 3952, i64 3968, i64 3984, i64 4000, i64 4016, i64 4032, i64 4048, i64 4064, i64 4080>, i32 1
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER6_15:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP77]], i32 4, <16 x i1> [[TMP76]], <16 x float> undef)
+; AVX512-NEXT:    [[TMP78:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_15]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT:    [[TMP79:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> <i64 3840, i64 3856, i64 3872, i64 3888, i64 3904, i64 3920, i64 3936, i64 3952, i64 3968, i64 3984, i64 4000, i64 4016, i64 4032, i64 4048, i64 4064, i64 4080>, i32 1
+; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP78]], <16 x float*> [[TMP79]], i32 4, <16 x i1> [[TMP76]])
+; AVX512-NEXT:    ret void
+;
+; FVW2-LABEL: @foo3(
+; FVW2-NEXT:  entry:
+; FVW2-NEXT:    br label [[VECTOR_BODY:%.*]]
+; FVW2:       vector.body:
+; FVW2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE29:%.*]] ]
+; FVW2-NEXT:    [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 16>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE29]] ]
+; FVW2-NEXT:    [[STEP_ADD:%.*]] = add <2 x i64> [[VEC_IND]], <i64 32, i64 32>
+; FVW2-NEXT:    [[STEP_ADD6:%.*]] = add <2 x i64> [[VEC_IND]], <i64 64, i64 64>
+; FVW2-NEXT:    [[STEP_ADD7:%.*]] = add <2 x i64> [[VEC_IND]], <i64 96, i64 96>
+; FVW2-NEXT:    [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 4
+; FVW2-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], <2 x i64> [[VEC_IND]]
+; FVW2-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <2 x i64> [[STEP_ADD]]
+; FVW2-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <2 x i64> [[STEP_ADD6]]
+; FVW2-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <2 x i64> [[STEP_ADD7]]
+; FVW2-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP0]], i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef)
+; FVW2-NEXT:    [[WIDE_MASKED_GATHER9:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP1]], i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef)
+; FVW2-NEXT:    [[WIDE_MASKED_GATHER10:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP2]], i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef)
+; FVW2-NEXT:    [[WIDE_MASKED_GATHER11:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP3]], i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef)
+; FVW2-NEXT:    [[TMP4:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer
+; FVW2-NEXT:    [[TMP5:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER9]], zeroinitializer
+; FVW2-NEXT:    [[TMP6:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER10]], zeroinitializer
+; FVW2-NEXT:    [[TMP7:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER11]], zeroinitializer
+; FVW2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], %struct.In* [[IN:%.*]], <2 x i64> [[VEC_IND]], i32 1
+; FVW2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <2 x i64> [[STEP_ADD]], i32 1
+; FVW2-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <2 x i64> [[STEP_ADD6]], i32 1
+; FVW2-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <2 x i64> [[STEP_ADD7]], i32 1
+; FVW2-NEXT:    [[WIDE_MASKED_GATHER12:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP8]], i32 4, <2 x i1> [[TMP4]], <2 x float> undef)
+; FVW2-NEXT:    [[WIDE_MASKED_GATHER13:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP9]], i32 4, <2 x i1> [[TMP5]], <2 x float> undef)
+; FVW2-NEXT:    [[WIDE_MASKED_GATHER14:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP10]], i32 4, <2 x i1> [[TMP6]], <2 x float> undef)
+; FVW2-NEXT:    [[WIDE_MASKED_GATHER15:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP11]], i32 4, <2 x i1> [[TMP7]], <2 x float> undef)
+; FVW2-NEXT:    [[TMP12:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER12]], <float 5.000000e-01, float 5.000000e-01>
+; FVW2-NEXT:    [[TMP13:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER13]], <float 5.000000e-01, float 5.000000e-01>
+; FVW2-NEXT:    [[TMP14:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER14]], <float 5.000000e-01, float 5.000000e-01>
+; FVW2-NEXT:    [[TMP15:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER15]], <float 5.000000e-01, float 5.000000e-01>
+; FVW2-NEXT:    [[TMP16:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0
+; FVW2-NEXT:    br i1 [[TMP16]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; FVW2:       pred.store.if:
+; FVW2-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT_OUT:%.*]], %struct.Out* [[OUT:%.*]], i64 [[OFFSET_IDX]], i32 1
+; FVW2-NEXT:    [[TMP18:%.*]] = extractelement <2 x float> [[TMP12]], i32 0
+; FVW2-NEXT:    store float [[TMP18]], float* [[TMP17]], align 4
+; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE]]
+; FVW2:       pred.store.continue:
+; FVW2-NEXT:    [[TMP19:%.*]] = extractelement <2 x i1> [[TMP4]], i32 1
+; FVW2-NEXT:    br i1 [[TMP19]], label [[PRED_STORE_IF16:%.*]], label [[PRED_STORE_CONTINUE17:%.*]]
+; FVW2:       pred.store.if16:
+; FVW2-NEXT:    [[TMP20:%.*]] = or i64 [[OFFSET_IDX]], 16
+; FVW2-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], i64 [[TMP20]], i32 1
+; FVW2-NEXT:    [[TMP22:%.*]] = extractelement <2 x float> [[TMP12]], i32 1
+; FVW2-NEXT:    store float [[TMP22]], float* [[TMP21]], align 4
+; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE17]]
+; FVW2:       pred.store.continue17:
+; FVW2-NEXT:    [[TMP23:%.*]] = extractelement <2 x i1> [[TMP5]], i32 0
+; FVW2-NEXT:    br i1 [[TMP23]], label [[PRED_STORE_IF18:%.*]], label [[PRED_STORE_CONTINUE19:%.*]]
+; FVW2:       pred.store.if18:
+; FVW2-NEXT:    [[TMP24:%.*]] = or i64 [[OFFSET_IDX]], 32
+; FVW2-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], i64 [[TMP24]], i32 1
+; FVW2-NEXT:    [[TMP26:%.*]] = extractelement <2 x float> [[TMP13]], i32 0
+; FVW2-NEXT:    store float [[TMP26]], float* [[TMP25]], align 4
+; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE19]]
+; FVW2:       pred.store.continue19:
+; FVW2-NEXT:    [[TMP27:%.*]] = extractelement <2 x i1> [[TMP5]], i32 1
+; FVW2-NEXT:    br i1 [[TMP27]], label [[PRED_STORE_IF20:%.*]], label [[PRED_STORE_CONTINUE21:%.*]]
+; FVW2:       pred.store.if20:
+; FVW2-NEXT:    [[TMP28:%.*]] = or i64 [[OFFSET_IDX]], 48
+; FVW2-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], i64 [[TMP28]], i32 1
+; FVW2-NEXT:    [[TMP30:%.*]] = extractelement <2 x float> [[TMP13]], i32 1
+; FVW2-NEXT:    store float [[TMP30]], float* [[TMP29]], align 4
+; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE21]]
+; FVW2:       pred.store.continue21:
+; FVW2-NEXT:    [[TMP31:%.*]] = extractelement <2 x i1> [[TMP6]], i32 0
+; FVW2-NEXT:    br i1 [[TMP31]], label [[PRED_STORE_IF22:%.*]], label [[PRED_STORE_CONTINUE23:%.*]]
+; FVW2:       pred.store.if22:
+; FVW2-NEXT:    [[TMP32:%.*]] = or i64 [[OFFSET_IDX]], 64
+; FVW2-NEXT:    [[TMP33:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], i64 [[TMP32]], i32 1
+; FVW2-NEXT:    [[TMP34:%.*]] = extractelement <2 x float> [[TMP14]], i32 0
+; FVW2-NEXT:    store float [[TMP34]], float* [[TMP33]], align 4
+; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE23]]
+; FVW2:       pred.store.continue23:
+; FVW2-NEXT:    [[TMP35:%.*]] = extractelement <2 x i1> [[TMP6]], i32 1
+; FVW2-NEXT:    br i1 [[TMP35]], label [[PRED_STORE_IF24:%.*]], label [[PRED_STORE_CONTINUE25:%.*]]
+; FVW2:       pred.store.if24:
+; FVW2-NEXT:    [[TMP36:%.*]] = or i64 [[OFFSET_IDX]], 80
+; FVW2-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], i64 [[TMP36]], i32 1
+; FVW2-NEXT:    [[TMP38:%.*]] = extractelement <2 x float> [[TMP14]], i32 1
+; FVW2-NEXT:    store float [[TMP38]], float* [[TMP37]], align 4
+; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE25]]
+; FVW2:       pred.store.continue25:
+; FVW2-NEXT:    [[TMP39:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0
+; FVW2-NEXT:    br i1 [[TMP39]], label [[PRED_STORE_IF26:%.*]], label [[PRED_STORE_CONTINUE27:%.*]]
+; FVW2:       pred.store.if26:
+; FVW2-NEXT:    [[TMP40:%.*]] = or i64 [[OFFSET_IDX]], 96
+; FVW2-NEXT:    [[TMP41:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], i64 [[TMP40]], i32 1
+; FVW2-NEXT:    [[TMP42:%.*]] = extractelement <2 x float> [[TMP15]], i32 0
+; FVW2-NEXT:    store float [[TMP42]], float* [[TMP41]], align 4
+; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE27]]
+; FVW2:       pred.store.continue27:
+; FVW2-NEXT:    [[TMP43:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1
+; FVW2-NEXT:    br i1 [[TMP43]], label [[PRED_STORE_IF28:%.*]], label [[PRED_STORE_CONTINUE29]]
+; FVW2:       pred.store.if28:
+; FVW2-NEXT:    [[TMP44:%.*]] = or i64 [[OFFSET_IDX]], 112
+; FVW2-NEXT:    [[TMP45:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], i64 [[TMP44]], i32 1
+; FVW2-NEXT:    [[TMP46:%.*]] = extractelement <2 x float> [[TMP15]], i32 1
+; FVW2-NEXT:    store float [[TMP46]], float* [[TMP45]], align 4
+; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE29]]
+; FVW2:       pred.store.continue29:
+; FVW2-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 8
+; FVW2-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], <i64 128, i64 128>
+; FVW2-NEXT:    [[TMP47:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
+; FVW2-NEXT:    br i1 [[TMP47]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop !3
+; FVW2:       for.end:
+; FVW2-NEXT:    ret void
+;
+entry:
+  %in.addr = alloca %struct.In*, align 8
+  %out.addr = alloca %struct.Out*, align 8
+  %trigger.addr = alloca i32*, align 8
+  %i = alloca i32, align 4
+  store %struct.In* %in, %struct.In** %in.addr, align 8
+  store %struct.Out* %out, %struct.Out** %out.addr, align 8
+  store i32* %trigger, i32** %trigger.addr, align 8
+  store i32 0, i32* %i, align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %0 = load i32, i32* %i, align 4
+  %cmp = icmp slt i32 %0, 4096
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %1 = load i32, i32* %i, align 4
+  %idxprom = sext i32 %1 to i64
+  %2 = load i32*, i32** %trigger.addr, align 8
+  %arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom
+  %3 = load i32, i32* %arrayidx, align 4
+  %cmp1 = icmp sgt i32 %3, 0
+  br i1 %cmp1, label %if.then, label %if.end
+
+if.then:                                          ; preds = %for.body
+  %4 = load i32, i32* %i, align 4
+  %idxprom2 = sext i32 %4 to i64
+  %5 = load %struct.In*, %struct.In** %in.addr, align 8
+  %arrayidx3 = getelementptr inbounds %struct.In, %struct.In* %5, i64 %idxprom2
+  %b = getelementptr inbounds %struct.In, %struct.In* %arrayidx3, i32 0, i32 1
+  %6 = load float, float* %b, align 4
+  %add = fadd float %6, 5.000000e-01
+  %7 = load i32, i32* %i, align 4
+  %idxprom4 = sext i32 %7 to i64
+  %8 = load %struct.Out*, %struct.Out** %out.addr, align 8
+  %arrayidx5 = getelementptr inbounds %struct.Out, %struct.Out* %8, i64 %idxprom4
+  %b6 = getelementptr inbounds %struct.Out, %struct.Out* %arrayidx5, i32 0, i32 1
+  store float %add, float* %b6, align 4
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %for.body
+  br label %for.inc
+
+for.inc:                                          ; preds = %if.end
+  %9 = load i32, i32* %i, align 4
+  %inc = add nsw i32 %9, 16
+  store i32 %inc, i32* %i, align 4
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}
+declare void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float>, <16 x float*>, i32, <16 x i1>)
+
+; The same as @foo2, but the scatter/gather arguments are vectors of pointers in addrspace(1).
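+; An illustrative sketch (not the original source) of the scalar loop this test
+; models, assuming the same 4096-iteration, stride-16 pattern seen in the IR below,
+; with %in and %out living in addrspace(1):
+;   for (int i = 0; i < 4096; i += 16)
+;     if (trigger[i] > 0)
+;       out[i] = in[i].b + 0.5f;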
+
+define void @foo2_addrspace(%struct.In addrspace(1)* noalias %in, float addrspace(1)* noalias %out, i32* noalias %trigger, i32* noalias %index) #0 {
+; AVX512-LABEL: @foo2_addrspace(
+; AVX512-NEXT:  entry:
+; AVX512-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], <16 x i64> <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112, i64 128, i64 144, i64 160, i64 176, i64 192, i64 208, i64 224, i64 240>
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP0]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT:    [[TMP1:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer
+; AVX512-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], [[STRUCT_IN]] addrspace(1)* [[IN:%.*]], <16 x i64> <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112, i64 128, i64 144, i64 160, i64 176, i64 192, i64 208, i64 224, i64 240>, i32 1
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER7:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP2]], i32 4, <16 x i1> [[TMP1]], <16 x float> undef)
+; AVX512-NEXT:    [[TMP3:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT:%.*]], <16 x i64> <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112, i64 128, i64 144, i64 160, i64 176, i64 192, i64 208, i64 224, i64 240>
+; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP3]], <16 x float addrspace(1)*> [[TMP4]], i32 4, <16 x i1> [[TMP1]])
+; AVX512-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 256, i64 272, i64 288, i64 304, i64 320, i64 336, i64 352, i64 368, i64 384, i64 400, i64 416, i64 432, i64 448, i64 464, i64 480, i64 496>
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER_1:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP5]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT:    [[TMP6:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_1]], zeroinitializer
+; AVX512-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> <i64 256, i64 272, i64 288, i64 304, i64 320, i64 336, i64 352, i64 368, i64 384, i64 400, i64 416, i64 432, i64 448, i64 464, i64 480, i64 496>, i32 1
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER7_1:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP7]], i32 4, <16 x i1> [[TMP6]], <16 x float> undef)
+; AVX512-NEXT:    [[TMP8:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_1]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT:    [[TMP9:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> <i64 256, i64 272, i64 288, i64 304, i64 320, i64 336, i64 352, i64 368, i64 384, i64 400, i64 416, i64 432, i64 448, i64 464, i64 480, i64 496>
+; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP8]], <16 x float addrspace(1)*> [[TMP9]], i32 4, <16 x i1> [[TMP6]])
+; AVX512-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 512, i64 528, i64 544, i64 560, i64 576, i64 592, i64 608, i64 624, i64 640, i64 656, i64 672, i64 688, i64 704, i64 720, i64 736, i64 752>
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER_2:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP10]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT:    [[TMP11:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_2]], zeroinitializer
+; AVX512-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> <i64 512, i64 528, i64 544, i64 560, i64 576, i64 592, i64 608, i64 624, i64 640, i64 656, i64 672, i64 688, i64 704, i64 720, i64 736, i64 752>, i32 1
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER7_2:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP12]], i32 4, <16 x i1> [[TMP11]], <16 x float> undef)
+; AVX512-NEXT:    [[TMP13:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_2]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT:    [[TMP14:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> <i64 512, i64 528, i64 544, i64 560, i64 576, i64 592, i64 608, i64 624, i64 640, i64 656, i64 672, i64 688, i64 704, i64 720, i64 736, i64 752>
+; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP13]], <16 x float addrspace(1)*> [[TMP14]], i32 4, <16 x i1> [[TMP11]])
+; AVX512-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 768, i64 784, i64 800, i64 816, i64 832, i64 848, i64 864, i64 880, i64 896, i64 912, i64 928, i64 944, i64 960, i64 976, i64 992, i64 1008>
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER_3:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP15]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT:    [[TMP16:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_3]], zeroinitializer
+; AVX512-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> <i64 768, i64 784, i64 800, i64 816, i64 832, i64 848, i64 864, i64 880, i64 896, i64 912, i64 928, i64 944, i64 960, i64 976, i64 992, i64 1008>, i32 1
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER7_3:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP17]], i32 4, <16 x i1> [[TMP16]], <16 x float> undef)
+; AVX512-NEXT:    [[TMP18:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_3]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT:    [[TMP19:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> <i64 768, i64 784, i64 800, i64 816, i64 832, i64 848, i64 864, i64 880, i64 896, i64 912, i64 928, i64 944, i64 960, i64 976, i64 992, i64 1008>
+; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP18]], <16 x float addrspace(1)*> [[TMP19]], i32 4, <16 x i1> [[TMP16]])
+; AVX512-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 1024, i64 1040, i64 1056, i64 1072, i64 1088, i64 1104, i64 1120, i64 1136, i64 1152, i64 1168, i64 1184, i64 1200, i64 1216, i64 1232, i64 1248, i64 1264>
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER_4:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP20]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT:    [[TMP21:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_4]], zeroinitializer
+; AVX512-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> <i64 1024, i64 1040, i64 1056, i64 1072, i64 1088, i64 1104, i64 1120, i64 1136, i64 1152, i64 1168, i64 1184, i64 1200, i64 1216, i64 1232, i64 1248, i64 1264>, i32 1
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER7_4:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP22]], i32 4, <16 x i1> [[TMP21]], <16 x float> undef)
+; AVX512-NEXT:    [[TMP23:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_4]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT:    [[TMP24:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> <i64 1024, i64 1040, i64 1056, i64 1072, i64 1088, i64 1104, i64 1120, i64 1136, i64 1152, i64 1168, i64 1184, i64 1200, i64 1216, i64 1232, i64 1248, i64 1264>
+; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP23]], <16 x float addrspace(1)*> [[TMP24]], i32 4, <16 x i1> [[TMP21]])
+; AVX512-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 1280, i64 1296, i64 1312, i64 1328, i64 1344, i64 1360, i64 1376, i64 1392, i64 1408, i64 1424, i64 1440, i64 1456, i64 1472, i64 1488, i64 1504, i64 1520>
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER_5:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP25]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT:    [[TMP26:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_5]], zeroinitializer
+; AVX512-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> <i64 1280, i64 1296, i64 1312, i64 1328, i64 1344, i64 1360, i64 1376, i64 1392, i64 1408, i64 1424, i64 1440, i64 1456, i64 1472, i64 1488, i64 1504, i64 1520>, i32 1
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER7_5:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP27]], i32 4, <16 x i1> [[TMP26]], <16 x float> undef)
+; AVX512-NEXT:    [[TMP28:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_5]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT:    [[TMP29:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> <i64 1280, i64 1296, i64 1312, i64 1328, i64 1344, i64 1360, i64 1376, i64 1392, i64 1408, i64 1424, i64 1440, i64 1456, i64 1472, i64 1488, i64 1504, i64 1520>
+; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP28]], <16 x float addrspace(1)*> [[TMP29]], i32 4, <16 x i1> [[TMP26]])
+; AVX512-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 1536, i64 1552, i64 1568, i64 1584, i64 1600, i64 1616, i64 1632, i64 1648, i64 1664, i64 1680, i64 1696, i64 1712, i64 1728, i64 1744, i64 1760, i64 1776>
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER_6:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP30]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT:    [[TMP31:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_6]], zeroinitializer
+; AVX512-NEXT:    [[TMP32:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> <i64 1536, i64 1552, i64 1568, i64 1584, i64 1600, i64 1616, i64 1632, i64 1648, i64 1664, i64 1680, i64 1696, i64 1712, i64 1728, i64 1744, i64 1760, i64 1776>, i32 1
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER7_6:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP32]], i32 4, <16 x i1> [[TMP31]], <16 x float> undef)
+; AVX512-NEXT:    [[TMP33:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_6]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT:    [[TMP34:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> <i64 1536, i64 1552, i64 1568, i64 1584, i64 1600, i64 1616, i64 1632, i64 1648, i64 1664, i64 1680, i64 1696, i64 1712, i64 1728, i64 1744, i64 1760, i64 1776>
+; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP33]], <16 x float addrspace(1)*> [[TMP34]], i32 4, <16 x i1> [[TMP31]])
+; AVX512-NEXT:    [[TMP35:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 1792, i64 1808, i64 1824, i64 1840, i64 1856, i64 1872, i64 1888, i64 1904, i64 1920, i64 1936, i64 1952, i64 1968, i64 1984, i64 2000, i64 2016, i64 2032>
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER_7:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP35]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT:    [[TMP36:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_7]], zeroinitializer
+; AVX512-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> <i64 1792, i64 1808, i64 1824, i64 1840, i64 1856, i64 1872, i64 1888, i64 1904, i64 1920, i64 1936, i64 1952, i64 1968, i64 1984, i64 2000, i64 2016, i64 2032>, i32 1
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER7_7:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP37]], i32 4, <16 x i1> [[TMP36]], <16 x float> undef)
+; AVX512-NEXT:    [[TMP38:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_7]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT:    [[TMP39:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> <i64 1792, i64 1808, i64 1824, i64 1840, i64 1856, i64 1872, i64 1888, i64 1904, i64 1920, i64 1936, i64 1952, i64 1968, i64 1984, i64 2000, i64 2016, i64 2032>
+; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP38]], <16 x float addrspace(1)*> [[TMP39]], i32 4, <16 x i1> [[TMP36]])
+; AVX512-NEXT:    [[TMP40:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 2048, i64 2064, i64 2080, i64 2096, i64 2112, i64 2128, i64 2144, i64 2160, i64 2176, i64 2192, i64 2208, i64 2224, i64 2240, i64 2256, i64 2272, i64 2288>
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER_8:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP40]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT:    [[TMP41:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_8]], zeroinitializer
+; AVX512-NEXT:    [[TMP42:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> <i64 2048, i64 2064, i64 2080, i64 2096, i64 2112, i64 2128, i64 2144, i64 2160, i64 2176, i64 2192, i64 2208, i64 2224, i64 2240, i64 2256, i64 2272, i64 2288>, i32 1
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER7_8:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP42]], i32 4, <16 x i1> [[TMP41]], <16 x float> undef)
+; AVX512-NEXT:    [[TMP43:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_8]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT:    [[TMP44:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> <i64 2048, i64 2064, i64 2080, i64 2096, i64 2112, i64 2128, i64 2144, i64 2160, i64 2176, i64 2192, i64 2208, i64 2224, i64 2240, i64 2256, i64 2272, i64 2288>
+; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP43]], <16 x float addrspace(1)*> [[TMP44]], i32 4, <16 x i1> [[TMP41]])
+; AVX512-NEXT:    [[TMP45:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 2304, i64 2320, i64 2336, i64 2352, i64 2368, i64 2384, i64 2400, i64 2416, i64 2432, i64 2448, i64 2464, i64 2480, i64 2496, i64 2512, i64 2528, i64 2544>
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER_9:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP45]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT:    [[TMP46:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_9]], zeroinitializer
+; AVX512-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> <i64 2304, i64 2320, i64 2336, i64 2352, i64 2368, i64 2384, i64 2400, i64 2416, i64 2432, i64 2448, i64 2464, i64 2480, i64 2496, i64 2512, i64 2528, i64 2544>, i32 1
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER7_9:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP47]], i32 4, <16 x i1> [[TMP46]], <16 x float> undef)
+; AVX512-NEXT:    [[TMP48:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_9]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT:    [[TMP49:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> <i64 2304, i64 2320, i64 2336, i64 2352, i64 2368, i64 2384, i64 2400, i64 2416, i64 2432, i64 2448, i64 2464, i64 2480, i64 2496, i64 2512, i64 2528, i64 2544>
+; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP48]], <16 x float addrspace(1)*> [[TMP49]], i32 4, <16 x i1> [[TMP46]])
+; AVX512-NEXT:    [[TMP50:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 2560, i64 2576, i64 2592, i64 2608, i64 2624, i64 2640, i64 2656, i64 2672, i64 2688, i64 2704, i64 2720, i64 2736, i64 2752, i64 2768, i64 2784, i64 2800>
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER_10:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP50]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT:    [[TMP51:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_10]], zeroinitializer
+; AVX512-NEXT:    [[TMP52:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> <i64 2560, i64 2576, i64 2592, i64 2608, i64 2624, i64 2640, i64 2656, i64 2672, i64 2688, i64 2704, i64 2720, i64 2736, i64 2752, i64 2768, i64 2784, i64 2800>, i32 1
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER7_10:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP52]], i32 4, <16 x i1> [[TMP51]], <16 x float> undef)
+; AVX512-NEXT:    [[TMP53:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_10]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT:    [[TMP54:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> <i64 2560, i64 2576, i64 2592, i64 2608, i64 2624, i64 2640, i64 2656, i64 2672, i64 2688, i64 2704, i64 2720, i64 2736, i64 2752, i64 2768, i64 2784, i64 2800>
+; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP53]], <16 x float addrspace(1)*> [[TMP54]], i32 4, <16 x i1> [[TMP51]])
+; AVX512-NEXT:    [[TMP55:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 2816, i64 2832, i64 2848, i64 2864, i64 2880, i64 2896, i64 2912, i64 2928, i64 2944, i64 2960, i64 2976, i64 2992, i64 3008, i64 3024, i64 3040, i64 3056>
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER_11:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP55]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT:    [[TMP56:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_11]], zeroinitializer
+; AVX512-NEXT:    [[TMP57:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> <i64 2816, i64 2832, i64 2848, i64 2864, i64 2880, i64 2896, i64 2912, i64 2928, i64 2944, i64 2960, i64 2976, i64 2992, i64 3008, i64 3024, i64 3040, i64 3056>, i32 1
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER7_11:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP57]], i32 4, <16 x i1> [[TMP56]], <16 x float> undef)
+; AVX512-NEXT:    [[TMP58:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_11]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT:    [[TMP59:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> <i64 2816, i64 2832, i64 2848, i64 2864, i64 2880, i64 2896, i64 2912, i64 2928, i64 2944, i64 2960, i64 2976, i64 2992, i64 3008, i64 3024, i64 3040, i64 3056>
+; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP58]], <16 x float addrspace(1)*> [[TMP59]], i32 4, <16 x i1> [[TMP56]])
+; AVX512-NEXT:    [[TMP60:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 3072, i64 3088, i64 3104, i64 3120, i64 3136, i64 3152, i64 3168, i64 3184, i64 3200, i64 3216, i64 3232, i64 3248, i64 3264, i64 3280, i64 3296, i64 3312>
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER_12:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP60]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT:    [[TMP61:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_12]], zeroinitializer
+; AVX512-NEXT:    [[TMP62:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> <i64 3072, i64 3088, i64 3104, i64 3120, i64 3136, i64 3152, i64 3168, i64 3184, i64 3200, i64 3216, i64 3232, i64 3248, i64 3264, i64 3280, i64 3296, i64 3312>, i32 1
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER7_12:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP62]], i32 4, <16 x i1> [[TMP61]], <16 x float> undef)
+; AVX512-NEXT:    [[TMP63:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_12]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT:    [[TMP64:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> <i64 3072, i64 3088, i64 3104, i64 3120, i64 3136, i64 3152, i64 3168, i64 3184, i64 3200, i64 3216, i64 3232, i64 3248, i64 3264, i64 3280, i64 3296, i64 3312>
+; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP63]], <16 x float addrspace(1)*> [[TMP64]], i32 4, <16 x i1> [[TMP61]])
+; AVX512-NEXT:    [[TMP65:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 3328, i64 3344, i64 3360, i64 3376, i64 3392, i64 3408, i64 3424, i64 3440, i64 3456, i64 3472, i64 3488, i64 3504, i64 3520, i64 3536, i64 3552, i64 3568>
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER_13:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP65]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT:    [[TMP66:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_13]], zeroinitializer
+; AVX512-NEXT:    [[TMP67:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> <i64 3328, i64 3344, i64 3360, i64 3376, i64 3392, i64 3408, i64 3424, i64 3440, i64 3456, i64 3472, i64 3488, i64 3504, i64 3520, i64 3536, i64 3552, i64 3568>, i32 1
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER7_13:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP67]], i32 4, <16 x i1> [[TMP66]], <16 x float> undef)
+; AVX512-NEXT:    [[TMP68:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_13]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT:    [[TMP69:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> <i64 3328, i64 3344, i64 3360, i64 3376, i64 3392, i64 3408, i64 3424, i64 3440, i64 3456, i64 3472, i64 3488, i64 3504, i64 3520, i64 3536, i64 3552, i64 3568>
+; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP68]], <16 x float addrspace(1)*> [[TMP69]], i32 4, <16 x i1> [[TMP66]])
+; AVX512-NEXT:    [[TMP70:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 3584, i64 3600, i64 3616, i64 3632, i64 3648, i64 3664, i64 3680, i64 3696, i64 3712, i64 3728, i64 3744, i64 3760, i64 3776, i64 3792, i64 3808, i64 3824>
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER_14:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP70]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT:    [[TMP71:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_14]], zeroinitializer
+; AVX512-NEXT:    [[TMP72:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> <i64 3584, i64 3600, i64 3616, i64 3632, i64 3648, i64 3664, i64 3680, i64 3696, i64 3712, i64 3728, i64 3744, i64 3760, i64 3776, i64 3792, i64 3808, i64 3824>, i32 1
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER7_14:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP72]], i32 4, <16 x i1> [[TMP71]], <16 x float> undef)
+; AVX512-NEXT:    [[TMP73:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_14]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT:    [[TMP74:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> <i64 3584, i64 3600, i64 3616, i64 3632, i64 3648, i64 3664, i64 3680, i64 3696, i64 3712, i64 3728, i64 3744, i64 3760, i64 3776, i64 3792, i64 3808, i64 3824>
+; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP73]], <16 x float addrspace(1)*> [[TMP74]], i32 4, <16 x i1> [[TMP71]])
+; AVX512-NEXT:    [[TMP75:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 3840, i64 3856, i64 3872, i64 3888, i64 3904, i64 3920, i64 3936, i64 3952, i64 3968, i64 3984, i64 4000, i64 4016, i64 4032, i64 4048, i64 4064, i64 4080>
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER_15:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP75]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT:    [[TMP76:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_15]], zeroinitializer
+; AVX512-NEXT:    [[TMP77:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> <i64 3840, i64 3856, i64 3872, i64 3888, i64 3904, i64 3920, i64 3936, i64 3952, i64 3968, i64 3984, i64 4000, i64 4016, i64 4032, i64 4048, i64 4064, i64 4080>, i32 1
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER7_15:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP77]], i32 4, <16 x i1> [[TMP76]], <16 x float> undef)
+; AVX512-NEXT:    [[TMP78:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_15]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT:    [[TMP79:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> <i64 3840, i64 3856, i64 3872, i64 3888, i64 3904, i64 3920, i64 3936, i64 3952, i64 3968, i64 3984, i64 4000, i64 4016, i64 4032, i64 4048, i64 4064, i64 4080>
+; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP78]], <16 x float addrspace(1)*> [[TMP79]], i32 4, <16 x i1> [[TMP76]])
+; AVX512-NEXT:    ret void
+;
+; FVW2-LABEL: @foo2_addrspace(
+; FVW2-NEXT:  entry:
+; FVW2-NEXT:    br label [[VECTOR_BODY:%.*]]
+; FVW2:       vector.body:
+; FVW2-NEXT:    [[INDEX6:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE30:%.*]] ]
+; FVW2-NEXT:    [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 16>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE30]] ]
+; FVW2-NEXT:    [[STEP_ADD:%.*]] = add <2 x i64> [[VEC_IND]], <i64 32, i64 32>
+; FVW2-NEXT:    [[STEP_ADD7:%.*]] = add <2 x i64> [[VEC_IND]], <i64 64, i64 64>
+; FVW2-NEXT:    [[STEP_ADD8:%.*]] = add <2 x i64> [[VEC_IND]], <i64 96, i64 96>
+; FVW2-NEXT:    [[OFFSET_IDX:%.*]] = shl i64 [[INDEX6]], 4
+; FVW2-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], <2 x i64> [[VEC_IND]]
+; FVW2-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <2 x i64> [[STEP_ADD]]
+; FVW2-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <2 x i64> [[STEP_ADD7]]
+; FVW2-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <2 x i64> [[STEP_ADD8]]
+; FVW2-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP0]], i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef)
+; FVW2-NEXT:    [[WIDE_MASKED_GATHER10:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP1]], i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef)
+; FVW2-NEXT:    [[WIDE_MASKED_GATHER11:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP2]], i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef)
+; FVW2-NEXT:    [[WIDE_MASKED_GATHER12:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP3]], i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef)
+; FVW2-NEXT:    [[TMP4:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer
+; FVW2-NEXT:    [[TMP5:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER10]], zeroinitializer
+; FVW2-NEXT:    [[TMP6:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER11]], zeroinitializer
+; FVW2-NEXT:    [[TMP7:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER12]], zeroinitializer
+; FVW2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], [[STRUCT_IN]] addrspace(1)* [[IN:%.*]], <2 x i64> [[VEC_IND]], i32 1
+; FVW2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <2 x i64> [[STEP_ADD]], i32 1
+; FVW2-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <2 x i64> [[STEP_ADD7]], i32 1
+; FVW2-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <2 x i64> [[STEP_ADD8]], i32 1
+; FVW2-NEXT:    [[WIDE_MASKED_GATHER13:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p1f32(<2 x float addrspace(1)*> [[TMP8]], i32 4, <2 x i1> [[TMP4]], <2 x float> undef)
+; FVW2-NEXT:    [[WIDE_MASKED_GATHER14:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p1f32(<2 x float addrspace(1)*> [[TMP9]], i32 4, <2 x i1> [[TMP5]], <2 x float> undef)
+; FVW2-NEXT:    [[WIDE_MASKED_GATHER15:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p1f32(<2 x float addrspace(1)*> [[TMP10]], i32 4, <2 x i1> [[TMP6]], <2 x float> undef)
+; FVW2-NEXT:    [[WIDE_MASKED_GATHER16:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p1f32(<2 x float addrspace(1)*> [[TMP11]], i32 4, <2 x i1> [[TMP7]], <2 x float> undef)
+; FVW2-NEXT:    [[TMP12:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER13]], <float 5.000000e-01, float 5.000000e-01>
+; FVW2-NEXT:    [[TMP13:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER14]], <float 5.000000e-01, float 5.000000e-01>
+; FVW2-NEXT:    [[TMP14:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER15]], <float 5.000000e-01, float 5.000000e-01>
+; FVW2-NEXT:    [[TMP15:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER16]], <float 5.000000e-01, float 5.000000e-01>
+; FVW2-NEXT:    [[TMP16:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0
+; FVW2-NEXT:    br i1 [[TMP16]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; FVW2:       pred.store.if:
+; FVW2-NEXT:    [[TMP17:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT:%.*]], i64 [[OFFSET_IDX]]
+; FVW2-NEXT:    [[TMP18:%.*]] = extractelement <2 x float> [[TMP12]], i32 0
+; FVW2-NEXT:    store float [[TMP18]], float addrspace(1)* [[TMP17]], align 4
+; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE]]
+; FVW2:       pred.store.continue:
+; FVW2-NEXT:    [[TMP19:%.*]] = extractelement <2 x i1> [[TMP4]], i32 1
+; FVW2-NEXT:    br i1 [[TMP19]], label [[PRED_STORE_IF17:%.*]], label [[PRED_STORE_CONTINUE18:%.*]]
+; FVW2:       pred.store.if17:
+; FVW2-NEXT:    [[TMP20:%.*]] = or i64 [[OFFSET_IDX]], 16
+; FVW2-NEXT:    [[TMP21:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP20]]
+; FVW2-NEXT:    [[TMP22:%.*]] = extractelement <2 x float> [[TMP12]], i32 1
+; FVW2-NEXT:    store float [[TMP22]], float addrspace(1)* [[TMP21]], align 4
+; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE18]]
+; FVW2:       pred.store.continue18:
+; FVW2-NEXT:    [[TMP23:%.*]] = extractelement <2 x i1> [[TMP5]], i32 0
+; FVW2-NEXT:    br i1 [[TMP23]], label [[PRED_STORE_IF19:%.*]], label [[PRED_STORE_CONTINUE20:%.*]]
+; FVW2:       pred.store.if19:
+; FVW2-NEXT:    [[TMP24:%.*]] = or i64 [[OFFSET_IDX]], 32
+; FVW2-NEXT:    [[TMP25:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP24]]
+; FVW2-NEXT:    [[TMP26:%.*]] = extractelement <2 x float> [[TMP13]], i32 0
+; FVW2-NEXT:    store float [[TMP26]], float addrspace(1)* [[TMP25]], align 4
+; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE20]]
+; FVW2:       pred.store.continue20:
+; FVW2-NEXT:    [[TMP27:%.*]] = extractelement <2 x i1> [[TMP5]], i32 1
+; FVW2-NEXT:    br i1 [[TMP27]], label [[PRED_STORE_IF21:%.*]], label [[PRED_STORE_CONTINUE22:%.*]]
+; FVW2:       pred.store.if21:
+; FVW2-NEXT:    [[TMP28:%.*]] = or i64 [[OFFSET_IDX]], 48
+; FVW2-NEXT:    [[TMP29:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP28]]
+; FVW2-NEXT:    [[TMP30:%.*]] = extractelement <2 x float> [[TMP13]], i32 1
+; FVW2-NEXT:    store float [[TMP30]], float addrspace(1)* [[TMP29]], align 4
+; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE22]]
+; FVW2:       pred.store.continue22:
+; FVW2-NEXT:    [[TMP31:%.*]] = extractelement <2 x i1> [[TMP6]], i32 0
+; FVW2-NEXT:    br i1 [[TMP31]], label [[PRED_STORE_IF23:%.*]], label [[PRED_STORE_CONTINUE24:%.*]]
+; FVW2:       pred.store.if23:
+; FVW2-NEXT:    [[TMP32:%.*]] = or i64 [[OFFSET_IDX]], 64
+; FVW2-NEXT:    [[TMP33:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP32]]
+; FVW2-NEXT:    [[TMP34:%.*]] = extractelement <2 x float> [[TMP14]], i32 0
+; FVW2-NEXT:    store float [[TMP34]], float addrspace(1)* [[TMP33]], align 4
+; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE24]]
+; FVW2:       pred.store.continue24:
+; FVW2-NEXT:    [[TMP35:%.*]] = extractelement <2 x i1> [[TMP6]], i32 1
+; FVW2-NEXT:    br i1 [[TMP35]], label [[PRED_STORE_IF25:%.*]], label [[PRED_STORE_CONTINUE26:%.*]]
+; FVW2:       pred.store.if25:
+; FVW2-NEXT:    [[TMP36:%.*]] = or i64 [[OFFSET_IDX]], 80
+; FVW2-NEXT:    [[TMP37:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP36]]
+; FVW2-NEXT:    [[TMP38:%.*]] = extractelement <2 x float> [[TMP14]], i32 1
+; FVW2-NEXT:    store float [[TMP38]], float addrspace(1)* [[TMP37]], align 4
+; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE26]]
+; FVW2:       pred.store.continue26:
+; FVW2-NEXT:    [[TMP39:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0
+; FVW2-NEXT:    br i1 [[TMP39]], label [[PRED_STORE_IF27:%.*]], label [[PRED_STORE_CONTINUE28:%.*]]
+; FVW2:       pred.store.if27:
+; FVW2-NEXT:    [[TMP40:%.*]] = or i64 [[OFFSET_IDX]], 96
+; FVW2-NEXT:    [[TMP41:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP40]]
+; FVW2-NEXT:    [[TMP42:%.*]] = extractelement <2 x float> [[TMP15]], i32 0
+; FVW2-NEXT:    store float [[TMP42]], float addrspace(1)* [[TMP41]], align 4
+; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE28]]
+; FVW2:       pred.store.continue28:
+; FVW2-NEXT:    [[TMP43:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1
+; FVW2-NEXT:    br i1 [[TMP43]], label [[PRED_STORE_IF29:%.*]], label [[PRED_STORE_CONTINUE30]]
+; FVW2:       pred.store.if29:
+; FVW2-NEXT:    [[TMP44:%.*]] = or i64 [[OFFSET_IDX]], 112
+; FVW2-NEXT:    [[TMP45:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP44]]
+; FVW2-NEXT:    [[TMP46:%.*]] = extractelement <2 x float> [[TMP15]], i32 1
+; FVW2-NEXT:    store float [[TMP46]], float addrspace(1)* [[TMP45]], align 4
+; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE30]]
+; FVW2:       pred.store.continue30:
+; FVW2-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX6]], 8
+; FVW2-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], <i64 128, i64 128>
+; FVW2-NEXT:    [[TMP47:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
+; FVW2-NEXT:    br i1 [[TMP47]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop !4
+; FVW2:       for.end:
+; FVW2-NEXT:    ret void
+;
+entry:
+  %in.addr = alloca %struct.In addrspace(1)*, align 8
+  %out.addr = alloca float addrspace(1)*, align 8
+  %trigger.addr = alloca i32*, align 8
+  %index.addr = alloca i32*, align 8
+  %i = alloca i32, align 4
+  store %struct.In addrspace(1)* %in, %struct.In addrspace(1)** %in.addr, align 8
+  store float addrspace(1)* %out, float addrspace(1)** %out.addr, align 8
+  store i32* %trigger, i32** %trigger.addr, align 8
+  store i32* %index, i32** %index.addr, align 8
+  store i32 0, i32* %i, align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %0 = load i32, i32* %i, align 4
+  %cmp = icmp slt i32 %0, 4096
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %1 = load i32, i32* %i, align 4
+  %idxprom = sext i32 %1 to i64
+  %2 = load i32*, i32** %trigger.addr, align 8
+  %arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom
+  %3 = load i32, i32* %arrayidx, align 4
+  %cmp1 = icmp sgt i32 %3, 0
+  br i1 %cmp1, label %if.then, label %if.end
+
+if.then:                                          ; preds = %for.body
+  %4 = load i32, i32* %i, align 4
+  %idxprom2 = sext i32 %4 to i64
+  %5 = load %struct.In addrspace(1)*, %struct.In addrspace(1)** %in.addr, align 8
+  %arrayidx3 = getelementptr inbounds %struct.In, %struct.In addrspace(1)* %5, i64 %idxprom2
+  %b = getelementptr inbounds %struct.In, %struct.In addrspace(1)* %arrayidx3, i32 0, i32 1
+  %6 = load float, float addrspace(1)* %b, align 4
+  %add = fadd float %6, 5.000000e-01
+  %7 = load i32, i32* %i, align 4
+  %idxprom4 = sext i32 %7 to i64
+  %8 = load float addrspace(1)*, float addrspace(1)** %out.addr, align 8
+  %arrayidx5 = getelementptr inbounds float, float addrspace(1)* %8, i64 %idxprom4
+  store float %add, float addrspace(1)* %arrayidx5, align 4
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %for.body
+  br label %for.inc
+
+for.inc:                                          ; preds = %if.end
+  %9 = load i32, i32* %i, align 4
+  %inc = add nsw i32 %9, 16
+  store i32 %inc, i32* %i, align 4
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}
+
+; Same as foo2_addrspace, but here only the input has the non-default address space.
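+;
+; For orientation, a rough C-level sketch of the scalar loop encoded below
+; (a hedged reconstruction from the IR, assuming %struct.In is { float, float };
+; the first field's name is a placeholder, and %in points into addrspace(1)):
+;
+;   struct In { float a; float b; };
+;   void foo2_addrspace2(struct In *in, float *out, int *trigger, int *index) {
+;     for (int i = 0; i < 4096; i += 16)
+;       if (trigger[i] > 0)
+;         out[i] = in[i].b + 0.5f;
+;   }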
+
+define void @foo2_addrspace2(%struct.In addrspace(1)* noalias %in, float addrspace(0)* noalias %out, i32* noalias %trigger, i32* noalias %index) {
+; AVX512-LABEL: @foo2_addrspace2(
+; AVX512-NEXT:  entry:
+; AVX512-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], <16 x i64> <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112, i64 128, i64 144, i64 160, i64 176, i64 192, i64 208, i64 224, i64 240>
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP0]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT:    [[TMP1:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer
+; AVX512-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], [[STRUCT_IN]] addrspace(1)* [[IN:%.*]], <16 x i64> <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112, i64 128, i64 144, i64 160, i64 176, i64 192, i64 208, i64 224, i64 240>, i32 1
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER7:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP2]], i32 4, <16 x i1> [[TMP1]], <16 x float> undef)
+; AVX512-NEXT:    [[TMP3:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[OUT:%.*]], <16 x i64> <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112, i64 128, i64 144, i64 160, i64 176, i64 192, i64 208, i64 224, i64 240>
+; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP3]], <16 x float*> [[TMP4]], i32 4, <16 x i1> [[TMP1]])
+; AVX512-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 256, i64 272, i64 288, i64 304, i64 320, i64 336, i64 352, i64 368, i64 384, i64 400, i64 416, i64 432, i64 448, i64 464, i64 480, i64 496>
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER_1:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP5]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT:    [[TMP6:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_1]], zeroinitializer
+; AVX512-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> <i64 256, i64 272, i64 288, i64 304, i64 320, i64 336, i64 352, i64 368, i64 384, i64 400, i64 416, i64 432, i64 448, i64 464, i64 480, i64 496>, i32 1
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER7_1:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP7]], i32 4, <16 x i1> [[TMP6]], <16 x float> undef)
+; AVX512-NEXT:    [[TMP8:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_1]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT:    [[TMP9:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> <i64 256, i64 272, i64 288, i64 304, i64 320, i64 336, i64 352, i64 368, i64 384, i64 400, i64 416, i64 432, i64 448, i64 464, i64 480, i64 496>
+; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP8]], <16 x float*> [[TMP9]], i32 4, <16 x i1> [[TMP6]])
+; AVX512-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 512, i64 528, i64 544, i64 560, i64 576, i64 592, i64 608, i64 624, i64 640, i64 656, i64 672, i64 688, i64 704, i64 720, i64 736, i64 752>
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER_2:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP10]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT:    [[TMP11:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_2]], zeroinitializer
+; AVX512-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> <i64 512, i64 528, i64 544, i64 560, i64 576, i64 592, i64 608, i64 624, i64 640, i64 656, i64 672, i64 688, i64 704, i64 720, i64 736, i64 752>, i32 1
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER7_2:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP12]], i32 4, <16 x i1> [[TMP11]], <16 x float> undef)
+; AVX512-NEXT:    [[TMP13:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_2]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT:    [[TMP14:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> <i64 512, i64 528, i64 544, i64 560, i64 576, i64 592, i64 608, i64 624, i64 640, i64 656, i64 672, i64 688, i64 704, i64 720, i64 736, i64 752>
+; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP13]], <16 x float*> [[TMP14]], i32 4, <16 x i1> [[TMP11]])
+; AVX512-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 768, i64 784, i64 800, i64 816, i64 832, i64 848, i64 864, i64 880, i64 896, i64 912, i64 928, i64 944, i64 960, i64 976, i64 992, i64 1008>
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER_3:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP15]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT:    [[TMP16:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_3]], zeroinitializer
+; AVX512-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> <i64 768, i64 784, i64 800, i64 816, i64 832, i64 848, i64 864, i64 880, i64 896, i64 912, i64 928, i64 944, i64 960, i64 976, i64 992, i64 1008>, i32 1
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER7_3:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP17]], i32 4, <16 x i1> [[TMP16]], <16 x float> undef)
+; AVX512-NEXT:    [[TMP18:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_3]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT:    [[TMP19:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> <i64 768, i64 784, i64 800, i64 816, i64 832, i64 848, i64 864, i64 880, i64 896, i64 912, i64 928, i64 944, i64 960, i64 976, i64 992, i64 1008>
+; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP18]], <16 x float*> [[TMP19]], i32 4, <16 x i1> [[TMP16]])
+; AVX512-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 1024, i64 1040, i64 1056, i64 1072, i64 1088, i64 1104, i64 1120, i64 1136, i64 1152, i64 1168, i64 1184, i64 1200, i64 1216, i64 1232, i64 1248, i64 1264>
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER_4:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP20]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT:    [[TMP21:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_4]], zeroinitializer
+; AVX512-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> <i64 1024, i64 1040, i64 1056, i64 1072, i64 1088, i64 1104, i64 1120, i64 1136, i64 1152, i64 1168, i64 1184, i64 1200, i64 1216, i64 1232, i64 1248, i64 1264>, i32 1
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER7_4:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP22]], i32 4, <16 x i1> [[TMP21]], <16 x float> undef)
+; AVX512-NEXT:    [[TMP23:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_4]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT:    [[TMP24:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> <i64 1024, i64 1040, i64 1056, i64 1072, i64 1088, i64 1104, i64 1120, i64 1136, i64 1152, i64 1168, i64 1184, i64 1200, i64 1216, i64 1232, i64 1248, i64 1264>
+; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP23]], <16 x float*> [[TMP24]], i32 4, <16 x i1> [[TMP21]])
+; AVX512-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 1280, i64 1296, i64 1312, i64 1328, i64 1344, i64 1360, i64 1376, i64 1392, i64 1408, i64 1424, i64 1440, i64 1456, i64 1472, i64 1488, i64 1504, i64 1520>
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER_5:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP25]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT:    [[TMP26:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_5]], zeroinitializer
+; AVX512-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> <i64 1280, i64 1296, i64 1312, i64 1328, i64 1344, i64 1360, i64 1376, i64 1392, i64 1408, i64 1424, i64 1440, i64 1456, i64 1472, i64 1488, i64 1504, i64 1520>, i32 1
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER7_5:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP27]], i32 4, <16 x i1> [[TMP26]], <16 x float> undef)
+; AVX512-NEXT:    [[TMP28:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_5]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT:    [[TMP29:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> <i64 1280, i64 1296, i64 1312, i64 1328, i64 1344, i64 1360, i64 1376, i64 1392, i64 1408, i64 1424, i64 1440, i64 1456, i64 1472, i64 1488, i64 1504, i64 1520>
+; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP28]], <16 x float*> [[TMP29]], i32 4, <16 x i1> [[TMP26]])
+; AVX512-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 1536, i64 1552, i64 1568, i64 1584, i64 1600, i64 1616, i64 1632, i64 1648, i64 1664, i64 1680, i64 1696, i64 1712, i64 1728, i64 1744, i64 1760, i64 1776>
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER_6:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP30]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT:    [[TMP31:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_6]], zeroinitializer
+; AVX512-NEXT:    [[TMP32:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> <i64 1536, i64 1552, i64 1568, i64 1584, i64 1600, i64 1616, i64 1632, i64 1648, i64 1664, i64 1680, i64 1696, i64 1712, i64 1728, i64 1744, i64 1760, i64 1776>, i32 1
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER7_6:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP32]], i32 4, <16 x i1> [[TMP31]], <16 x float> undef)
+; AVX512-NEXT:    [[TMP33:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_6]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT:    [[TMP34:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> <i64 1536, i64 1552, i64 1568, i64 1584, i64 1600, i64 1616, i64 1632, i64 1648, i64 1664, i64 1680, i64 1696, i64 1712, i64 1728, i64 1744, i64 1760, i64 1776>
+; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP33]], <16 x float*> [[TMP34]], i32 4, <16 x i1> [[TMP31]])
+; AVX512-NEXT:    [[TMP35:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 1792, i64 1808, i64 1824, i64 1840, i64 1856, i64 1872, i64 1888, i64 1904, i64 1920, i64 1936, i64 1952, i64 1968, i64 1984, i64 2000, i64 2016, i64 2032>
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER_7:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP35]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT:    [[TMP36:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_7]], zeroinitializer
+; AVX512-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> <i64 1792, i64 1808, i64 1824, i64 1840, i64 1856, i64 1872, i64 1888, i64 1904, i64 1920, i64 1936, i64 1952, i64 1968, i64 1984, i64 2000, i64 2016, i64 2032>, i32 1
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER7_7:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP37]], i32 4, <16 x i1> [[TMP36]], <16 x float> undef)
+; AVX512-NEXT:    [[TMP38:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_7]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT:    [[TMP39:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> <i64 1792, i64 1808, i64 1824, i64 1840, i64 1856, i64 1872, i64 1888, i64 1904, i64 1920, i64 1936, i64 1952, i64 1968, i64 1984, i64 2000, i64 2016, i64 2032>
+; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP38]], <16 x float*> [[TMP39]], i32 4, <16 x i1> [[TMP36]])
+; AVX512-NEXT:    [[TMP40:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 2048, i64 2064, i64 2080, i64 2096, i64 2112, i64 2128, i64 2144, i64 2160, i64 2176, i64 2192, i64 2208, i64 2224, i64 2240, i64 2256, i64 2272, i64 2288>
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER_8:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP40]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT:    [[TMP41:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_8]], zeroinitializer
+; AVX512-NEXT:    [[TMP42:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> <i64 2048, i64 2064, i64 2080, i64 2096, i64 2112, i64 2128, i64 2144, i64 2160, i64 2176, i64 2192, i64 2208, i64 2224, i64 2240, i64 2256, i64 2272, i64 2288>, i32 1
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER7_8:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP42]], i32 4, <16 x i1> [[TMP41]], <16 x float> undef)
+; AVX512-NEXT:    [[TMP43:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_8]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT:    [[TMP44:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> <i64 2048, i64 2064, i64 2080, i64 2096, i64 2112, i64 2128, i64 2144, i64 2160, i64 2176, i64 2192, i64 2208, i64 2224, i64 2240, i64 2256, i64 2272, i64 2288>
+; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP43]], <16 x float*> [[TMP44]], i32 4, <16 x i1> [[TMP41]])
+; AVX512-NEXT:    [[TMP45:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 2304, i64 2320, i64 2336, i64 2352, i64 2368, i64 2384, i64 2400, i64 2416, i64 2432, i64 2448, i64 2464, i64 2480, i64 2496, i64 2512, i64 2528, i64 2544>
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER_9:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP45]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT:    [[TMP46:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_9]], zeroinitializer
+; AVX512-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> <i64 2304, i64 2320, i64 2336, i64 2352, i64 2368, i64 2384, i64 2400, i64 2416, i64 2432, i64 2448, i64 2464, i64 2480, i64 2496, i64 2512, i64 2528, i64 2544>, i32 1
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER7_9:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP47]], i32 4, <16 x i1> [[TMP46]], <16 x float> undef)
+; AVX512-NEXT:    [[TMP48:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_9]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT:    [[TMP49:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> <i64 2304, i64 2320, i64 2336, i64 2352, i64 2368, i64 2384, i64 2400, i64 2416, i64 2432, i64 2448, i64 2464, i64 2480, i64 2496, i64 2512, i64 2528, i64 2544>
+; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP48]], <16 x float*> [[TMP49]], i32 4, <16 x i1> [[TMP46]])
+; AVX512-NEXT:    [[TMP50:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 2560, i64 2576, i64 2592, i64 2608, i64 2624, i64 2640, i64 2656, i64 2672, i64 2688, i64 2704, i64 2720, i64 2736, i64 2752, i64 2768, i64 2784, i64 2800>
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER_10:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP50]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT:    [[TMP51:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_10]], zeroinitializer
+; AVX512-NEXT:    [[TMP52:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> <i64 2560, i64 2576, i64 2592, i64 2608, i64 2624, i64 2640, i64 2656, i64 2672, i64 2688, i64 2704, i64 2720, i64 2736, i64 2752, i64 2768, i64 2784, i64 2800>, i32 1
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER7_10:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP52]], i32 4, <16 x i1> [[TMP51]], <16 x float> undef)
+; AVX512-NEXT:    [[TMP53:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_10]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT:    [[TMP54:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> <i64 2560, i64 2576, i64 2592, i64 2608, i64 2624, i64 2640, i64 2656, i64 2672, i64 2688, i64 2704, i64 2720, i64 2736, i64 2752, i64 2768, i64 2784, i64 2800>
+; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP53]], <16 x float*> [[TMP54]], i32 4, <16 x i1> [[TMP51]])
+; AVX512-NEXT:    [[TMP55:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 2816, i64 2832, i64 2848, i64 2864, i64 2880, i64 2896, i64 2912, i64 2928, i64 2944, i64 2960, i64 2976, i64 2992, i64 3008, i64 3024, i64 3040, i64 3056>
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER_11:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP55]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT:    [[TMP56:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_11]], zeroinitializer
+; AVX512-NEXT:    [[TMP57:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> <i64 2816, i64 2832, i64 2848, i64 2864, i64 2880, i64 2896, i64 2912, i64 2928, i64 2944, i64 2960, i64 2976, i64 2992, i64 3008, i64 3024, i64 3040, i64 3056>, i32 1
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER7_11:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP57]], i32 4, <16 x i1> [[TMP56]], <16 x float> undef)
+; AVX512-NEXT:    [[TMP58:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_11]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT:    [[TMP59:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> <i64 2816, i64 2832, i64 2848, i64 2864, i64 2880, i64 2896, i64 2912, i64 2928, i64 2944, i64 2960, i64 2976, i64 2992, i64 3008, i64 3024, i64 3040, i64 3056>
+; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP58]], <16 x float*> [[TMP59]], i32 4, <16 x i1> [[TMP56]])
+; AVX512-NEXT:    [[TMP60:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 3072, i64 3088, i64 3104, i64 3120, i64 3136, i64 3152, i64 3168, i64 3184, i64 3200, i64 3216, i64 3232, i64 3248, i64 3264, i64 3280, i64 3296, i64 3312>
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER_12:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP60]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT:    [[TMP61:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_12]], zeroinitializer
+; AVX512-NEXT:    [[TMP62:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> <i64 3072, i64 3088, i64 3104, i64 3120, i64 3136, i64 3152, i64 3168, i64 3184, i64 3200, i64 3216, i64 3232, i64 3248, i64 3264, i64 3280, i64 3296, i64 3312>, i32 1
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER7_12:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP62]], i32 4, <16 x i1> [[TMP61]], <16 x float> undef)
+; AVX512-NEXT:    [[TMP63:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_12]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT:    [[TMP64:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> <i64 3072, i64 3088, i64 3104, i64 3120, i64 3136, i64 3152, i64 3168, i64 3184, i64 3200, i64 3216, i64 3232, i64 3248, i64 3264, i64 3280, i64 3296, i64 3312>
+; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP63]], <16 x float*> [[TMP64]], i32 4, <16 x i1> [[TMP61]])
+; AVX512-NEXT:    [[TMP65:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 3328, i64 3344, i64 3360, i64 3376, i64 3392, i64 3408, i64 3424, i64 3440, i64 3456, i64 3472, i64 3488, i64 3504, i64 3520, i64 3536, i64 3552, i64 3568>
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER_13:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP65]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT:    [[TMP66:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_13]], zeroinitializer
+; AVX512-NEXT:    [[TMP67:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> <i64 3328, i64 3344, i64 3360, i64 3376, i64 3392, i64 3408, i64 3424, i64 3440, i64 3456, i64 3472, i64 3488, i64 3504, i64 3520, i64 3536, i64 3552, i64 3568>, i32 1
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER7_13:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP67]], i32 4, <16 x i1> [[TMP66]], <16 x float> undef)
+; AVX512-NEXT:    [[TMP68:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_13]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT:    [[TMP69:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> <i64 3328, i64 3344, i64 3360, i64 3376, i64 3392, i64 3408, i64 3424, i64 3440, i64 3456, i64 3472, i64 3488, i64 3504, i64 3520, i64 3536, i64 3552, i64 3568>
+; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP68]], <16 x float*> [[TMP69]], i32 4, <16 x i1> [[TMP66]])
+; AVX512-NEXT:    [[TMP70:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 3584, i64 3600, i64 3616, i64 3632, i64 3648, i64 3664, i64 3680, i64 3696, i64 3712, i64 3728, i64 3744, i64 3760, i64 3776, i64 3792, i64 3808, i64 3824>
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER_14:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP70]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT:    [[TMP71:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_14]], zeroinitializer
+; AVX512-NEXT:    [[TMP72:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> <i64 3584, i64 3600, i64 3616, i64 3632, i64 3648, i64 3664, i64 3680, i64 3696, i64 3712, i64 3728, i64 3744, i64 3760, i64 3776, i64 3792, i64 3808, i64 3824>, i32 1
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER7_14:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP72]], i32 4, <16 x i1> [[TMP71]], <16 x float> undef)
+; AVX512-NEXT:    [[TMP73:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_14]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT:    [[TMP74:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> <i64 3584, i64 3600, i64 3616, i64 3632, i64 3648, i64 3664, i64 3680, i64 3696, i64 3712, i64 3728, i64 3744, i64 3760, i64 3776, i64 3792, i64 3808, i64 3824>
+; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP73]], <16 x float*> [[TMP74]], i32 4, <16 x i1> [[TMP71]])
+; AVX512-NEXT:    [[TMP75:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 3840, i64 3856, i64 3872, i64 3888, i64 3904, i64 3920, i64 3936, i64 3952, i64 3968, i64 3984, i64 4000, i64 4016, i64 4032, i64 4048, i64 4064, i64 4080>
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER_15:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP75]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT:    [[TMP76:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_15]], zeroinitializer
+; AVX512-NEXT:    [[TMP77:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> <i64 3840, i64 3856, i64 3872, i64 3888, i64 3904, i64 3920, i64 3936, i64 3952, i64 3968, i64 3984, i64 4000, i64 4016, i64 4032, i64 4048, i64 4064, i64 4080>, i32 1
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER7_15:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP77]], i32 4, <16 x i1> [[TMP76]], <16 x float> undef)
+; AVX512-NEXT:    [[TMP78:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_15]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT:    [[TMP79:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> <i64 3840, i64 3856, i64 3872, i64 3888, i64 3904, i64 3920, i64 3936, i64 3952, i64 3968, i64 3984, i64 4000, i64 4016, i64 4032, i64 4048, i64 4064, i64 4080>
+; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP78]], <16 x float*> [[TMP79]], i32 4, <16 x i1> [[TMP76]])
+; AVX512-NEXT:    ret void
+;
+; FVW2-LABEL: @foo2_addrspace2(
+; FVW2-NEXT:  entry:
+; FVW2-NEXT:    br label [[VECTOR_BODY:%.*]]
+; FVW2:       vector.body:
+; FVW2-NEXT:    [[INDEX6:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE30:%.*]] ]
+; FVW2-NEXT:    [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 16>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE30]] ]
+; FVW2-NEXT:    [[STEP_ADD:%.*]] = add <2 x i64> [[VEC_IND]], <i64 32, i64 32>
+; FVW2-NEXT:    [[STEP_ADD7:%.*]] = add <2 x i64> [[VEC_IND]], <i64 64, i64 64>
+; FVW2-NEXT:    [[STEP_ADD8:%.*]] = add <2 x i64> [[VEC_IND]], <i64 96, i64 96>
+; FVW2-NEXT:    [[OFFSET_IDX:%.*]] = shl i64 [[INDEX6]], 4
+; FVW2-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], <2 x i64> [[VEC_IND]]
+; FVW2-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <2 x i64> [[STEP_ADD]]
+; FVW2-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <2 x i64> [[STEP_ADD7]]
+; FVW2-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <2 x i64> [[STEP_ADD8]]
+; FVW2-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP0]], i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef)
+; FVW2-NEXT:    [[WIDE_MASKED_GATHER10:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP1]], i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef)
+; FVW2-NEXT:    [[WIDE_MASKED_GATHER11:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP2]], i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef)
+; FVW2-NEXT:    [[WIDE_MASKED_GATHER12:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP3]], i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef)
+; FVW2-NEXT:    [[TMP4:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer
+; FVW2-NEXT:    [[TMP5:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER10]], zeroinitializer
+; FVW2-NEXT:    [[TMP6:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER11]], zeroinitializer
+; FVW2-NEXT:    [[TMP7:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER12]], zeroinitializer
+; FVW2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], [[STRUCT_IN]] addrspace(1)* [[IN:%.*]], <2 x i64> [[VEC_IND]], i32 1
+; FVW2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <2 x i64> [[STEP_ADD]], i32 1
+; FVW2-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <2 x i64> [[STEP_ADD7]], i32 1
+; FVW2-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <2 x i64> [[STEP_ADD8]], i32 1
+; FVW2-NEXT:    [[WIDE_MASKED_GATHER13:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p1f32(<2 x float addrspace(1)*> [[TMP8]], i32 4, <2 x i1> [[TMP4]], <2 x float> undef)
+; FVW2-NEXT:    [[WIDE_MASKED_GATHER14:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p1f32(<2 x float addrspace(1)*> [[TMP9]], i32 4, <2 x i1> [[TMP5]], <2 x float> undef)
+; FVW2-NEXT:    [[WIDE_MASKED_GATHER15:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p1f32(<2 x float addrspace(1)*> [[TMP10]], i32 4, <2 x i1> [[TMP6]], <2 x float> undef)
+; FVW2-NEXT:    [[WIDE_MASKED_GATHER16:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p1f32(<2 x float addrspace(1)*> [[TMP11]], i32 4, <2 x i1> [[TMP7]], <2 x float> undef)
+; FVW2-NEXT:    [[TMP12:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER13]], <float 5.000000e-01, float 5.000000e-01>
+; FVW2-NEXT:    [[TMP13:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER14]], <float 5.000000e-01, float 5.000000e-01>
+; FVW2-NEXT:    [[TMP14:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER15]], <float 5.000000e-01, float 5.000000e-01>
+; FVW2-NEXT:    [[TMP15:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER16]], <float 5.000000e-01, float 5.000000e-01>
+; FVW2-NEXT:    [[TMP16:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0
+; FVW2-NEXT:    br i1 [[TMP16]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; FVW2:       pred.store.if:
+; FVW2-NEXT:    [[TMP17:%.*]] = getelementptr inbounds float, float* [[OUT:%.*]], i64 [[OFFSET_IDX]]
+; FVW2-NEXT:    [[TMP18:%.*]] = extractelement <2 x float> [[TMP12]], i32 0
+; FVW2-NEXT:    store float [[TMP18]], float* [[TMP17]], align 4
+; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE]]
+; FVW2:       pred.store.continue:
+; FVW2-NEXT:    [[TMP19:%.*]] = extractelement <2 x i1> [[TMP4]], i32 1
+; FVW2-NEXT:    br i1 [[TMP19]], label [[PRED_STORE_IF17:%.*]], label [[PRED_STORE_CONTINUE18:%.*]]
+; FVW2:       pred.store.if17:
+; FVW2-NEXT:    [[TMP20:%.*]] = or i64 [[OFFSET_IDX]], 16
+; FVW2-NEXT:    [[TMP21:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP20]]
+; FVW2-NEXT:    [[TMP22:%.*]] = extractelement <2 x float> [[TMP12]], i32 1
+; FVW2-NEXT:    store float [[TMP22]], float* [[TMP21]], align 4
+; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE18]]
+; FVW2:       pred.store.continue18:
+; FVW2-NEXT:    [[TMP23:%.*]] = extractelement <2 x i1> [[TMP5]], i32 0
+; FVW2-NEXT:    br i1 [[TMP23]], label [[PRED_STORE_IF19:%.*]], label [[PRED_STORE_CONTINUE20:%.*]]
+; FVW2:       pred.store.if19:
+; FVW2-NEXT:    [[TMP24:%.*]] = or i64 [[OFFSET_IDX]], 32
+; FVW2-NEXT:    [[TMP25:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP24]]
+; FVW2-NEXT:    [[TMP26:%.*]] = extractelement <2 x float> [[TMP13]], i32 0
+; FVW2-NEXT:    store float [[TMP26]], float* [[TMP25]], align 4
+; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE20]]
+; FVW2:       pred.store.continue20:
+; FVW2-NEXT:    [[TMP27:%.*]] = extractelement <2 x i1> [[TMP5]], i32 1
+; FVW2-NEXT:    br i1 [[TMP27]], label [[PRED_STORE_IF21:%.*]], label [[PRED_STORE_CONTINUE22:%.*]]
+; FVW2:       pred.store.if21:
+; FVW2-NEXT:    [[TMP28:%.*]] = or i64 [[OFFSET_IDX]], 48
+; FVW2-NEXT:    [[TMP29:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP28]]
+; FVW2-NEXT:    [[TMP30:%.*]] = extractelement <2 x float> [[TMP13]], i32 1
+; FVW2-NEXT:    store float [[TMP30]], float* [[TMP29]], align 4
+; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE22]]
+; FVW2:       pred.store.continue22:
+; FVW2-NEXT:    [[TMP31:%.*]] = extractelement <2 x i1> [[TMP6]], i32 0
+; FVW2-NEXT:    br i1 [[TMP31]], label [[PRED_STORE_IF23:%.*]], label [[PRED_STORE_CONTINUE24:%.*]]
+; FVW2:       pred.store.if23:
+; FVW2-NEXT:    [[TMP32:%.*]] = or i64 [[OFFSET_IDX]], 64
+; FVW2-NEXT:    [[TMP33:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP32]]
+; FVW2-NEXT:    [[TMP34:%.*]] = extractelement <2 x float> [[TMP14]], i32 0
+; FVW2-NEXT:    store float [[TMP34]], float* [[TMP33]], align 4
+; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE24]]
+; FVW2:       pred.store.continue24:
+; FVW2-NEXT:    [[TMP35:%.*]] = extractelement <2 x i1> [[TMP6]], i32 1
+; FVW2-NEXT:    br i1 [[TMP35]], label [[PRED_STORE_IF25:%.*]], label [[PRED_STORE_CONTINUE26:%.*]]
+; FVW2:       pred.store.if25:
+; FVW2-NEXT:    [[TMP36:%.*]] = or i64 [[OFFSET_IDX]], 80
+; FVW2-NEXT:    [[TMP37:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP36]]
+; FVW2-NEXT:    [[TMP38:%.*]] = extractelement <2 x float> [[TMP14]], i32 1
+; FVW2-NEXT:    store float [[TMP38]], float* [[TMP37]], align 4
+; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE26]]
+; FVW2:       pred.store.continue26:
+; FVW2-NEXT:    [[TMP39:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0
+; FVW2-NEXT:    br i1 [[TMP39]], label [[PRED_STORE_IF27:%.*]], label [[PRED_STORE_CONTINUE28:%.*]]
+; FVW2:       pred.store.if27:
+; FVW2-NEXT:    [[TMP40:%.*]] = or i64 [[OFFSET_IDX]], 96
+; FVW2-NEXT:    [[TMP41:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP40]]
+; FVW2-NEXT:    [[TMP42:%.*]] = extractelement <2 x float> [[TMP15]], i32 0
+; FVW2-NEXT:    store float [[TMP42]], float* [[TMP41]], align 4
+; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE28]]
+; FVW2:       pred.store.continue28:
+; FVW2-NEXT:    [[TMP43:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1
+; FVW2-NEXT:    br i1 [[TMP43]], label [[PRED_STORE_IF29:%.*]], label [[PRED_STORE_CONTINUE30]]
+; FVW2:       pred.store.if29:
+; FVW2-NEXT:    [[TMP44:%.*]] = or i64 [[OFFSET_IDX]], 112
+; FVW2-NEXT:    [[TMP45:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP44]]
+; FVW2-NEXT:    [[TMP46:%.*]] = extractelement <2 x float> [[TMP15]], i32 1
+; FVW2-NEXT:    store float [[TMP46]], float* [[TMP45]], align 4
+; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE30]]
+; FVW2:       pred.store.continue30:
+; FVW2-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX6]], 8
+; FVW2-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], <i64 128, i64 128>
+; FVW2-NEXT:    [[TMP47:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
+; FVW2-NEXT:    br i1 [[TMP47]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop !5
+; FVW2:       for.end:
+; FVW2-NEXT:    ret void
+;
+entry:
+  %in.addr = alloca %struct.In addrspace(1)*, align 8
+  %out.addr = alloca float addrspace(0)*, align 8
+  %trigger.addr = alloca i32*, align 8
+  %index.addr = alloca i32*, align 8
+  %i = alloca i32, align 4
+  store %struct.In addrspace(1)* %in, %struct.In addrspace(1)** %in.addr, align 8
+  store float addrspace(0)* %out, float addrspace(0)** %out.addr, align 8
+  store i32* %trigger, i32** %trigger.addr, align 8
+  store i32* %index, i32** %index.addr, align 8
+  store i32 0, i32* %i, align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %0 = load i32, i32* %i, align 4
+  %cmp = icmp slt i32 %0, 4096
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %1 = load i32, i32* %i, align 4
+  %idxprom = sext i32 %1 to i64
+  %2 = load i32*, i32** %trigger.addr, align 8
+  %arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom
+  %3 = load i32, i32* %arrayidx, align 4
+  %cmp1 = icmp sgt i32 %3, 0
+  br i1 %cmp1, label %if.then, label %if.end
+
+if.then:                                          ; preds = %for.body
+  %4 = load i32, i32* %i, align 4
+  %idxprom2 = sext i32 %4 to i64
+  %5 = load %struct.In addrspace(1)*, %struct.In addrspace(1)** %in.addr, align 8
+  %arrayidx3 = getelementptr inbounds %struct.In, %struct.In addrspace(1)* %5, i64 %idxprom2
+  %b = getelementptr inbounds %struct.In, %struct.In addrspace(1)* %arrayidx3, i32 0, i32 1
+  %6 = load float, float addrspace(1)* %b, align 4
+  %add = fadd float %6, 5.000000e-01
+  %7 = load i32, i32* %i, align 4
+  %idxprom4 = sext i32 %7 to i64
+  %8 = load float addrspace(0)*, float addrspace(0)** %out.addr, align 8
+  %arrayidx5 = getelementptr inbounds float, float addrspace(0)* %8, i64 %idxprom4
+  store float %add, float addrspace(0)* %arrayidx5, align 4
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %for.body
+  br label %for.inc
+
+for.inc:                                          ; preds = %if.end
+  %9 = load i32, i32* %i, align 4
+  %inc = add nsw i32 %9, 16
+  store i32 %inc, i32* %i, align 4
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}
+
+; Same as foo2_addrspace, but here only the output has the non-default address space.
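+;
+; With only the output in addrspace(1), the AVX512 checks below expect the field
+; loads to go through @llvm.masked.gather.v16f32.v16p0f32 (default address space
+; pointers) and the stores through @llvm.masked.scatter.v16f32.v16p1f32
+; (addrspace(1) pointers).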
+
+define void @foo2_addrspace3(%struct.In addrspace(0)* noalias %in, float addrspace(1)* noalias %out, i32* noalias %trigger, i32* noalias %index) {
+; AVX512-LABEL: @foo2_addrspace3(
+; AVX512-NEXT:  entry:
+; AVX512-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], <16 x i64> <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112, i64 128, i64 144, i64 160, i64 176, i64 192, i64 208, i64 224, i64 240>
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP0]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT:    [[TMP1:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer
+; AVX512-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], %struct.In* [[IN:%.*]], <16 x i64> <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112, i64 128, i64 144, i64 160, i64 176, i64 192, i64 208, i64 224, i64 240>, i32 1
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER7:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP2]], i32 4, <16 x i1> [[TMP1]], <16 x float> undef)
+; AVX512-NEXT:    [[TMP3:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT:%.*]], <16 x i64> <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112, i64 128, i64 144, i64 160, i64 176, i64 192, i64 208, i64 224, i64 240>
+; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP3]], <16 x float addrspace(1)*> [[TMP4]], i32 4, <16 x i1> [[TMP1]])
+; AVX512-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 256, i64 272, i64 288, i64 304, i64 320, i64 336, i64 352, i64 368, i64 384, i64 400, i64 416, i64 432, i64 448, i64 464, i64 480, i64 496>
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER_1:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP5]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT:    [[TMP6:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_1]], zeroinitializer
+; AVX512-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 256, i64 272, i64 288, i64 304, i64 320, i64 336, i64 352, i64 368, i64 384, i64 400, i64 416, i64 432, i64 448, i64 464, i64 480, i64 496>, i32 1
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER7_1:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP7]], i32 4, <16 x i1> [[TMP6]], <16 x float> undef)
+; AVX512-NEXT:    [[TMP8:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_1]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT:    [[TMP9:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> <i64 256, i64 272, i64 288, i64 304, i64 320, i64 336, i64 352, i64 368, i64 384, i64 400, i64 416, i64 432, i64 448, i64 464, i64 480, i64 496>
+; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP8]], <16 x float addrspace(1)*> [[TMP9]], i32 4, <16 x i1> [[TMP6]])
+; AVX512-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 512, i64 528, i64 544, i64 560, i64 576, i64 592, i64 608, i64 624, i64 640, i64 656, i64 672, i64 688, i64 704, i64 720, i64 736, i64 752>
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER_2:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP10]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT:    [[TMP11:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_2]], zeroinitializer
+; AVX512-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 512, i64 528, i64 544, i64 560, i64 576, i64 592, i64 608, i64 624, i64 640, i64 656, i64 672, i64 688, i64 704, i64 720, i64 736, i64 752>, i32 1
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER7_2:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP12]], i32 4, <16 x i1> [[TMP11]], <16 x float> undef)
+; AVX512-NEXT:    [[TMP13:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_2]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT:    [[TMP14:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> <i64 512, i64 528, i64 544, i64 560, i64 576, i64 592, i64 608, i64 624, i64 640, i64 656, i64 672, i64 688, i64 704, i64 720, i64 736, i64 752>
+; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP13]], <16 x float addrspace(1)*> [[TMP14]], i32 4, <16 x i1> [[TMP11]])
+; AVX512-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 768, i64 784, i64 800, i64 816, i64 832, i64 848, i64 864, i64 880, i64 896, i64 912, i64 928, i64 944, i64 960, i64 976, i64 992, i64 1008>
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER_3:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP15]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT:    [[TMP16:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_3]], zeroinitializer
+; AVX512-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 768, i64 784, i64 800, i64 816, i64 832, i64 848, i64 864, i64 880, i64 896, i64 912, i64 928, i64 944, i64 960, i64 976, i64 992, i64 1008>, i32 1
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER7_3:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP17]], i32 4, <16 x i1> [[TMP16]], <16 x float> undef)
+; AVX512-NEXT:    [[TMP18:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_3]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT:    [[TMP19:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> <i64 768, i64 784, i64 800, i64 816, i64 832, i64 848, i64 864, i64 880, i64 896, i64 912, i64 928, i64 944, i64 960, i64 976, i64 992, i64 1008>
+; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP18]], <16 x float addrspace(1)*> [[TMP19]], i32 4, <16 x i1> [[TMP16]])
+; AVX512-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 1024, i64 1040, i64 1056, i64 1072, i64 1088, i64 1104, i64 1120, i64 1136, i64 1152, i64 1168, i64 1184, i64 1200, i64 1216, i64 1232, i64 1248, i64 1264>
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER_4:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP20]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT:    [[TMP21:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_4]], zeroinitializer
+; AVX512-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 1024, i64 1040, i64 1056, i64 1072, i64 1088, i64 1104, i64 1120, i64 1136, i64 1152, i64 1168, i64 1184, i64 1200, i64 1216, i64 1232, i64 1248, i64 1264>, i32 1
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER7_4:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP22]], i32 4, <16 x i1> [[TMP21]], <16 x float> undef)
+; AVX512-NEXT:    [[TMP23:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_4]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT:    [[TMP24:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> <i64 1024, i64 1040, i64 1056, i64 1072, i64 1088, i64 1104, i64 1120, i64 1136, i64 1152, i64 1168, i64 1184, i64 1200, i64 1216, i64 1232, i64 1248, i64 1264>
+; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP23]], <16 x float addrspace(1)*> [[TMP24]], i32 4, <16 x i1> [[TMP21]])
+; AVX512-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 1280, i64 1296, i64 1312, i64 1328, i64 1344, i64 1360, i64 1376, i64 1392, i64 1408, i64 1424, i64 1440, i64 1456, i64 1472, i64 1488, i64 1504, i64 1520>
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER_5:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP25]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT:    [[TMP26:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_5]], zeroinitializer
+; AVX512-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 1280, i64 1296, i64 1312, i64 1328, i64 1344, i64 1360, i64 1376, i64 1392, i64 1408, i64 1424, i64 1440, i64 1456, i64 1472, i64 1488, i64 1504, i64 1520>, i32 1
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER7_5:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP27]], i32 4, <16 x i1> [[TMP26]], <16 x float> undef)
+; AVX512-NEXT:    [[TMP28:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_5]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT:    [[TMP29:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> <i64 1280, i64 1296, i64 1312, i64 1328, i64 1344, i64 1360, i64 1376, i64 1392, i64 1408, i64 1424, i64 1440, i64 1456, i64 1472, i64 1488, i64 1504, i64 1520>
+; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP28]], <16 x float addrspace(1)*> [[TMP29]], i32 4, <16 x i1> [[TMP26]])
+; AVX512-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 1536, i64 1552, i64 1568, i64 1584, i64 1600, i64 1616, i64 1632, i64 1648, i64 1664, i64 1680, i64 1696, i64 1712, i64 1728, i64 1744, i64 1760, i64 1776>
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER_6:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP30]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT:    [[TMP31:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_6]], zeroinitializer
+; AVX512-NEXT:    [[TMP32:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 1536, i64 1552, i64 1568, i64 1584, i64 1600, i64 1616, i64 1632, i64 1648, i64 1664, i64 1680, i64 1696, i64 1712, i64 1728, i64 1744, i64 1760, i64 1776>, i32 1
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER7_6:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP32]], i32 4, <16 x i1> [[TMP31]], <16 x float> undef)
+; AVX512-NEXT:    [[TMP33:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_6]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT:    [[TMP34:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> <i64 1536, i64 1552, i64 1568, i64 1584, i64 1600, i64 1616, i64 1632, i64 1648, i64 1664, i64 1680, i64 1696, i64 1712, i64 1728, i64 1744, i64 1760, i64 1776>
+; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP33]], <16 x float addrspace(1)*> [[TMP34]], i32 4, <16 x i1> [[TMP31]])
+; AVX512-NEXT:    [[TMP35:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 1792, i64 1808, i64 1824, i64 1840, i64 1856, i64 1872, i64 1888, i64 1904, i64 1920, i64 1936, i64 1952, i64 1968, i64 1984, i64 2000, i64 2016, i64 2032>
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER_7:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP35]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT:    [[TMP36:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_7]], zeroinitializer
+; AVX512-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 1792, i64 1808, i64 1824, i64 1840, i64 1856, i64 1872, i64 1888, i64 1904, i64 1920, i64 1936, i64 1952, i64 1968, i64 1984, i64 2000, i64 2016, i64 2032>, i32 1
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER7_7:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP37]], i32 4, <16 x i1> [[TMP36]], <16 x float> undef)
+; AVX512-NEXT:    [[TMP38:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_7]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT:    [[TMP39:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> <i64 1792, i64 1808, i64 1824, i64 1840, i64 1856, i64 1872, i64 1888, i64 1904, i64 1920, i64 1936, i64 1952, i64 1968, i64 1984, i64 2000, i64 2016, i64 2032>
+; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP38]], <16 x float addrspace(1)*> [[TMP39]], i32 4, <16 x i1> [[TMP36]])
+; AVX512-NEXT:    [[TMP40:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 2048, i64 2064, i64 2080, i64 2096, i64 2112, i64 2128, i64 2144, i64 2160, i64 2176, i64 2192, i64 2208, i64 2224, i64 2240, i64 2256, i64 2272, i64 2288>
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER_8:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP40]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT:    [[TMP41:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_8]], zeroinitializer
+; AVX512-NEXT:    [[TMP42:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 2048, i64 2064, i64 2080, i64 2096, i64 2112, i64 2128, i64 2144, i64 2160, i64 2176, i64 2192, i64 2208, i64 2224, i64 2240, i64 2256, i64 2272, i64 2288>, i32 1
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER7_8:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP42]], i32 4, <16 x i1> [[TMP41]], <16 x float> undef)
+; AVX512-NEXT:    [[TMP43:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_8]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT:    [[TMP44:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> <i64 2048, i64 2064, i64 2080, i64 2096, i64 2112, i64 2128, i64 2144, i64 2160, i64 2176, i64 2192, i64 2208, i64 2224, i64 2240, i64 2256, i64 2272, i64 2288>
+; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP43]], <16 x float addrspace(1)*> [[TMP44]], i32 4, <16 x i1> [[TMP41]])
+; AVX512-NEXT:    [[TMP45:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 2304, i64 2320, i64 2336, i64 2352, i64 2368, i64 2384, i64 2400, i64 2416, i64 2432, i64 2448, i64 2464, i64 2480, i64 2496, i64 2512, i64 2528, i64 2544>
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER_9:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP45]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT:    [[TMP46:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_9]], zeroinitializer
+; AVX512-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 2304, i64 2320, i64 2336, i64 2352, i64 2368, i64 2384, i64 2400, i64 2416, i64 2432, i64 2448, i64 2464, i64 2480, i64 2496, i64 2512, i64 2528, i64 2544>, i32 1
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER7_9:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP47]], i32 4, <16 x i1> [[TMP46]], <16 x float> undef)
+; AVX512-NEXT:    [[TMP48:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_9]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT:    [[TMP49:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> <i64 2304, i64 2320, i64 2336, i64 2352, i64 2368, i64 2384, i64 2400, i64 2416, i64 2432, i64 2448, i64 2464, i64 2480, i64 2496, i64 2512, i64 2528, i64 2544>
+; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP48]], <16 x float addrspace(1)*> [[TMP49]], i32 4, <16 x i1> [[TMP46]])
+; AVX512-NEXT:    [[TMP50:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 2560, i64 2576, i64 2592, i64 2608, i64 2624, i64 2640, i64 2656, i64 2672, i64 2688, i64 2704, i64 2720, i64 2736, i64 2752, i64 2768, i64 2784, i64 2800>
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER_10:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP50]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT:    [[TMP51:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_10]], zeroinitializer
+; AVX512-NEXT:    [[TMP52:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 2560, i64 2576, i64 2592, i64 2608, i64 2624, i64 2640, i64 2656, i64 2672, i64 2688, i64 2704, i64 2720, i64 2736, i64 2752, i64 2768, i64 2784, i64 2800>, i32 1
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER7_10:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP52]], i32 4, <16 x i1> [[TMP51]], <16 x float> undef)
+; AVX512-NEXT:    [[TMP53:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_10]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT:    [[TMP54:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> <i64 2560, i64 2576, i64 2592, i64 2608, i64 2624, i64 2640, i64 2656, i64 2672, i64 2688, i64 2704, i64 2720, i64 2736, i64 2752, i64 2768, i64 2784, i64 2800>
+; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP53]], <16 x float addrspace(1)*> [[TMP54]], i32 4, <16 x i1> [[TMP51]])
+; AVX512-NEXT:    [[TMP55:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 2816, i64 2832, i64 2848, i64 2864, i64 2880, i64 2896, i64 2912, i64 2928, i64 2944, i64 2960, i64 2976, i64 2992, i64 3008, i64 3024, i64 3040, i64 3056>
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER_11:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP55]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT:    [[TMP56:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_11]], zeroinitializer
+; AVX512-NEXT:    [[TMP57:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 2816, i64 2832, i64 2848, i64 2864, i64 2880, i64 2896, i64 2912, i64 2928, i64 2944, i64 2960, i64 2976, i64 2992, i64 3008, i64 3024, i64 3040, i64 3056>, i32 1
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER7_11:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP57]], i32 4, <16 x i1> [[TMP56]], <16 x float> undef)
+; AVX512-NEXT:    [[TMP58:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_11]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT:    [[TMP59:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> <i64 2816, i64 2832, i64 2848, i64 2864, i64 2880, i64 2896, i64 2912, i64 2928, i64 2944, i64 2960, i64 2976, i64 2992, i64 3008, i64 3024, i64 3040, i64 3056>
+; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP58]], <16 x float addrspace(1)*> [[TMP59]], i32 4, <16 x i1> [[TMP56]])
+; AVX512-NEXT:    [[TMP60:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 3072, i64 3088, i64 3104, i64 3120, i64 3136, i64 3152, i64 3168, i64 3184, i64 3200, i64 3216, i64 3232, i64 3248, i64 3264, i64 3280, i64 3296, i64 3312>
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER_12:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP60]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT:    [[TMP61:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_12]], zeroinitializer
+; AVX512-NEXT:    [[TMP62:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 3072, i64 3088, i64 3104, i64 3120, i64 3136, i64 3152, i64 3168, i64 3184, i64 3200, i64 3216, i64 3232, i64 3248, i64 3264, i64 3280, i64 3296, i64 3312>, i32 1
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER7_12:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP62]], i32 4, <16 x i1> [[TMP61]], <16 x float> undef)
+; AVX512-NEXT:    [[TMP63:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_12]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT:    [[TMP64:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> <i64 3072, i64 3088, i64 3104, i64 3120, i64 3136, i64 3152, i64 3168, i64 3184, i64 3200, i64 3216, i64 3232, i64 3248, i64 3264, i64 3280, i64 3296, i64 3312>
+; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP63]], <16 x float addrspace(1)*> [[TMP64]], i32 4, <16 x i1> [[TMP61]])
+; AVX512-NEXT:    [[TMP65:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 3328, i64 3344, i64 3360, i64 3376, i64 3392, i64 3408, i64 3424, i64 3440, i64 3456, i64 3472, i64 3488, i64 3504, i64 3520, i64 3536, i64 3552, i64 3568>
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER_13:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP65]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT:    [[TMP66:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_13]], zeroinitializer
+; AVX512-NEXT:    [[TMP67:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 3328, i64 3344, i64 3360, i64 3376, i64 3392, i64 3408, i64 3424, i64 3440, i64 3456, i64 3472, i64 3488, i64 3504, i64 3520, i64 3536, i64 3552, i64 3568>, i32 1
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER7_13:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP67]], i32 4, <16 x i1> [[TMP66]], <16 x float> undef)
+; AVX512-NEXT:    [[TMP68:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_13]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT:    [[TMP69:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> <i64 3328, i64 3344, i64 3360, i64 3376, i64 3392, i64 3408, i64 3424, i64 3440, i64 3456, i64 3472, i64 3488, i64 3504, i64 3520, i64 3536, i64 3552, i64 3568>
+; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP68]], <16 x float addrspace(1)*> [[TMP69]], i32 4, <16 x i1> [[TMP66]])
+; AVX512-NEXT:    [[TMP70:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 3584, i64 3600, i64 3616, i64 3632, i64 3648, i64 3664, i64 3680, i64 3696, i64 3712, i64 3728, i64 3744, i64 3760, i64 3776, i64 3792, i64 3808, i64 3824>
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER_14:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP70]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT:    [[TMP71:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_14]], zeroinitializer
+; AVX512-NEXT:    [[TMP72:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 3584, i64 3600, i64 3616, i64 3632, i64 3648, i64 3664, i64 3680, i64 3696, i64 3712, i64 3728, i64 3744, i64 3760, i64 3776, i64 3792, i64 3808, i64 3824>, i32 1
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER7_14:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP72]], i32 4, <16 x i1> [[TMP71]], <16 x float> undef)
+; AVX512-NEXT:    [[TMP73:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_14]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT:    [[TMP74:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> <i64 3584, i64 3600, i64 3616, i64 3632, i64 3648, i64 3664, i64 3680, i64 3696, i64 3712, i64 3728, i64 3744, i64 3760, i64 3776, i64 3792, i64 3808, i64 3824>
+; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP73]], <16 x float addrspace(1)*> [[TMP74]], i32 4, <16 x i1> [[TMP71]])
+; AVX512-NEXT:    [[TMP75:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 3840, i64 3856, i64 3872, i64 3888, i64 3904, i64 3920, i64 3936, i64 3952, i64 3968, i64 3984, i64 4000, i64 4016, i64 4032, i64 4048, i64 4064, i64 4080>
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER_15:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP75]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT:    [[TMP76:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_15]], zeroinitializer
+; AVX512-NEXT:    [[TMP77:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 3840, i64 3856, i64 3872, i64 3888, i64 3904, i64 3920, i64 3936, i64 3952, i64 3968, i64 3984, i64 4000, i64 4016, i64 4032, i64 4048, i64 4064, i64 4080>, i32 1
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER7_15:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP77]], i32 4, <16 x i1> [[TMP76]], <16 x float> undef)
+; AVX512-NEXT:    [[TMP78:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_15]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT:    [[TMP79:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> <i64 3840, i64 3856, i64 3872, i64 3888, i64 3904, i64 3920, i64 3936, i64 3952, i64 3968, i64 3984, i64 4000, i64 4016, i64 4032, i64 4048, i64 4064, i64 4080>
+; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP78]], <16 x float addrspace(1)*> [[TMP79]], i32 4, <16 x i1> [[TMP76]])
+; AVX512-NEXT:    ret void
+;
+; FVW2-LABEL: @foo2_addrspace3(
+; FVW2-NEXT:  entry:
+; FVW2-NEXT:    br label [[VECTOR_BODY:%.*]]
+; FVW2:       vector.body:
+; FVW2-NEXT:    [[INDEX6:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE30:%.*]] ]
+; FVW2-NEXT:    [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 16>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE30]] ]
+; FVW2-NEXT:    [[STEP_ADD:%.*]] = add <2 x i64> [[VEC_IND]], <i64 32, i64 32>
+; FVW2-NEXT:    [[STEP_ADD7:%.*]] = add <2 x i64> [[VEC_IND]], <i64 64, i64 64>
+; FVW2-NEXT:    [[STEP_ADD8:%.*]] = add <2 x i64> [[VEC_IND]], <i64 96, i64 96>
+; FVW2-NEXT:    [[OFFSET_IDX:%.*]] = shl i64 [[INDEX6]], 4
+; FVW2-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], <2 x i64> [[VEC_IND]]
+; FVW2-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <2 x i64> [[STEP_ADD]]
+; FVW2-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <2 x i64> [[STEP_ADD7]]
+; FVW2-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <2 x i64> [[STEP_ADD8]]
+; FVW2-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP0]], i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef)
+; FVW2-NEXT:    [[WIDE_MASKED_GATHER10:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP1]], i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef)
+; FVW2-NEXT:    [[WIDE_MASKED_GATHER11:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP2]], i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef)
+; FVW2-NEXT:    [[WIDE_MASKED_GATHER12:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP3]], i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef)
+; FVW2-NEXT:    [[TMP4:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer
+; FVW2-NEXT:    [[TMP5:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER10]], zeroinitializer
+; FVW2-NEXT:    [[TMP6:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER11]], zeroinitializer
+; FVW2-NEXT:    [[TMP7:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER12]], zeroinitializer
+; FVW2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], %struct.In* [[IN:%.*]], <2 x i64> [[VEC_IND]], i32 1
+; FVW2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <2 x i64> [[STEP_ADD]], i32 1
+; FVW2-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <2 x i64> [[STEP_ADD7]], i32 1
+; FVW2-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <2 x i64> [[STEP_ADD8]], i32 1
+; FVW2-NEXT:    [[WIDE_MASKED_GATHER13:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP8]], i32 4, <2 x i1> [[TMP4]], <2 x float> undef)
+; FVW2-NEXT:    [[WIDE_MASKED_GATHER14:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP9]], i32 4, <2 x i1> [[TMP5]], <2 x float> undef)
+; FVW2-NEXT:    [[WIDE_MASKED_GATHER15:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP10]], i32 4, <2 x i1> [[TMP6]], <2 x float> undef)
+; FVW2-NEXT:    [[WIDE_MASKED_GATHER16:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP11]], i32 4, <2 x i1> [[TMP7]], <2 x float> undef)
+; FVW2-NEXT:    [[TMP12:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER13]], <float 5.000000e-01, float 5.000000e-01>
+; FVW2-NEXT:    [[TMP13:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER14]], <float 5.000000e-01, float 5.000000e-01>
+; FVW2-NEXT:    [[TMP14:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER15]], <float 5.000000e-01, float 5.000000e-01>
+; FVW2-NEXT:    [[TMP15:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER16]], <float 5.000000e-01, float 5.000000e-01>
+; FVW2-NEXT:    [[TMP16:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0
+; FVW2-NEXT:    br i1 [[TMP16]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; FVW2:       pred.store.if:
+; FVW2-NEXT:    [[TMP17:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT:%.*]], i64 [[OFFSET_IDX]]
+; FVW2-NEXT:    [[TMP18:%.*]] = extractelement <2 x float> [[TMP12]], i32 0
+; FVW2-NEXT:    store float [[TMP18]], float addrspace(1)* [[TMP17]], align 4
+; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE]]
+; FVW2:       pred.store.continue:
+; FVW2-NEXT:    [[TMP19:%.*]] = extractelement <2 x i1> [[TMP4]], i32 1
+; FVW2-NEXT:    br i1 [[TMP19]], label [[PRED_STORE_IF17:%.*]], label [[PRED_STORE_CONTINUE18:%.*]]
+; FVW2:       pred.store.if17:
+; FVW2-NEXT:    [[TMP20:%.*]] = or i64 [[OFFSET_IDX]], 16
+; FVW2-NEXT:    [[TMP21:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP20]]
+; FVW2-NEXT:    [[TMP22:%.*]] = extractelement <2 x float> [[TMP12]], i32 1
+; FVW2-NEXT:    store float [[TMP22]], float addrspace(1)* [[TMP21]], align 4
+; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE18]]
+; FVW2:       pred.store.continue18:
+; FVW2-NEXT:    [[TMP23:%.*]] = extractelement <2 x i1> [[TMP5]], i32 0
+; FVW2-NEXT:    br i1 [[TMP23]], label [[PRED_STORE_IF19:%.*]], label [[PRED_STORE_CONTINUE20:%.*]]
+; FVW2:       pred.store.if19:
+; FVW2-NEXT:    [[TMP24:%.*]] = or i64 [[OFFSET_IDX]], 32
+; FVW2-NEXT:    [[TMP25:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP24]]
+; FVW2-NEXT:    [[TMP26:%.*]] = extractelement <2 x float> [[TMP13]], i32 0
+; FVW2-NEXT:    store float [[TMP26]], float addrspace(1)* [[TMP25]], align 4
+; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE20]]
+; FVW2:       pred.store.continue20:
+; FVW2-NEXT:    [[TMP27:%.*]] = extractelement <2 x i1> [[TMP5]], i32 1
+; FVW2-NEXT:    br i1 [[TMP27]], label [[PRED_STORE_IF21:%.*]], label [[PRED_STORE_CONTINUE22:%.*]]
+; FVW2:       pred.store.if21:
+; FVW2-NEXT:    [[TMP28:%.*]] = or i64 [[OFFSET_IDX]], 48
+; FVW2-NEXT:    [[TMP29:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP28]]
+; FVW2-NEXT:    [[TMP30:%.*]] = extractelement <2 x float> [[TMP13]], i32 1
+; FVW2-NEXT:    store float [[TMP30]], float addrspace(1)* [[TMP29]], align 4
+; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE22]]
+; FVW2:       pred.store.continue22:
+; FVW2-NEXT:    [[TMP31:%.*]] = extractelement <2 x i1> [[TMP6]], i32 0
+; FVW2-NEXT:    br i1 [[TMP31]], label [[PRED_STORE_IF23:%.*]], label [[PRED_STORE_CONTINUE24:%.*]]
+; FVW2:       pred.store.if23:
+; FVW2-NEXT:    [[TMP32:%.*]] = or i64 [[OFFSET_IDX]], 64
+; FVW2-NEXT:    [[TMP33:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP32]]
+; FVW2-NEXT:    [[TMP34:%.*]] = extractelement <2 x float> [[TMP14]], i32 0
+; FVW2-NEXT:    store float [[TMP34]], float addrspace(1)* [[TMP33]], align 4
+; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE24]]
+; FVW2:       pred.store.continue24:
+; FVW2-NEXT:    [[TMP35:%.*]] = extractelement <2 x i1> [[TMP6]], i32 1
+; FVW2-NEXT:    br i1 [[TMP35]], label [[PRED_STORE_IF25:%.*]], label [[PRED_STORE_CONTINUE26:%.*]]
+; FVW2:       pred.store.if25:
+; FVW2-NEXT:    [[TMP36:%.*]] = or i64 [[OFFSET_IDX]], 80
+; FVW2-NEXT:    [[TMP37:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP36]]
+; FVW2-NEXT:    [[TMP38:%.*]] = extractelement <2 x float> [[TMP14]], i32 1
+; FVW2-NEXT:    store float [[TMP38]], float addrspace(1)* [[TMP37]], align 4
+; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE26]]
+; FVW2:       pred.store.continue26:
+; FVW2-NEXT:    [[TMP39:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0
+; FVW2-NEXT:    br i1 [[TMP39]], label [[PRED_STORE_IF27:%.*]], label [[PRED_STORE_CONTINUE28:%.*]]
+; FVW2:       pred.store.if27:
+; FVW2-NEXT:    [[TMP40:%.*]] = or i64 [[OFFSET_IDX]], 96
+; FVW2-NEXT:    [[TMP41:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP40]]
+; FVW2-NEXT:    [[TMP42:%.*]] = extractelement <2 x float> [[TMP15]], i32 0
+; FVW2-NEXT:    store float [[TMP42]], float addrspace(1)* [[TMP41]], align 4
+; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE28]]
+; FVW2:       pred.store.continue28:
+; FVW2-NEXT:    [[TMP43:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1
+; FVW2-NEXT:    br i1 [[TMP43]], label [[PRED_STORE_IF29:%.*]], label [[PRED_STORE_CONTINUE30]]
+; FVW2:       pred.store.if29:
+; FVW2-NEXT:    [[TMP44:%.*]] = or i64 [[OFFSET_IDX]], 112
+; FVW2-NEXT:    [[TMP45:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP44]]
+; FVW2-NEXT:    [[TMP46:%.*]] = extractelement <2 x float> [[TMP15]], i32 1
+; FVW2-NEXT:    store float [[TMP46]], float addrspace(1)* [[TMP45]], align 4
+; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE30]]
+; FVW2:       pred.store.continue30:
+; FVW2-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX6]], 8
+; FVW2-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], <i64 128, i64 128>
+; FVW2-NEXT:    [[TMP47:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
+; FVW2-NEXT:    br i1 [[TMP47]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop !6
+; FVW2:       for.end:
+; FVW2-NEXT:    ret void
+;
+entry:
+  %in.addr = alloca %struct.In addrspace(0)*, align 8
+  %out.addr = alloca float addrspace(1)*, align 8
+  %trigger.addr = alloca i32*, align 8
+  %index.addr = alloca i32*, align 8
+  %i = alloca i32, align 4
+  store %struct.In addrspace(0)* %in, %struct.In addrspace(0)** %in.addr, align 8
+  store float addrspace(1)* %out, float addrspace(1)** %out.addr, align 8
+  store i32* %trigger, i32** %trigger.addr, align 8
+  store i32* %index, i32** %index.addr, align 8
+  store i32 0, i32* %i, align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %0 = load i32, i32* %i, align 4
+  %cmp = icmp slt i32 %0, 4096
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %1 = load i32, i32* %i, align 4
+  %idxprom = sext i32 %1 to i64
+  %2 = load i32*, i32** %trigger.addr, align 8
+  %arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom
+  %3 = load i32, i32* %arrayidx, align 4
+  %cmp1 = icmp sgt i32 %3, 0
+  br i1 %cmp1, label %if.then, label %if.end
+
+if.then:                                          ; preds = %for.body
+  %4 = load i32, i32* %i, align 4
+  %idxprom2 = sext i32 %4 to i64
+  %5 = load %struct.In addrspace(0)*, %struct.In addrspace(0)** %in.addr, align 8
+  %arrayidx3 = getelementptr inbounds %struct.In, %struct.In addrspace(0)* %5, i64 %idxprom2
+  %b = getelementptr inbounds %struct.In, %struct.In addrspace(0)* %arrayidx3, i32 0, i32 1
+  %6 = load float, float addrspace(0)* %b, align 4
+  %add = fadd float %6, 5.000000e-01
+  %7 = load i32, i32* %i, align 4
+  %idxprom4 = sext i32 %7 to i64
+  %8 = load float addrspace(1)*, float addrspace(1)** %out.addr, align 8
+  %arrayidx5 = getelementptr inbounds float, float addrspace(1)* %8, i64 %idxprom4
+  store float %add, float addrspace(1)* %arrayidx5, align 4
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %for.body
+  br label %for.inc
+
+for.inc:                                          ; preds = %if.end
+  %9 = load i32, i32* %i, align 4
+  %inc = add nsw i32 %9, 16
+  store i32 %inc, i32* %i, align 4
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}

Added: llvm/trunk/test/Transforms/LoopVectorize/X86/gcc-examples.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/X86/gcc-examples.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/X86/gcc-examples.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/X86/gcc-examples.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,77 @@
+; RUN: opt < %s  -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 -dce -instcombine -S | FileCheck %s
+; RUN: opt < %s  -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 -force-vector-interleave=0 -dce -instcombine -S | FileCheck %s -check-prefix=UNROLL
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+ at b = common global [2048 x i32] zeroinitializer, align 16
+ at c = common global [2048 x i32] zeroinitializer, align 16
+ at a = common global [2048 x i32] zeroinitializer, align 16
+
+; Select VF = 4 (the checks below expect <4 x i32> operations).
+;CHECK-LABEL: @example1(
+;CHECK: load <4 x i32>
+;CHECK: add nsw <4 x i32>
+;CHECK: store <4 x i32>
+;CHECK: ret void
+
+;UNROLL-LABEL: @example1(
+;UNROLL: load <4 x i32>
+;UNROLL: load <4 x i32>
+;UNROLL: add nsw <4 x i32>
+;UNROLL: add nsw <4 x i32>
+;UNROLL: store <4 x i32>
+;UNROLL: store <4 x i32>
+;UNROLL: ret void
+define void @example1() nounwind uwtable ssp {
+  br label %1
+
+; <label>:1                                       ; preds = %1, %0
+  %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ]
+  %2 = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 %indvars.iv
+  %3 = load i32, i32* %2, align 4
+  %4 = getelementptr inbounds [2048 x i32], [2048 x i32]* @c, i64 0, i64 %indvars.iv
+  %5 = load i32, i32* %4, align 4
+  %6 = add nsw i32 %5, %3
+  %7 = getelementptr inbounds [2048 x i32], [2048 x i32]* @a, i64 0, i64 %indvars.iv
+  store i32 %6, i32* %7, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 256
+  br i1 %exitcond, label %8, label %1
+
+; <label>:8                                       ; preds = %1
+  ret void
+}
+
+; Select VF=4 because sext <8 x i1> to <8 x i32> is expensive.
+;CHECK-LABEL: @example10b(
+;CHECK: load <4 x i16>
+;CHECK: sext <4 x i16>
+;CHECK: store <4 x i32>
+;CHECK: ret void
+;UNROLL-LABEL: @example10b(
+;UNROLL: load <4 x i16>
+;UNROLL: load <4 x i16>
+;UNROLL: store <4 x i32>
+;UNROLL: store <4 x i32>
+;UNROLL: ret void
+define void @example10b(i16* noalias nocapture %sa, i16* noalias nocapture %sb, i16* noalias nocapture %sc, i32* noalias nocapture %ia, i32* noalias nocapture %ib, i32* noalias nocapture %ic) nounwind uwtable ssp {
+  br label %1
+
+; <label>:1                                       ; preds = %1, %0
+  %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ]
+  %2 = getelementptr inbounds i16, i16* %sb, i64 %indvars.iv
+  %3 = load i16, i16* %2, align 2
+  %4 = sext i16 %3 to i32
+  %5 = getelementptr inbounds i32, i32* %ia, i64 %indvars.iv
+  store i32 %4, i32* %5, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 1024
+  br i1 %exitcond, label %6, label %1
+
+; <label>:6                                       ; preds = %1
+  ret void
+}
+

Added: llvm/trunk/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,240 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s  -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; PR15794
+; Incorrect addition of llvm.mem.parallel_loop_access metadata is undefined
+; behaviour. The vectorizer ignores the memory dependency checks, goes ahead,
+; and vectorizes this loop even though its uniform stores carry an output
+; dependency.
+
+; void foo(int *a, int *b, int k, int m) {
+;   for (int i = 0; i < m; i++) {
+;     for (int j = 0; j < m; j++) {
+;       a[i] = a[i + j + k] + 1; <<<
+;     }
+;     b[i] = b[i] + 3;
+;   }
+; }
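+;
+; A hedged sketch (not part of the reported reproducer) of how such metadata is
+; typically introduced: a "promise" pragma on the inner loop. The function name
+; foo_annotated is made up for illustration, and it is assumed here that
+; assume_safety is what attaches llvm.mem.parallel_loop_access to the loop's
+; memory accesses.
+;
+;   void foo_annotated(int *a, int *b, int k, int m) {
+;     for (int i = 0; i < m; i++) {
+;       #pragma clang loop vectorize(assume_safety)
+;       for (int j = 0; j < m; j++) {
+;         a[i] = a[i + j + k] + 1;   // still stores to the same a[i] every j
+;       }
+;       b[i] = b[i] + 3;
+;     }
+;   }
+;
+; The pragma asserts that the loop carries no unsafe memory dependences, so the
+; vectorizer is entitled to skip its dependence checks; because the assertion
+; is false here, the resulting behaviour is undefined.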
+
+; Function Attrs: nounwind uwtable
+define void @foo(i32* nocapture %a, i32* nocapture %b, i32 %k, i32 %m) #0 {
+; CHECK-LABEL: @foo(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP27:%.*]] = icmp sgt i32 [[M:%.*]], 0
+; CHECK-NEXT:    br i1 [[CMP27]], label [[FOR_BODY3_LR_PH_US_PREHEADER:%.*]], label [[FOR_END15:%.*]]
+; CHECK:       for.body3.lr.ph.us.preheader:
+; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[M]], -1
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
+; CHECK-NEXT:    [[TMP2:%.*]] = add i64 [[TMP1]], 1
+; CHECK-NEXT:    [[TMP3:%.*]] = zext i32 [[K:%.*]] to i64
+; CHECK-NEXT:    br label [[FOR_BODY3_LR_PH_US:%.*]]
+; CHECK:       for.end.us:
+; CHECK-NEXT:    [[ARRAYIDX9_US:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDVARS_IV33:%.*]]
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX9_US]], align 4, !llvm.mem.parallel_loop_access !0
+; CHECK-NEXT:    [[ADD10_US:%.*]] = add nsw i32 [[TMP4]], 3
+; CHECK-NEXT:    store i32 [[ADD10_US]], i32* [[ARRAYIDX9_US]], align 4, !llvm.mem.parallel_loop_access !0
+; CHECK-NEXT:    [[INDVARS_IV_NEXT34:%.*]] = add i64 [[INDVARS_IV33]], 1
+; CHECK-NEXT:    [[LFTR_WIDEIV35:%.*]] = trunc i64 [[INDVARS_IV_NEXT34]] to i32
+; CHECK-NEXT:    [[EXITCOND36:%.*]] = icmp eq i32 [[LFTR_WIDEIV35]], [[M]]
+; CHECK-NEXT:    br i1 [[EXITCOND36]], label [[FOR_END15_LOOPEXIT:%.*]], label [[FOR_BODY3_LR_PH_US]], !llvm.loop !2
+; CHECK:       for.body3.us:
+; CHECK-NEXT:    [[INDVARS_IV29:%.*]] = phi i64 [ [[BC_RESUME_VAL:%.*]], [[SCALAR_PH:%.*]] ], [ [[INDVARS_IV_NEXT30:%.*]], [[FOR_BODY3_US:%.*]] ]
+; CHECK-NEXT:    [[TMP5:%.*]] = trunc i64 [[INDVARS_IV29]] to i32
+; CHECK-NEXT:    [[ADD4_US:%.*]] = add i32 [[ADD_US:%.*]], [[TMP5]]
+; CHECK-NEXT:    [[IDXPROM_US:%.*]] = sext i32 [[ADD4_US]] to i64
+; CHECK-NEXT:    [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[IDXPROM_US]]
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, i32* [[ARRAYIDX_US]], align 4, !llvm.mem.parallel_loop_access !0
+; CHECK-NEXT:    [[ADD5_US:%.*]] = add nsw i32 [[TMP6]], 1
+; CHECK-NEXT:    store i32 [[ADD5_US]], i32* [[ARRAYIDX7_US:%.*]], align 4, !llvm.mem.parallel_loop_access !0
+; CHECK-NEXT:    [[INDVARS_IV_NEXT30]] = add i64 [[INDVARS_IV29]], 1
+; CHECK-NEXT:    [[LFTR_WIDEIV31:%.*]] = trunc i64 [[INDVARS_IV_NEXT30]] to i32
+; CHECK-NEXT:    [[EXITCOND32:%.*]] = icmp eq i32 [[LFTR_WIDEIV31]], [[M]]
+; CHECK-NEXT:    br i1 [[EXITCOND32]], label [[FOR_END_US:%.*]], label [[FOR_BODY3_US]], !llvm.loop !3
+; CHECK:       for.body3.lr.ph.us:
+; CHECK-NEXT:    [[INDVARS_IV33]] = phi i64 [ [[INDVARS_IV_NEXT34]], [[FOR_END_US]] ], [ 0, [[FOR_BODY3_LR_PH_US_PREHEADER]] ]
+; CHECK-NEXT:    [[TMP7:%.*]] = add i64 [[TMP3]], [[INDVARS_IV33]]
+; CHECK-NEXT:    [[TMP8:%.*]] = trunc i64 [[TMP7]] to i32
+; CHECK-NEXT:    [[TMP9:%.*]] = trunc i64 [[INDVARS_IV33]] to i32
+; CHECK-NEXT:    [[ADD_US]] = add i32 [[TMP9]], [[K]]
+; CHECK-NEXT:    [[ARRAYIDX7_US]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV33]]
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH]], label [[VECTOR_SCEVCHECK:%.*]]
+; CHECK:       vector.scevcheck:
+; CHECK-NEXT:    [[MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 1, i32 [[TMP0]])
+; CHECK-NEXT:    [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL]], 0
+; CHECK-NEXT:    [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL]], 1
+; CHECK-NEXT:    [[TMP10:%.*]] = add i32 [[TMP8]], [[MUL_RESULT]]
+; CHECK-NEXT:    [[TMP11:%.*]] = sub i32 [[TMP8]], [[MUL_RESULT]]
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp sgt i32 [[TMP11]], [[TMP8]]
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp slt i32 [[TMP10]], [[TMP8]]
+; CHECK-NEXT:    [[TMP14:%.*]] = select i1 false, i1 [[TMP12]], i1 [[TMP13]]
+; CHECK-NEXT:    [[TMP15:%.*]] = or i1 [[TMP14]], [[MUL_OVERFLOW]]
+; CHECK-NEXT:    [[TMP16:%.*]] = or i1 false, [[TMP15]]
+; CHECK-NEXT:    br i1 [[TMP16]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 4
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]]
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP17:%.*]] = trunc i64 [[INDEX]] to i32
+; CHECK-NEXT:    [[TMP18:%.*]] = add i32 [[TMP17]], 0
+; CHECK-NEXT:    [[TMP19:%.*]] = add i32 [[ADD_US]], [[TMP18]]
+; CHECK-NEXT:    [[TMP20:%.*]] = sext i32 [[TMP19]] to i64
+; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP20]]
+; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i32, i32* [[TMP21]], i32 0
+; CHECK-NEXT:    [[TMP23:%.*]] = bitcast i32* [[TMP22]] to <4 x i32>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP23]], align 4
+; CHECK-NEXT:    [[TMP24:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], <i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT:    [[TMP25:%.*]] = extractelement <4 x i32> [[TMP24]], i32 0
+; CHECK-NEXT:    store i32 [[TMP25]], i32* [[ARRAYIDX7_US]], align 4, !llvm.mem.parallel_loop_access !0
+; CHECK-NEXT:    [[TMP26:%.*]] = extractelement <4 x i32> [[TMP24]], i32 1
+; CHECK-NEXT:    store i32 [[TMP26]], i32* [[ARRAYIDX7_US]], align 4, !llvm.mem.parallel_loop_access !0
+; CHECK-NEXT:    [[TMP27:%.*]] = extractelement <4 x i32> [[TMP24]], i32 2
+; CHECK-NEXT:    store i32 [[TMP27]], i32* [[ARRAYIDX7_US]], align 4, !llvm.mem.parallel_loop_access !0
+; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <4 x i32> [[TMP24]], i32 3
+; CHECK-NEXT:    store i32 [[TMP28]], i32* [[ARRAYIDX7_US]], align 4, !llvm.mem.parallel_loop_access !0
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !5
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END_US]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY3_LR_PH_US]] ], [ 0, [[VECTOR_SCEVCHECK]] ]
+; CHECK-NEXT:    br label [[FOR_BODY3_US]]
+; CHECK:       for.end15.loopexit:
+; CHECK-NEXT:    br label [[FOR_END15]]
+; CHECK:       for.end15:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %cmp27 = icmp sgt i32 %m, 0
+  br i1 %cmp27, label %for.body3.lr.ph.us, label %for.end15
+
+for.end.us:                                       ; preds = %for.body3.us
+  %arrayidx9.us = getelementptr inbounds i32, i32* %b, i64 %indvars.iv33
+  %0 = load i32, i32* %arrayidx9.us, align 4, !llvm.mem.parallel_loop_access !3
+  %add10.us = add nsw i32 %0, 3
+  store i32 %add10.us, i32* %arrayidx9.us, align 4, !llvm.mem.parallel_loop_access !3
+  %indvars.iv.next34 = add i64 %indvars.iv33, 1
+  %lftr.wideiv35 = trunc i64 %indvars.iv.next34 to i32
+  %exitcond36 = icmp eq i32 %lftr.wideiv35, %m
+  br i1 %exitcond36, label %for.end15, label %for.body3.lr.ph.us, !llvm.loop !5
+
+for.body3.us:                                     ; preds = %for.body3.us, %for.body3.lr.ph.us
+  %indvars.iv29 = phi i64 [ 0, %for.body3.lr.ph.us ], [ %indvars.iv.next30, %for.body3.us ]
+  %1 = trunc i64 %indvars.iv29 to i32
+  %add4.us = add i32 %add.us, %1
+  %idxprom.us = sext i32 %add4.us to i64
+  %arrayidx.us = getelementptr inbounds i32, i32* %a, i64 %idxprom.us
+  %2 = load i32, i32* %arrayidx.us, align 4, !llvm.mem.parallel_loop_access !3
+  %add5.us = add nsw i32 %2, 1
+  store i32 %add5.us, i32* %arrayidx7.us, align 4, !llvm.mem.parallel_loop_access !3
+  %indvars.iv.next30 = add i64 %indvars.iv29, 1
+  %lftr.wideiv31 = trunc i64 %indvars.iv.next30 to i32
+  %exitcond32 = icmp eq i32 %lftr.wideiv31, %m
+  br i1 %exitcond32, label %for.end.us, label %for.body3.us, !llvm.loop !4
+
+for.body3.lr.ph.us:                               ; preds = %for.end.us, %entry
+  %indvars.iv33 = phi i64 [ %indvars.iv.next34, %for.end.us ], [ 0, %entry ]
+  %3 = trunc i64 %indvars.iv33 to i32
+  %add.us = add i32 %3, %k
+  %arrayidx7.us = getelementptr inbounds i32, i32* %a, i64 %indvars.iv33
+  br label %for.body3.us
+
+for.end15:                                        ; preds = %for.end.us, %entry
+  ret void
+}
+
+; Same test as above, but without the invalid parallel_loop_access metadata.
+
+; Here the vectorizer performs the memory dependence checks and correctly
+; decides that it is unsafe to vectorize the loop.
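+; For illustration (hypothetical values), two consecutive inner-loop iterations
+; are
+;
+;   a[i] = a[i + 0 + k] + 1;   // j == 0
+;   a[i] = a[i + 1 + k] + 1;   // j == 1, overwrites the j == 0 store
+;
+; i.e. an output dependence on a[i] that the dependence checks report, so this
+; version is expected to stay scalar.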
+define void @no-par-mem-metadata(i32* nocapture %a, i32* nocapture %b, i32 %k, i32 %m) #0 {
+; CHECK-LABEL: @no-par-mem-metadata(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP27:%.*]] = icmp sgt i32 [[M:%.*]], 0
+; CHECK-NEXT:    br i1 [[CMP27]], label [[FOR_BODY3_LR_PH_US_PREHEADER:%.*]], label [[FOR_END15:%.*]]
+; CHECK:       for.body3.lr.ph.us.preheader:
+; CHECK-NEXT:    br label [[FOR_BODY3_LR_PH_US:%.*]]
+; CHECK:       for.end.us:
+; CHECK-NEXT:    [[ARRAYIDX9_US:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDVARS_IV33:%.*]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX9_US]], align 4
+; CHECK-NEXT:    [[ADD10_US:%.*]] = add nsw i32 [[TMP0]], 3
+; CHECK-NEXT:    store i32 [[ADD10_US]], i32* [[ARRAYIDX9_US]], align 4
+; CHECK-NEXT:    [[INDVARS_IV_NEXT34:%.*]] = add i64 [[INDVARS_IV33]], 1
+; CHECK-NEXT:    [[LFTR_WIDEIV35:%.*]] = trunc i64 [[INDVARS_IV_NEXT34]] to i32
+; CHECK-NEXT:    [[EXITCOND36:%.*]] = icmp eq i32 [[LFTR_WIDEIV35]], [[M]]
+; CHECK-NEXT:    br i1 [[EXITCOND36]], label [[FOR_END15_LOOPEXIT:%.*]], label [[FOR_BODY3_LR_PH_US]], !llvm.loop !2
+; CHECK:       for.body3.us:
+; CHECK-NEXT:    [[INDVARS_IV29:%.*]] = phi i64 [ 0, [[FOR_BODY3_LR_PH_US]] ], [ [[INDVARS_IV_NEXT30:%.*]], [[FOR_BODY3_US:%.*]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc i64 [[INDVARS_IV29]] to i32
+; CHECK-NEXT:    [[ADD4_US:%.*]] = add i32 [[ADD_US:%.*]], [[TMP1]]
+; CHECK-NEXT:    [[IDXPROM_US:%.*]] = sext i32 [[ADD4_US]] to i64
+; CHECK-NEXT:    [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[IDXPROM_US]]
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX_US]], align 4
+; CHECK-NEXT:    [[ADD5_US:%.*]] = add nsw i32 [[TMP2]], 1
+; CHECK-NEXT:    store i32 [[ADD5_US]], i32* [[ARRAYIDX7_US:%.*]], align 4
+; CHECK-NEXT:    [[INDVARS_IV_NEXT30]] = add i64 [[INDVARS_IV29]], 1
+; CHECK-NEXT:    [[LFTR_WIDEIV31:%.*]] = trunc i64 [[INDVARS_IV_NEXT30]] to i32
+; CHECK-NEXT:    [[EXITCOND32:%.*]] = icmp eq i32 [[LFTR_WIDEIV31]], [[M]]
+; CHECK-NEXT:    br i1 [[EXITCOND32]], label [[FOR_END_US:%.*]], label [[FOR_BODY3_US]], !llvm.loop !1
+; CHECK:       for.body3.lr.ph.us:
+; CHECK-NEXT:    [[INDVARS_IV33]] = phi i64 [ [[INDVARS_IV_NEXT34]], [[FOR_END_US]] ], [ 0, [[FOR_BODY3_LR_PH_US_PREHEADER]] ]
+; CHECK-NEXT:    [[TMP3:%.*]] = trunc i64 [[INDVARS_IV33]] to i32
+; CHECK-NEXT:    [[ADD_US]] = add i32 [[TMP3]], [[K:%.*]]
+; CHECK-NEXT:    [[ARRAYIDX7_US]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV33]]
+; CHECK-NEXT:    br label [[FOR_BODY3_US]]
+; CHECK:       for.end15.loopexit:
+; CHECK-NEXT:    br label [[FOR_END15]]
+; CHECK:       for.end15:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %cmp27 = icmp sgt i32 %m, 0
+  br i1 %cmp27, label %for.body3.lr.ph.us, label %for.end15
+
+for.end.us:                                       ; preds = %for.body3.us
+  %arrayidx9.us = getelementptr inbounds i32, i32* %b, i64 %indvars.iv33
+  %0 = load i32, i32* %arrayidx9.us, align 4
+  %add10.us = add nsw i32 %0, 3
+  store i32 %add10.us, i32* %arrayidx9.us, align 4
+  %indvars.iv.next34 = add i64 %indvars.iv33, 1
+  %lftr.wideiv35 = trunc i64 %indvars.iv.next34 to i32
+  %exitcond36 = icmp eq i32 %lftr.wideiv35, %m
+  br i1 %exitcond36, label %for.end15, label %for.body3.lr.ph.us, !llvm.loop !5
+
+for.body3.us:                                     ; preds = %for.body3.us, %for.body3.lr.ph.us
+  %indvars.iv29 = phi i64 [ 0, %for.body3.lr.ph.us ], [ %indvars.iv.next30, %for.body3.us ]
+  %1 = trunc i64 %indvars.iv29 to i32
+  %add4.us = add i32 %add.us, %1
+  %idxprom.us = sext i32 %add4.us to i64
+  %arrayidx.us = getelementptr inbounds i32, i32* %a, i64 %idxprom.us
+  %2 = load i32, i32* %arrayidx.us, align 4
+  %add5.us = add nsw i32 %2, 1
+  store i32 %add5.us, i32* %arrayidx7.us, align 4
+  %indvars.iv.next30 = add i64 %indvars.iv29, 1
+  %lftr.wideiv31 = trunc i64 %indvars.iv.next30 to i32
+  %exitcond32 = icmp eq i32 %lftr.wideiv31, %m
+  br i1 %exitcond32, label %for.end.us, label %for.body3.us, !llvm.loop !4
+
+for.body3.lr.ph.us:                               ; preds = %for.end.us, %entry
+  %indvars.iv33 = phi i64 [ %indvars.iv.next34, %for.end.us ], [ 0, %entry ]
+  %3 = trunc i64 %indvars.iv33 to i32
+  %add.us = add i32 %3, %k
+  %arrayidx7.us = getelementptr inbounds i32, i32* %a, i64 %indvars.iv33
+  br label %for.body3.us
+
+for.end15:                                        ; preds = %for.end.us, %entry
+  ret void
+}
+
+attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!3 = !{!4, !5}
+!4 = !{!4}
+!5 = !{!5}
+

Added: llvm/trunk/test/Transforms/LoopVectorize/X86/imprecise-through-phis.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/X86/imprecise-through-phis.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/X86/imprecise-through-phis.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/X86/imprecise-through-phis.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,177 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -loop-vectorize -mtriple=x86_64-apple-darwin %s | FileCheck %s --check-prefixes=CHECK,SSE
+; RUN: opt -S -loop-vectorize -mtriple=x86_64-apple-darwin -mattr=+avx %s | FileCheck %s --check-prefixes=CHECK,AVX
+
+; Two mostly identical functions. The only difference is the presence of
+; fast-math flags on the second. The loop is a pretty simple reduction:
+
+; for (int i = 0; i < 32; ++i)
+;   if (arr[i] != 42)
+;     tot += arr[i];
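+;
+; A hedged sketch of why the flags matter (illustrative only): fast-math allows
+; the ordered chain of additions to be reassociated into independent partial
+; sums, which is what the vectorized reduction relies on.
+;
+;   double tot0 = 0, tot1 = 0, tot2 = 0, tot3 = 0;
+;   for (int i = 0; i < 32; i += 4) {
+;     if (arr[i + 0] != 42) tot0 += arr[i + 0];
+;     if (arr[i + 1] != 42) tot1 += arr[i + 1];
+;     if (arr[i + 2] != 42) tot2 += arr[i + 2];
+;     if (arr[i + 3] != 42) tot3 += arr[i + 3];
+;   }
+;   double tot = (tot0 + tot1) + (tot2 + tot3);
+;
+; Without the flags this rewrite could change the rounding of the result, so
+; only the second function is a vectorization candidate.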
+
+define double @sumIfScalar(double* nocapture readonly %arr) {
+; CHECK-LABEL: @sumIfScalar(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[I_NEXT:%.*]], [[NEXT_ITER:%.*]] ]
+; CHECK-NEXT:    [[TOT:%.*]] = phi double [ 0.000000e+00, [[ENTRY]] ], [ [[TOT_NEXT:%.*]], [[NEXT_ITER]] ]
+; CHECK-NEXT:    [[ADDR:%.*]] = getelementptr double, double* [[ARR:%.*]], i32 [[I]]
+; CHECK-NEXT:    [[NEXTVAL:%.*]] = load double, double* [[ADDR]]
+; CHECK-NEXT:    [[TST:%.*]] = fcmp une double [[NEXTVAL]], 4.200000e+01
+; CHECK-NEXT:    br i1 [[TST]], label [[DO_ADD:%.*]], label [[NO_ADD:%.*]]
+; CHECK:       do.add:
+; CHECK-NEXT:    [[TOT_NEW:%.*]] = fadd double [[TOT]], [[NEXTVAL]]
+; CHECK-NEXT:    br label [[NEXT_ITER]]
+; CHECK:       no.add:
+; CHECK-NEXT:    br label [[NEXT_ITER]]
+; CHECK:       next.iter:
+; CHECK-NEXT:    [[TOT_NEXT]] = phi double [ [[TOT]], [[NO_ADD]] ], [ [[TOT_NEW]], [[DO_ADD]] ]
+; CHECK-NEXT:    [[I_NEXT]] = add i32 [[I]], 1
+; CHECK-NEXT:    [[AGAIN:%.*]] = icmp ult i32 [[I_NEXT]], 32
+; CHECK-NEXT:    br i1 [[AGAIN]], label [[LOOP]], label [[DONE:%.*]]
+; CHECK:       done:
+; CHECK-NEXT:    [[TOT_NEXT_LCSSA:%.*]] = phi double [ [[TOT_NEXT]], [[NEXT_ITER]] ]
+; CHECK-NEXT:    ret double [[TOT_NEXT_LCSSA]]
+;
+entry:
+  br label %loop
+
+loop:
+  %i = phi i32 [0, %entry], [%i.next, %next.iter]
+  %tot = phi double [0.0, %entry], [%tot.next, %next.iter]
+
+  %addr = getelementptr double, double* %arr, i32 %i
+  %nextval = load double, double* %addr
+
+  %tst = fcmp une double %nextval, 42.0
+  br i1 %tst, label %do.add, label %no.add
+
+do.add:
+  %tot.new = fadd double %tot, %nextval
+  br label %next.iter
+
+no.add:
+  br label %next.iter
+
+next.iter:
+  %tot.next = phi double [%tot, %no.add], [%tot.new, %do.add]
+  %i.next = add i32 %i, 1
+  %again = icmp ult i32 %i.next, 32
+  br i1 %again, label %loop, label %done
+
+done:
+  ret double %tot.next
+}
+
+define double @sumIfVector(double* nocapture readonly %arr) {
+; SSE-LABEL: @sumIfVector(
+; SSE-NEXT:  entry:
+; SSE-NEXT:    br label [[LOOP:%.*]]
+; SSE:       loop:
+; SSE-NEXT:    [[I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[I_NEXT:%.*]], [[NEXT_ITER:%.*]] ]
+; SSE-NEXT:    [[TOT:%.*]] = phi double [ 0.000000e+00, [[ENTRY]] ], [ [[TOT_NEXT:%.*]], [[NEXT_ITER]] ]
+; SSE-NEXT:    [[ADDR:%.*]] = getelementptr double, double* [[ARR:%.*]], i32 [[I]]
+; SSE-NEXT:    [[NEXTVAL:%.*]] = load double, double* [[ADDR]]
+; SSE-NEXT:    [[TST:%.*]] = fcmp fast une double [[NEXTVAL]], 4.200000e+01
+; SSE-NEXT:    br i1 [[TST]], label [[DO_ADD:%.*]], label [[NO_ADD:%.*]]
+; SSE:       do.add:
+; SSE-NEXT:    [[TOT_NEW:%.*]] = fadd fast double [[TOT]], [[NEXTVAL]]
+; SSE-NEXT:    br label [[NEXT_ITER]]
+; SSE:       no.add:
+; SSE-NEXT:    br label [[NEXT_ITER]]
+; SSE:       next.iter:
+; SSE-NEXT:    [[TOT_NEXT]] = phi double [ [[TOT]], [[NO_ADD]] ], [ [[TOT_NEW]], [[DO_ADD]] ]
+; SSE-NEXT:    [[I_NEXT]] = add i32 [[I]], 1
+; SSE-NEXT:    [[AGAIN:%.*]] = icmp ult i32 [[I_NEXT]], 32
+; SSE-NEXT:    br i1 [[AGAIN]], label [[LOOP]], label [[DONE:%.*]]
+; SSE:       done:
+; SSE-NEXT:    [[TOT_NEXT_LCSSA:%.*]] = phi double [ [[TOT_NEXT]], [[NEXT_ITER]] ]
+; SSE-NEXT:    ret double [[TOT_NEXT_LCSSA]]
+;
+; AVX-LABEL: @sumIfVector(
+; AVX-NEXT:  entry:
+; AVX-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; AVX:       vector.ph:
+; AVX-NEXT:    br label [[VECTOR_BODY:%.*]]
+; AVX:       vector.body:
+; AVX-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; AVX-NEXT:    [[VEC_PHI:%.*]] = phi <4 x double> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PREDPHI:%.*]], [[VECTOR_BODY]] ]
+; AVX-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[INDEX]], i32 0
+; AVX-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer
+; AVX-NEXT:    [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3>
+; AVX-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
+; AVX-NEXT:    [[TMP1:%.*]] = getelementptr double, double* [[ARR:%.*]], i32 [[TMP0]]
+; AVX-NEXT:    [[TMP2:%.*]] = getelementptr double, double* [[TMP1]], i32 0
+; AVX-NEXT:    [[TMP3:%.*]] = bitcast double* [[TMP2]] to <4 x double>*
+; AVX-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x double>, <4 x double>* [[TMP3]], align 8
+; AVX-NEXT:    [[TMP4:%.*]] = fcmp fast une <4 x double> [[WIDE_LOAD]], <double 4.200000e+01, double 4.200000e+01, double 4.200000e+01, double 4.200000e+01>
+; AVX-NEXT:    [[TMP5:%.*]] = fadd fast <4 x double> [[VEC_PHI]], [[WIDE_LOAD]]
+; AVX-NEXT:    [[TMP6:%.*]] = xor <4 x i1> [[TMP4]], <i1 true, i1 true, i1 true, i1 true>
+; AVX-NEXT:    [[PREDPHI]] = select <4 x i1> [[TMP4]], <4 x double> [[TMP5]], <4 x double> [[VEC_PHI]]
+; AVX-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
+; AVX-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 32
+; AVX-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
+; AVX:       middle.block:
+; AVX-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x double> [[PREDPHI]], <4 x double> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; AVX-NEXT:    [[BIN_RDX:%.*]] = fadd fast <4 x double> [[PREDPHI]], [[RDX_SHUF]]
+; AVX-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <4 x double> [[BIN_RDX]], <4 x double> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; AVX-NEXT:    [[BIN_RDX2:%.*]] = fadd fast <4 x double> [[BIN_RDX]], [[RDX_SHUF1]]
+; AVX-NEXT:    [[TMP8:%.*]] = extractelement <4 x double> [[BIN_RDX2]], i32 0
+; AVX-NEXT:    [[CMP_N:%.*]] = icmp eq i32 32, 32
+; AVX-NEXT:    br i1 [[CMP_N]], label [[DONE:%.*]], label [[SCALAR_PH]]
+; AVX:       scalar.ph:
+; AVX-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 32, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; AVX-NEXT:    [[BC_MERGE_RDX:%.*]] = phi double [ 0.000000e+00, [[ENTRY]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ]
+; AVX-NEXT:    br label [[LOOP:%.*]]
+; AVX:       loop:
+; AVX-NEXT:    [[I:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[NEXT_ITER:%.*]] ]
+; AVX-NEXT:    [[TOT:%.*]] = phi double [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[TOT_NEXT:%.*]], [[NEXT_ITER]] ]
+; AVX-NEXT:    [[ADDR:%.*]] = getelementptr double, double* [[ARR]], i32 [[I]]
+; AVX-NEXT:    [[NEXTVAL:%.*]] = load double, double* [[ADDR]]
+; AVX-NEXT:    [[TST:%.*]] = fcmp fast une double [[NEXTVAL]], 4.200000e+01
+; AVX-NEXT:    br i1 [[TST]], label [[DO_ADD:%.*]], label [[NO_ADD:%.*]]
+; AVX:       do.add:
+; AVX-NEXT:    [[TOT_NEW:%.*]] = fadd fast double [[TOT]], [[NEXTVAL]]
+; AVX-NEXT:    br label [[NEXT_ITER]]
+; AVX:       no.add:
+; AVX-NEXT:    br label [[NEXT_ITER]]
+; AVX:       next.iter:
+; AVX-NEXT:    [[TOT_NEXT]] = phi double [ [[TOT]], [[NO_ADD]] ], [ [[TOT_NEW]], [[DO_ADD]] ]
+; AVX-NEXT:    [[I_NEXT]] = add i32 [[I]], 1
+; AVX-NEXT:    [[AGAIN:%.*]] = icmp ult i32 [[I_NEXT]], 32
+; AVX-NEXT:    br i1 [[AGAIN]], label [[LOOP]], label [[DONE]], !llvm.loop !2
+; AVX:       done:
+; AVX-NEXT:    [[TOT_NEXT_LCSSA:%.*]] = phi double [ [[TOT_NEXT]], [[NEXT_ITER]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ]
+; AVX-NEXT:    ret double [[TOT_NEXT_LCSSA]]
+;
+entry:
+  br label %loop
+
+loop:
+  %i = phi i32 [0, %entry], [%i.next, %next.iter]
+  %tot = phi double [0.0, %entry], [%tot.next, %next.iter]
+
+  %addr = getelementptr double, double* %arr, i32 %i
+  %nextval = load double, double* %addr
+
+  %tst = fcmp fast une double %nextval, 42.0
+  br i1 %tst, label %do.add, label %no.add
+
+do.add:
+  %tot.new = fadd fast double %tot, %nextval
+  br label %next.iter
+
+no.add:
+  br label %next.iter
+
+next.iter:
+  %tot.next = phi double [%tot, %no.add], [%tot.new, %do.add]
+  %i.next = add i32 %i, 1
+  %again = icmp ult i32 %i.next, 32
+  br i1 %again, label %loop, label %done
+
+done:
+  ret double %tot.next
+}
+

Added: llvm/trunk/test/Transforms/LoopVectorize/X86/int128_no_gather.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/X86/int128_no_gather.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/X86/int128_no_gather.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/X86/int128_no_gather.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,76 @@
+; RUN: opt < %s -loop-vectorize -S | FileCheck %s
+
+; This test checks that gather/scatter instructions are not used for the i128 data type.
+;CHECK-NOT: gather
+;CHECK-NOT: scatter
+
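+; A rough C-level sketch of the kernel below (illustrative only; variable
+; names are assumptions, not recovered from the original source):
+;
+;   __int128 x[151];
+;   __int128 j = 99999, i = 1;
+;   do {
+;     x[i]     = j & 32767;
+;     x[i + 1] = (j + 11111) & 32767;
+;     x[i + 2] = (j + 22222) & 32767;
+;     j += 33333;
+;     i += 3;
+;   } while (i < 149);
+;
+; Because the stored elements are i128, the vectorizer must not lower these
+; accesses to gather/scatter intrinsics.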
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+ at x = common global [151 x i128] zeroinitializer, align 16
+ at .str = private unnamed_addr constant [46 x i8] c" PASS.....Y3 1/1 (BUBBLE SORT), X(25) = 5085\0A\00", align 1
+ at .str.1 = private unnamed_addr constant [44 x i8] c" FAIL.....Y3 1/1 (BUBBLE SORT), X(25) = %d\0A\00", align 1
+ at str = private unnamed_addr constant [45 x i8] c" PASS.....Y3 1/1 (BUBBLE SORT), X(25) = 5085\00"
+
+; Function Attrs: noinline nounwind uwtable
+declare i32 @y3inner() #0 
+
+define i32 @main() local_unnamed_addr #0 {
+entry:
+  br label %do.body
+
+do.body:                                          ; preds = %do.body, %entry
+  %j.0 = phi i128 [ 99999, %entry ], [ %add10, %do.body ]
+  %i.0 = phi i128 [ 1, %entry ], [ %add11, %do.body ]
+  %and = and i128 %j.0, 32767
+  %idxprom = trunc i128 %i.0 to i64
+  %arrayidx = getelementptr inbounds [151 x i128], [151 x i128]* @x, i64 0, i64 %idxprom
+  store i128 %and, i128* %arrayidx, align 16
+  %add = add nuw nsw i128 %j.0, 11111
+  %and1 = and i128 %add, 32767
+  %add2 = add nuw nsw i128 %i.0, 1
+  %idxprom3 = trunc i128 %add2 to i64
+  %arrayidx4 = getelementptr inbounds [151 x i128], [151 x i128]* @x, i64 0, i64 %idxprom3
+  store i128 %and1, i128* %arrayidx4, align 16
+  %add5 = add nuw nsw i128 %j.0, 22222
+  %and6 = and i128 %add5, 32767
+  %add7 = add nuw nsw i128 %i.0, 2
+  %idxprom8 = trunc i128 %add7 to i64
+  %arrayidx9 = getelementptr inbounds [151 x i128], [151 x i128]* @x, i64 0, i64 %idxprom8
+  store i128 %and6, i128* %arrayidx9, align 16
+  %add10 = add nuw nsw i128 %j.0, 33333
+  %add11 = add nuw nsw i128 %i.0, 3
+  %cmp = icmp slt i128 %add11, 149
+  br i1 %cmp, label %do.body, label %do.end
+
+do.end:                                           ; preds = %do.body
+  store i128 1766649, i128* getelementptr inbounds ([151 x i128], [151 x i128]* @x, i64 0, i64 149), align 16
+  store i128 1766649, i128* getelementptr inbounds ([151 x i128], [151 x i128]* @x, i64 0, i64 150), align 16
+  %call = tail call i32 @y3inner()
+  %0 = load i128, i128* getelementptr inbounds ([151 x i128], [151 x i128]* @x, i64 0, i64 25), align 16
+  %cmp12 = icmp eq i128 %0, 5085
+  br i1 %cmp12, label %if.then, label %if.else
+
+if.then:                                          ; preds = %do.end
+  %puts = tail call i32 @puts(i8* getelementptr inbounds ([45 x i8], [45 x i8]* @str, i64 0, i64 0))
+  br label %if.end
+
+if.else:                                          ; preds = %do.end
+  %coerce.sroa.0.0.extract.trunc = trunc i128 %0 to i64
+  %coerce.sroa.2.0.extract.shift = lshr i128 %0, 64
+  %coerce.sroa.2.0.extract.trunc = trunc i128 %coerce.sroa.2.0.extract.shift to i64
+  %call14 = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([44 x i8], [44 x i8]* @.str.1, i64 0, i64 0), i64 %coerce.sroa.0.0.extract.trunc, i64 %coerce.sroa.2.0.extract.trunc)
+  br label %if.end
+
+if.end:                                           ; preds = %if.else, %if.then
+  ret i32 0
+}
+
+; Function Attrs: nounwind
+declare i32 @printf(i8*, ...) #1
+; Function Attrs: nounwind
+declare i32 @puts(i8* nocapture readonly) #2
+
+attributes #0 = { noinline nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="skylake-avx512" "target-features"="+adx,+aes,+avx,+avx2,+avx512bw,+avx512cd,+avx512dq,+avx512f,+avx512vl,+bmi,+bmi2,+clflushopt,+clwb,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+lzcnt,+mmx,+movbe,+mpx,+pclmul,+pku,+popcnt,+rdrnd,+rdseed,+rtm,+sgx,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsavec,+xsaveopt,+xsaves" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="skylake-avx512" "target-features"="+adx,+aes,+avx,+avx2,+avx512bw,+avx512cd,+avx512dq,+avx512f,+avx512vl,+bmi,+bmi2,+clflushopt,+clwb,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+lzcnt,+mmx,+movbe,+mpx,+pclmul,+pku,+popcnt,+rdrnd,+rdseed,+rtm,+sgx,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsavec,+xsaveopt,+xsaves" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nounwind }

Added: llvm/trunk/test/Transforms/LoopVectorize/X86/interleaved-accesses-large-gap.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/X86/interleaved-accesses-large-gap.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/X86/interleaved-accesses-large-gap.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/X86/interleaved-accesses-large-gap.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,40 @@
+; RUN: opt < %s  -loop-vectorize -mtriple x86_64 -S | FileCheck %s
+
+%struct.ST4 = type { i32, i32, i32, i32 }
+
+; The gaps between the memory accesses in this function are too large for
+; interleaving.
+
+; Test from https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=7560
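+; Roughly (illustrative C sketch, not taken from the original source; the
+; second store uses an index far outside the struct, which is what creates
+; the large gap):
+;
+;   struct ST4 { int a, b, c, d; };
+;   for (long i = 0; i < 1024; i++) {
+;     B[i].a = 65536;
+;     *((int *)&B[i].a - 2147483648LL) = 65536;  /* huge negative offset */
+;     B[i].c = 10;
+;     B[i].d = 12;
+;   }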
+define void @test1(%struct.ST4* noalias %B) {
+; CHECK-LABEL: @test1
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label %for.body
+
+; CHECK-LABEL: for.body:
+; CHECK: store i32
+; CHECK: store i32
+; CHECK: store i32
+; CHECK: store i32
+; CHECK-NOT: store
+;
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %p1 = getelementptr inbounds %struct.ST4, %struct.ST4* %B, i64 %indvars.iv, i32 0
+  store i32 65536, i32* %p1, align 4
+  %p2 = getelementptr i32, i32* %p1, i32 -2147483648
+  store i32 65536, i32* %p2, align 4
+  %p3 = getelementptr inbounds %struct.ST4, %struct.ST4* %B, i64 %indvars.iv, i32 2
+  store i32 10, i32* %p3, align 4
+  %p4 = getelementptr inbounds %struct.ST4, %struct.ST4* %B, i64 %indvars.iv, i32 3
+  store i32 12, i32* %p4, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1024
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body
+  ret void
+}

Added: llvm/trunk/test/Transforms/LoopVectorize/X86/interleaving.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/X86/interleaving.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/X86/interleaving.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/X86/interleaving.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,36 @@
+; RUN: opt -S -mtriple=x86_64-pc_linux -loop-vectorize -instcombine < %s | FileCheck %s --check-prefix=NORMAL
+; RUN: opt -S -mtriple=x86_64-pc_linux -loop-vectorize -instcombine -mcpu=slm < %s | FileCheck %s --check-prefix=NORMAL
+; RUN: opt -S -mtriple=x86_64-pc_linux -loop-vectorize -instcombine -mcpu=atom < %s | FileCheck %s --check-prefix=ATOM
+
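+; The loop below computes a[i] = b[2*i] + b[2*i+1]. On generic x86 and on
+; SLM the two strided loads are expected to be vectorized as a single wide
+; <8 x i32> load that is then de-interleaved with shufflevectors, whereas on
+; Atom the loop is expected to remain scalar.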
+; NORMAL-LABEL: foo
+; NORMAL: %[[WIDE:.*]] = load <8 x i32>, <8 x i32>* %{{.*}}, align 4
+; NORMAL: %[[STRIDED1:.*]] = shufflevector <8 x i32> %[[WIDE]], <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; NORMAL: %[[STRIDED2:.*]] = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; NORMAL: add nsw <4 x i32> %[[STRIDED2]], %[[STRIDED1]]
+
+; ATOM-LABEL: foo
+; ATOM: load i32
+; ATOM: load i32
+; ATOM: store i32
+define void @foo(i32* noalias nocapture %a, i32* noalias nocapture readonly %b) {
+entry:
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body
+  ret void
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %0 = shl nsw i64 %indvars.iv, 1
+  %arrayidx = getelementptr inbounds i32, i32* %b, i64 %0
+  %1 = load i32, i32* %arrayidx, align 4
+  %2 = or i64 %0, 1
+  %arrayidx3 = getelementptr inbounds i32, i32* %b, i64 %2
+  %3 = load i32, i32* %arrayidx3, align 4
+  %add4 = add nsw i32 %3, %1
+  %arrayidx6 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+  store i32 %add4, i32* %arrayidx6, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1024
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}

Added: llvm/trunk/test/Transforms/LoopVectorize/X86/invariant-load-gather.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/X86/invariant-load-gather.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/X86/invariant-load-gather.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/X86/invariant-load-gather.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,93 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -loop-vectorize -S -mattr=avx512f -instcombine < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
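+; The loop below stores (int)n to b[i] on every iteration and, when the
+; invariant pointer a is non-null, loads the value it points to; the value
+; from the final iteration is returned. Roughly (illustrative sketch, names
+; are assumptions):
+;
+;   int r;
+;   for (long i = 0; i < n; i++) {
+;     b[i] = (int)n;
+;     r = (a != NULL) ? *a : 1;
+;   }
+;   return r;
+;
+; The checks below expect the conditional invariant load to be vectorized,
+; with a single masked gather of the splatted pointer emitted in
+; middle.block rather than inside the vector loop.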
+define i32 @inv_load_conditional(i32* %a, i64 %n, i32* %b, i32 %k) {
+; CHECK-LABEL: @inv_load_conditional(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[NTRUNC:%.*]] = trunc i64 [[N:%.*]] to i32
+; CHECK-NEXT:    [[TMP0:%.*]] = icmp sgt i64 [[N]], 1
+; CHECK-NEXT:    [[SMAX:%.*]] = select i1 [[TMP0]], i64 [[N]], i64 1
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 16
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
+; CHECK:       vector.memcheck:
+; CHECK-NEXT:    [[A4:%.*]] = bitcast i32* [[A:%.*]] to i8*
+; CHECK-NEXT:    [[B1:%.*]] = bitcast i32* [[B:%.*]] to i8*
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp sgt i64 [[N]], 1
+; CHECK-NEXT:    [[SMAX2:%.*]] = select i1 [[TMP1]], i64 [[N]], i64 1
+; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i32, i32* [[B]], i64 [[SMAX2]]
+; CHECK-NEXT:    [[UGLYGEP:%.*]] = getelementptr i8, i8* [[A4]], i64 1
+; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ugt i8* [[UGLYGEP]], [[B1]]
+; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ugt i32* [[SCEVGEP]], [[A]]
+; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[SMAX]], 9223372036854775792
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <16 x i32*> undef, i32* [[A]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT6:%.*]] = shufflevector <16 x i32*> [[BROADCAST_SPLATINSERT5]], <16 x i32*> undef, <16 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <16 x i32> undef, i32 [[NTRUNC]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT8:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT7]], <16 x i32> undef, <16 x i32> zeroinitializer
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne <16 x i32*> [[BROADCAST_SPLAT6]], zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32* [[TMP2]] to <16 x i32>*
+; CHECK-NEXT:    store <16 x i32> [[BROADCAST_SPLAT8]], <16 x i32>* [[TMP4]], align 4, !alias.scope !0, !noalias !3
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 16
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !5
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[BROADCAST_SPLAT6]], i32 4, <16 x i1> [[TMP3]], <16 x i32> undef), !alias.scope !3
+; CHECK-NEXT:    [[PREDPHI:%.*]] = select <16 x i1> [[TMP3]], <16 x i32> [[WIDE_MASKED_GATHER]], <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 1>
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[SMAX]], [[N_VEC]]
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <16 x i32> [[PREDPHI]], i32 15
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[LATCH:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[I]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32* [[A]], null
+; CHECK-NEXT:    store i32 [[NTRUNC]], i32* [[TMP1]], align 4
+; CHECK-NEXT:    br i1 [[CMP]], label [[LATCH]], label [[COND_LOAD:%.*]]
+; CHECK:       cond_load:
+; CHECK-NEXT:    [[ALOAD:%.*]] = load i32, i32* [[A]], align 4
+; CHECK-NEXT:    br label [[LATCH]]
+; CHECK:       latch:
+; CHECK-NEXT:    [[A_LCSSA:%.*]] = phi i32 [ [[ALOAD]], [[COND_LOAD]] ], [ 1, [[FOR_BODY]] ]
+; CHECK-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 1
+; CHECK-NEXT:    [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop !7
+; CHECK:       for.end:
+; CHECK-NEXT:    [[A_LCSSA_LCSSA:%.*]] = phi i32 [ [[A_LCSSA]], [[LATCH]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    ret i32 [[A_LCSSA_LCSSA]]
+;
+entry:
+  %ntrunc = trunc i64 %n to i32
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i = phi i64 [ %i.next, %latch ], [ 0, %entry ]
+  %tmp1 = getelementptr inbounds i32, i32* %b, i64 %i
+  %tmp2 = load i32, i32* %tmp1, align 8
+  %cmp = icmp ne i32* %a, null
+  store i32 %ntrunc, i32* %tmp1
+  br i1 %cmp, label %cond_load, label %latch
+
+cond_load:
+  %aload = load i32, i32* %a, align 4
+  br label %latch
+
+latch:
+  %a.lcssa = phi i32 [ %aload, %cond_load ], [ 1, %for.body ]
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  ret i32 %a.lcssa
+}

Added: llvm/trunk/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,237 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -loop-vectorize -S -mattr=avx512f  -instcombine < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; The first test checks that a loop with a reduction and a uniform store
+; gets vectorized.
+; CHECK-LABEL: inv_val_store_to_inv_address_with_reduction
+; CHECK-LABEL: vector.memcheck:
+; CHECK:    found.conflict
+
+; CHECK-LABEL: vector.body:
+; CHECK:         %vec.phi = phi <16 x i32>  [ zeroinitializer, %vector.ph ], [ [[ADD:%[a-zA-Z0-9.]+]], %vector.body ]
+; CHECK:         %wide.load = load <16 x i32>
+; CHECK:         [[ADD]] = add <16 x i32> %vec.phi, %wide.load
+; CHECK:         store i32 %ntrunc, i32* %a
+; CHECK-NOT:     store i32 %ntrunc, i32* %a
+; CHECK:         %index.next = add i64 %index, 64
+
+; CHECK-LABEL: middle.block:
+; CHECK:         %rdx.shuf = shufflevector <16 x i32>
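+
+; Roughly (illustrative C sketch, names are assumptions):
+;
+;   int sum = 0;
+;   for (long i = 0; i < n; i++) {
+;     sum += b[i];
+;     *a = (int)n;    /* uniform store to an invariant address */
+;   }
+;   return sum;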
+define i32 @inv_val_store_to_inv_address_with_reduction(i32* %a, i64 %n, i32* %b) {
+entry:
+  %ntrunc = trunc i64 %n to i32
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+  %tmp0 = phi i32 [ %tmp3, %for.body ], [ 0, %entry ]
+  %tmp1 = getelementptr inbounds i32, i32* %b, i64 %i
+  %tmp2 = load i32, i32* %tmp1, align 8
+  %tmp3 = add i32 %tmp0, %tmp2
+  store i32 %ntrunc, i32* %a
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  %tmp4 = phi i32 [ %tmp3, %for.body ]
+  ret i32 %tmp4
+}
+
+; Conditional store
+; if (b[i] == k) a = ntrunc
+define void @inv_val_store_to_inv_address_conditional(i32* %a, i64 %n, i32* %b, i32 %k) {
+; CHECK-LABEL: @inv_val_store_to_inv_address_conditional(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[NTRUNC:%.*]] = trunc i64 [[N:%.*]] to i32
+; CHECK-NEXT:    [[TMP0:%.*]] = icmp sgt i64 [[N]], 1
+; CHECK-NEXT:    [[SMAX:%.*]] = select i1 [[TMP0]], i64 [[N]], i64 1
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 16
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
+; CHECK:       vector.memcheck:
+; CHECK-NEXT:    [[A4:%.*]] = bitcast i32* [[A:%.*]] to i8*
+; CHECK-NEXT:    [[B1:%.*]] = bitcast i32* [[B:%.*]] to i8*
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp sgt i64 [[N]], 1
+; CHECK-NEXT:    [[SMAX2:%.*]] = select i1 [[TMP1]], i64 [[N]], i64 1
+; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i32, i32* [[B]], i64 [[SMAX2]]
+; CHECK-NEXT:    [[UGLYGEP:%.*]] = getelementptr i8, i8* [[A4]], i64 1
+; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ugt i8* [[UGLYGEP]], [[B1]]
+; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ugt i32* [[SCEVGEP]], [[A]]
+; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[SMAX]], 9223372036854775792
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <16 x i32> undef, i32 [[K:%.*]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT6:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT5]], <16 x i32> undef, <16 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <16 x i32> undef, i32 [[NTRUNC]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT8:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT7]], <16 x i32> undef, <16 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT9:%.*]] = insertelement <16 x i32*> undef, i32* [[A]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT10:%.*]] = shufflevector <16 x i32*> [[BROADCAST_SPLATINSERT9]], <16 x i32*> undef, <16 x i32> zeroinitializer
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <16 x i32>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i32>, <16 x i32>* [[TMP3]], align 8, !alias.scope !8, !noalias !11
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq <16 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT6]]
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32* [[TMP2]] to <16 x i32>*
+; CHECK-NEXT:    store <16 x i32> [[BROADCAST_SPLAT8]], <16 x i32>* [[TMP5]], align 4, !alias.scope !8, !noalias !11
+; CHECK-NEXT:    call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> [[BROADCAST_SPLAT8]], <16 x i32*> [[BROADCAST_SPLAT10]], i32 4, <16 x i1> [[TMP4]]), !alias.scope !11
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 16
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !13
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[SMAX]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[LATCH:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[I]]
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 8
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP2]], [[K]]
+; CHECK-NEXT:    store i32 [[NTRUNC]], i32* [[TMP1]], align 4
+; CHECK-NEXT:    br i1 [[CMP]], label [[COND_STORE:%.*]], label [[LATCH]]
+; CHECK:       cond_store:
+; CHECK-NEXT:    store i32 [[NTRUNC]], i32* [[A]], align 4
+; CHECK-NEXT:    br label [[LATCH]]
+; CHECK:       latch:
+; CHECK-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 1
+; CHECK-NEXT:    [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop !14
+; CHECK:       for.end:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %ntrunc = trunc i64 %n to i32
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i = phi i64 [ %i.next, %latch ], [ 0, %entry ]
+  %tmp1 = getelementptr inbounds i32, i32* %b, i64 %i
+  %tmp2 = load i32, i32* %tmp1, align 8
+  %cmp = icmp eq i32 %tmp2, %k
+  store i32 %ntrunc, i32* %tmp1
+  br i1 %cmp, label %cond_store, label %latch
+
+cond_store:
+  store i32 %ntrunc, i32* %a
+  br label %latch
+
+latch:
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
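+; Same as the previous test, but the value stored to the invariant address is
+; loaded from c[i] rather than being invariant:
+;   if (b[i] == k) *a = c[i];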
+define void @variant_val_store_to_inv_address_conditional(i32* %a, i64 %n, i32* %b, i32* %c, i32 %k) {
+; CHECK-LABEL: @variant_val_store_to_inv_address_conditional(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[NTRUNC:%.*]] = trunc i64 [[N:%.*]] to i32
+; CHECK-NEXT:    [[TMP0:%.*]] = icmp sgt i64 [[N]], 1
+; CHECK-NEXT:    [[SMAX:%.*]] = select i1 [[TMP0]], i64 [[N]], i64 1
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 16
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
+; CHECK:       vector.memcheck:
+; CHECK-NEXT:    [[C5:%.*]] = bitcast i32* [[C:%.*]] to i8*
+; CHECK-NEXT:    [[B1:%.*]] = bitcast i32* [[B:%.*]] to i8*
+; CHECK-NEXT:    [[A4:%.*]] = bitcast i32* [[A:%.*]] to i8*
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp sgt i64 [[N]], 1
+; CHECK-NEXT:    [[SMAX2:%.*]] = select i1 [[TMP1]], i64 [[N]], i64 1
+; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i32, i32* [[B]], i64 [[SMAX2]]
+; CHECK-NEXT:    [[UGLYGEP:%.*]] = getelementptr i8, i8* [[A4]], i64 1
+; CHECK-NEXT:    [[SCEVGEP6:%.*]] = getelementptr i32, i32* [[C]], i64 [[SMAX2]]
+; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ugt i8* [[UGLYGEP]], [[B1]]
+; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ugt i32* [[SCEVGEP]], [[A]]
+; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; CHECK-NEXT:    [[BOUND08:%.*]] = icmp ugt i32* [[SCEVGEP6]], [[B]]
+; CHECK-NEXT:    [[BOUND19:%.*]] = icmp ugt i32* [[SCEVGEP]], [[C]]
+; CHECK-NEXT:    [[FOUND_CONFLICT10:%.*]] = and i1 [[BOUND08]], [[BOUND19]]
+; CHECK-NEXT:    [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT10]]
+; CHECK-NEXT:    [[BOUND012:%.*]] = icmp ugt i32* [[SCEVGEP6]], [[A]]
+; CHECK-NEXT:    [[BOUND113:%.*]] = icmp ugt i8* [[UGLYGEP]], [[C5]]
+; CHECK-NEXT:    [[FOUND_CONFLICT14:%.*]] = and i1 [[BOUND012]], [[BOUND113]]
+; CHECK-NEXT:    [[CONFLICT_RDX15:%.*]] = or i1 [[CONFLICT_RDX]], [[FOUND_CONFLICT14]]
+; CHECK-NEXT:    br i1 [[CONFLICT_RDX15]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[SMAX]], 9223372036854775792
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT16:%.*]] = insertelement <16 x i32> undef, i32 [[K:%.*]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT17:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT16]], <16 x i32> undef, <16 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT18:%.*]] = insertelement <16 x i32> undef, i32 [[NTRUNC]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT19:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT18]], <16 x i32> undef, <16 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT20:%.*]] = insertelement <16 x i32*> undef, i32* [[A]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT21:%.*]] = shufflevector <16 x i32*> [[BROADCAST_SPLATINSERT20]], <16 x i32*> undef, <16 x i32> zeroinitializer
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <16 x i32>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i32>, <16 x i32>* [[TMP3]], align 8, !alias.scope !15, !noalias !18
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq <16 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT17]]
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32* [[TMP2]] to <16 x i32>*
+; CHECK-NEXT:    store <16 x i32> [[BROADCAST_SPLAT19]], <16 x i32>* [[TMP5]], align 4, !alias.scope !15, !noalias !18
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <16 x i32>*
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP7]], i32 8, <16 x i1> [[TMP4]], <16 x i32> undef), !alias.scope !21
+; CHECK-NEXT:    call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> [[WIDE_MASKED_LOAD]], <16 x i32*> [[BROADCAST_SPLAT21]], i32 4, <16 x i1> [[TMP4]]), !alias.scope !22, !noalias !21
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 16
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !23
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[SMAX]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[LATCH:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[I]]
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 8
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP2]], [[K]]
+; CHECK-NEXT:    store i32 [[NTRUNC]], i32* [[TMP1]], align 4
+; CHECK-NEXT:    br i1 [[CMP]], label [[COND_STORE:%.*]], label [[LATCH]]
+; CHECK:       cond_store:
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 [[I]]
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 8
+; CHECK-NEXT:    store i32 [[TMP4]], i32* [[A]], align 4
+; CHECK-NEXT:    br label [[LATCH]]
+; CHECK:       latch:
+; CHECK-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 1
+; CHECK-NEXT:    [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop !24
+; CHECK:       for.end:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %ntrunc = trunc i64 %n to i32
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i = phi i64 [ %i.next, %latch ], [ 0, %entry ]
+  %tmp1 = getelementptr inbounds i32, i32* %b, i64 %i
+  %tmp2 = load i32, i32* %tmp1, align 8
+  %cmp = icmp eq i32 %tmp2, %k
+  store i32 %ntrunc, i32* %tmp1
+  br i1 %cmp, label %cond_store, label %latch
+
+cond_store:
+  %tmp3 = getelementptr inbounds i32, i32* %c, i64 %i
+  %tmp4 = load i32, i32* %tmp3, align 8
+  store i32 %tmp4, i32* %a
+  br label %latch
+
+latch:
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  ret void
+}

Added: llvm/trunk/test/Transforms/LoopVectorize/X86/lit.local.cfg
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/X86/lit.local.cfg?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/X86/lit.local.cfg (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/X86/lit.local.cfg Tue Apr 16 21:52:47 2019
@@ -0,0 +1,3 @@
+if 'X86' not in config.root.targets:
+    config.unsupported = True
+

Added: llvm/trunk/test/Transforms/LoopVectorize/X86/masked_load_store.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/X86/masked_load_store.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/X86/masked_load_store.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/X86/masked_load_store.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,3374 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s  -O3 -mcpu=corei7-avx -S | FileCheck %s -check-prefix=AVX -check-prefix=AVX1
+; RUN: opt < %s  -O3 -mcpu=core-avx2 -S | FileCheck %s -check-prefix=AVX -check-prefix=AVX2
+; RUN: opt < %s  -O3 -mcpu=knl -S | FileCheck %s -check-prefix=AVX512
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-pc_linux"
+
+; The source code:
+;
+;void foo1(int *A, int *B, int *trigger) {
+;
+;  for (int i=0; i<10000; i++) {
+;    if (trigger[i] < 100) {
+;          A[i] = B[i] + trigger[i];
+;    }
+;  }
+;}
+
+; Function Attrs: nounwind uwtable
+define void @foo1(i32* %A, i32* %B, i32* %trigger) {
+; AVX1-LABEL: @foo1(
+; AVX1-NEXT:  entry:
+; AVX1-NEXT:    [[SCEVGEP:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 10000
+; AVX1-NEXT:    [[SCEVGEP11:%.*]] = getelementptr i32, i32* [[TRIGGER:%.*]], i64 10000
+; AVX1-NEXT:    [[SCEVGEP14:%.*]] = getelementptr i32, i32* [[B:%.*]], i64 10000
+; AVX1-NEXT:    [[BOUND0:%.*]] = icmp ugt i32* [[SCEVGEP11]], [[A]]
+; AVX1-NEXT:    [[BOUND1:%.*]] = icmp ugt i32* [[SCEVGEP]], [[TRIGGER]]
+; AVX1-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; AVX1-NEXT:    [[BOUND016:%.*]] = icmp ugt i32* [[SCEVGEP14]], [[A]]
+; AVX1-NEXT:    [[BOUND117:%.*]] = icmp ugt i32* [[SCEVGEP]], [[B]]
+; AVX1-NEXT:    [[FOUND_CONFLICT18:%.*]] = and i1 [[BOUND016]], [[BOUND117]]
+; AVX1-NEXT:    [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT18]]
+; AVX1-NEXT:    br i1 [[CONFLICT_RDX]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY:%.*]]
+; AVX1:       vector.body:
+; AVX1-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT_1:%.*]], [[VECTOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
+; AVX1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDEX]]
+; AVX1-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <8 x i32>*
+; AVX1-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP1]], align 4, !alias.scope !0
+; AVX1-NEXT:    [[TMP2:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX1-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]]
+; AVX1-NEXT:    [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <8 x i32>*
+; AVX1-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP4]], i32 4, <8 x i1> [[TMP2]], <8 x i32> undef), !alias.scope !3
+; AVX1-NEXT:    [[TMP5:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]]
+; AVX1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDEX]]
+; AVX1-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <8 x i32>*
+; AVX1-NEXT:    call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP5]], <8 x i32>* [[TMP7]], i32 4, <8 x i1> [[TMP2]]), !alias.scope !5, !noalias !7
+; AVX1-NEXT:    [[INDEX_NEXT:%.*]] = or i64 [[INDEX]], 8
+; AVX1-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDEX_NEXT]]
+; AVX1-NEXT:    [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <8 x i32>*
+; AVX1-NEXT:    [[WIDE_LOAD_1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP9]], align 4, !alias.scope !0
+; AVX1-NEXT:    [[TMP10:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD_1]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX1-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX_NEXT]]
+; AVX1-NEXT:    [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <8 x i32>*
+; AVX1-NEXT:    [[WIDE_MASKED_LOAD_1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* nonnull [[TMP12]], i32 4, <8 x i1> [[TMP10]], <8 x i32> undef), !alias.scope !3
+; AVX1-NEXT:    [[TMP13:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD_1]], [[WIDE_LOAD_1]]
+; AVX1-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDEX_NEXT]]
+; AVX1-NEXT:    [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <8 x i32>*
+; AVX1-NEXT:    call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP13]], <8 x i32>* [[TMP15]], i32 4, <8 x i1> [[TMP10]]), !alias.scope !5, !noalias !7
+; AVX1-NEXT:    [[INDEX_NEXT_1]] = add nuw nsw i64 [[INDEX]], 16
+; AVX1-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT_1]], 10000
+; AVX1-NEXT:    br i1 [[TMP16]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop !8
+; AVX1:       for.body:
+; AVX1-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_1:%.*]], [[FOR_INC_1:%.*]] ], [ 0, [[ENTRY]] ]
+; AVX1-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]]
+; AVX1-NEXT:    [[TMP17:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; AVX1-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP17]], 100
+; AVX1-NEXT:    br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC:%.*]]
+; AVX1:       if.then:
+; AVX1-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]]
+; AVX1-NEXT:    [[TMP18:%.*]] = load i32, i32* [[ARRAYIDX3]], align 4
+; AVX1-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP18]], [[TMP17]]
+; AVX1-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
+; AVX1-NEXT:    store i32 [[ADD]], i32* [[ARRAYIDX7]], align 4
+; AVX1-NEXT:    br label [[FOR_INC]]
+; AVX1:       for.inc:
+; AVX1-NEXT:    [[INDVARS_IV_NEXT:%.*]] = or i64 [[INDVARS_IV]], 1
+; AVX1-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT]]
+; AVX1-NEXT:    [[TMP19:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4
+; AVX1-NEXT:    [[CMP1_1:%.*]] = icmp slt i32 [[TMP19]], 100
+; AVX1-NEXT:    br i1 [[CMP1_1]], label [[IF_THEN_1:%.*]], label [[FOR_INC_1]]
+; AVX1:       for.end:
+; AVX1-NEXT:    ret void
+; AVX1:       if.then.1:
+; AVX1-NEXT:    [[ARRAYIDX3_1:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV_NEXT]]
+; AVX1-NEXT:    [[TMP20:%.*]] = load i32, i32* [[ARRAYIDX3_1]], align 4
+; AVX1-NEXT:    [[ADD_1:%.*]] = add nsw i32 [[TMP20]], [[TMP19]]
+; AVX1-NEXT:    [[ARRAYIDX7_1:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV_NEXT]]
+; AVX1-NEXT:    store i32 [[ADD_1]], i32* [[ARRAYIDX7_1]], align 4
+; AVX1-NEXT:    br label [[FOR_INC_1]]
+; AVX1:       for.inc.1:
+; AVX1-NEXT:    [[INDVARS_IV_NEXT_1]] = add nuw nsw i64 [[INDVARS_IV]], 2
+; AVX1-NEXT:    [[EXITCOND_1:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_1]], 10000
+; AVX1-NEXT:    br i1 [[EXITCOND_1]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !10
+;
+; AVX2-LABEL: @foo1(
+; AVX2-NEXT:  entry:
+; AVX2-NEXT:    [[SCEVGEP:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 10000
+; AVX2-NEXT:    [[SCEVGEP11:%.*]] = getelementptr i32, i32* [[TRIGGER:%.*]], i64 10000
+; AVX2-NEXT:    [[SCEVGEP14:%.*]] = getelementptr i32, i32* [[B:%.*]], i64 10000
+; AVX2-NEXT:    [[BOUND0:%.*]] = icmp ugt i32* [[SCEVGEP11]], [[A]]
+; AVX2-NEXT:    [[BOUND1:%.*]] = icmp ugt i32* [[SCEVGEP]], [[TRIGGER]]
+; AVX2-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; AVX2-NEXT:    [[BOUND016:%.*]] = icmp ugt i32* [[SCEVGEP14]], [[A]]
+; AVX2-NEXT:    [[BOUND117:%.*]] = icmp ugt i32* [[SCEVGEP]], [[B]]
+; AVX2-NEXT:    [[FOUND_CONFLICT18:%.*]] = and i1 [[BOUND016]], [[BOUND117]]
+; AVX2-NEXT:    [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT18]]
+; AVX2-NEXT:    br i1 [[CONFLICT_RDX]], label [[FOR_BODY_PREHEADER:%.*]], label [[VECTOR_BODY:%.*]]
+; AVX2:       vector.body:
+; AVX2-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT_1:%.*]], [[VECTOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
+; AVX2-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDEX]]
+; AVX2-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <8 x i32>*
+; AVX2-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP1]], align 4, !alias.scope !0
+; AVX2-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 8
+; AVX2-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <8 x i32>*
+; AVX2-NEXT:    [[WIDE_LOAD22:%.*]] = load <8 x i32>, <8 x i32>* [[TMP3]], align 4, !alias.scope !0
+; AVX2-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 16
+; AVX2-NEXT:    [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <8 x i32>*
+; AVX2-NEXT:    [[WIDE_LOAD23:%.*]] = load <8 x i32>, <8 x i32>* [[TMP5]], align 4, !alias.scope !0
+; AVX2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 24
+; AVX2-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <8 x i32>*
+; AVX2-NEXT:    [[WIDE_LOAD24:%.*]] = load <8 x i32>, <8 x i32>* [[TMP7]], align 4, !alias.scope !0
+; AVX2-NEXT:    [[TMP8:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX2-NEXT:    [[TMP9:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD22]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX2-NEXT:    [[TMP10:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD23]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX2-NEXT:    [[TMP11:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD24]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX2-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]]
+; AVX2-NEXT:    [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <8 x i32>*
+; AVX2-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP13]], i32 4, <8 x i1> [[TMP8]], <8 x i32> undef), !alias.scope !3
+; AVX2-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP12]], i64 8
+; AVX2-NEXT:    [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <8 x i32>*
+; AVX2-NEXT:    [[WIDE_MASKED_LOAD25:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* nonnull [[TMP15]], i32 4, <8 x i1> [[TMP9]], <8 x i32> undef), !alias.scope !3
+; AVX2-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP12]], i64 16
+; AVX2-NEXT:    [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <8 x i32>*
+; AVX2-NEXT:    [[WIDE_MASKED_LOAD26:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* nonnull [[TMP17]], i32 4, <8 x i1> [[TMP10]], <8 x i32> undef), !alias.scope !3
+; AVX2-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[TMP12]], i64 24
+; AVX2-NEXT:    [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <8 x i32>*
+; AVX2-NEXT:    [[WIDE_MASKED_LOAD27:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* nonnull [[TMP19]], i32 4, <8 x i1> [[TMP11]], <8 x i32> undef), !alias.scope !3
+; AVX2-NEXT:    [[TMP20:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]]
+; AVX2-NEXT:    [[TMP21:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD25]], [[WIDE_LOAD22]]
+; AVX2-NEXT:    [[TMP22:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD26]], [[WIDE_LOAD23]]
+; AVX2-NEXT:    [[TMP23:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD27]], [[WIDE_LOAD24]]
+; AVX2-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDEX]]
+; AVX2-NEXT:    [[TMP25:%.*]] = bitcast i32* [[TMP24]] to <8 x i32>*
+; AVX2-NEXT:    call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP20]], <8 x i32>* [[TMP25]], i32 4, <8 x i1> [[TMP8]]), !alias.scope !5, !noalias !7
+; AVX2-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[TMP24]], i64 8
+; AVX2-NEXT:    [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <8 x i32>*
+; AVX2-NEXT:    call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP21]], <8 x i32>* [[TMP27]], i32 4, <8 x i1> [[TMP9]]), !alias.scope !5, !noalias !7
+; AVX2-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[TMP24]], i64 16
+; AVX2-NEXT:    [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <8 x i32>*
+; AVX2-NEXT:    call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP22]], <8 x i32>* [[TMP29]], i32 4, <8 x i1> [[TMP10]]), !alias.scope !5, !noalias !7
+; AVX2-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TMP24]], i64 24
+; AVX2-NEXT:    [[TMP31:%.*]] = bitcast i32* [[TMP30]] to <8 x i32>*
+; AVX2-NEXT:    call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP23]], <8 x i32>* [[TMP31]], i32 4, <8 x i1> [[TMP11]]), !alias.scope !5, !noalias !7
+; AVX2-NEXT:    [[INDEX_NEXT:%.*]] = or i64 [[INDEX]], 32
+; AVX2-NEXT:    [[TMP32:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDEX_NEXT]]
+; AVX2-NEXT:    [[TMP33:%.*]] = bitcast i32* [[TMP32]] to <8 x i32>*
+; AVX2-NEXT:    [[WIDE_LOAD_1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP33]], align 4, !alias.scope !0
+; AVX2-NEXT:    [[TMP34:%.*]] = getelementptr inbounds i32, i32* [[TMP32]], i64 8
+; AVX2-NEXT:    [[TMP35:%.*]] = bitcast i32* [[TMP34]] to <8 x i32>*
+; AVX2-NEXT:    [[WIDE_LOAD22_1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP35]], align 4, !alias.scope !0
+; AVX2-NEXT:    [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[TMP32]], i64 16
+; AVX2-NEXT:    [[TMP37:%.*]] = bitcast i32* [[TMP36]] to <8 x i32>*
+; AVX2-NEXT:    [[WIDE_LOAD23_1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP37]], align 4, !alias.scope !0
+; AVX2-NEXT:    [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[TMP32]], i64 24
+; AVX2-NEXT:    [[TMP39:%.*]] = bitcast i32* [[TMP38]] to <8 x i32>*
+; AVX2-NEXT:    [[WIDE_LOAD24_1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP39]], align 4, !alias.scope !0
+; AVX2-NEXT:    [[TMP40:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD_1]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX2-NEXT:    [[TMP41:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD22_1]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX2-NEXT:    [[TMP42:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD23_1]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX2-NEXT:    [[TMP43:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD24_1]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX2-NEXT:    [[TMP44:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX_NEXT]]
+; AVX2-NEXT:    [[TMP45:%.*]] = bitcast i32* [[TMP44]] to <8 x i32>*
+; AVX2-NEXT:    [[WIDE_MASKED_LOAD_1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* nonnull [[TMP45]], i32 4, <8 x i1> [[TMP40]], <8 x i32> undef), !alias.scope !3
+; AVX2-NEXT:    [[TMP46:%.*]] = getelementptr inbounds i32, i32* [[TMP44]], i64 8
+; AVX2-NEXT:    [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <8 x i32>*
+; AVX2-NEXT:    [[WIDE_MASKED_LOAD25_1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* nonnull [[TMP47]], i32 4, <8 x i1> [[TMP41]], <8 x i32> undef), !alias.scope !3
+; AVX2-NEXT:    [[TMP48:%.*]] = getelementptr inbounds i32, i32* [[TMP44]], i64 16
+; AVX2-NEXT:    [[TMP49:%.*]] = bitcast i32* [[TMP48]] to <8 x i32>*
+; AVX2-NEXT:    [[WIDE_MASKED_LOAD26_1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* nonnull [[TMP49]], i32 4, <8 x i1> [[TMP42]], <8 x i32> undef), !alias.scope !3
+; AVX2-NEXT:    [[TMP50:%.*]] = getelementptr inbounds i32, i32* [[TMP44]], i64 24
+; AVX2-NEXT:    [[TMP51:%.*]] = bitcast i32* [[TMP50]] to <8 x i32>*
+; AVX2-NEXT:    [[WIDE_MASKED_LOAD27_1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* nonnull [[TMP51]], i32 4, <8 x i1> [[TMP43]], <8 x i32> undef), !alias.scope !3
+; AVX2-NEXT:    [[TMP52:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD_1]], [[WIDE_LOAD_1]]
+; AVX2-NEXT:    [[TMP53:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD25_1]], [[WIDE_LOAD22_1]]
+; AVX2-NEXT:    [[TMP54:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD26_1]], [[WIDE_LOAD23_1]]
+; AVX2-NEXT:    [[TMP55:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD27_1]], [[WIDE_LOAD24_1]]
+; AVX2-NEXT:    [[TMP56:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDEX_NEXT]]
+; AVX2-NEXT:    [[TMP57:%.*]] = bitcast i32* [[TMP56]] to <8 x i32>*
+; AVX2-NEXT:    call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP52]], <8 x i32>* [[TMP57]], i32 4, <8 x i1> [[TMP40]]), !alias.scope !5, !noalias !7
+; AVX2-NEXT:    [[TMP58:%.*]] = getelementptr inbounds i32, i32* [[TMP56]], i64 8
+; AVX2-NEXT:    [[TMP59:%.*]] = bitcast i32* [[TMP58]] to <8 x i32>*
+; AVX2-NEXT:    call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP53]], <8 x i32>* [[TMP59]], i32 4, <8 x i1> [[TMP41]]), !alias.scope !5, !noalias !7
+; AVX2-NEXT:    [[TMP60:%.*]] = getelementptr inbounds i32, i32* [[TMP56]], i64 16
+; AVX2-NEXT:    [[TMP61:%.*]] = bitcast i32* [[TMP60]] to <8 x i32>*
+; AVX2-NEXT:    call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP54]], <8 x i32>* [[TMP61]], i32 4, <8 x i1> [[TMP42]]), !alias.scope !5, !noalias !7
+; AVX2-NEXT:    [[TMP62:%.*]] = getelementptr inbounds i32, i32* [[TMP56]], i64 24
+; AVX2-NEXT:    [[TMP63:%.*]] = bitcast i32* [[TMP62]] to <8 x i32>*
+; AVX2-NEXT:    call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP55]], <8 x i32>* [[TMP63]], i32 4, <8 x i1> [[TMP43]]), !alias.scope !5, !noalias !7
+; AVX2-NEXT:    [[INDEX_NEXT_1]] = add nuw nsw i64 [[INDEX]], 64
+; AVX2-NEXT:    [[TMP64:%.*]] = icmp eq i64 [[INDEX_NEXT_1]], 9984
+; AVX2-NEXT:    br i1 [[TMP64]], label [[FOR_BODY_PREHEADER]], label [[VECTOR_BODY]], !llvm.loop !8
+; AVX2:       for.body.preheader:
+; AVX2-NEXT:    [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ 9984, [[VECTOR_BODY]] ]
+; AVX2-NEXT:    br label [[FOR_BODY:%.*]]
+; AVX2:       for.body:
+; AVX2-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT_3:%.*]], [[FOR_INC_3:%.*]] ]
+; AVX2-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]]
+; AVX2-NEXT:    [[TMP65:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; AVX2-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP65]], 100
+; AVX2-NEXT:    br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC:%.*]]
+; AVX2:       if.then:
+; AVX2-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]]
+; AVX2-NEXT:    [[TMP66:%.*]] = load i32, i32* [[ARRAYIDX3]], align 4
+; AVX2-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP66]], [[TMP65]]
+; AVX2-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
+; AVX2-NEXT:    store i32 [[ADD]], i32* [[ARRAYIDX7]], align 4
+; AVX2-NEXT:    br label [[FOR_INC]]
+; AVX2:       for.inc:
+; AVX2-NEXT:    [[INDVARS_IV_NEXT:%.*]] = or i64 [[INDVARS_IV]], 1
+; AVX2-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT]]
+; AVX2-NEXT:    [[TMP67:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4
+; AVX2-NEXT:    [[CMP1_1:%.*]] = icmp slt i32 [[TMP67]], 100
+; AVX2-NEXT:    br i1 [[CMP1_1]], label [[IF_THEN_1:%.*]], label [[FOR_INC_1:%.*]]
+; AVX2:       for.end:
+; AVX2-NEXT:    ret void
+; AVX2:       if.then.1:
+; AVX2-NEXT:    [[ARRAYIDX3_1:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV_NEXT]]
+; AVX2-NEXT:    [[TMP68:%.*]] = load i32, i32* [[ARRAYIDX3_1]], align 4
+; AVX2-NEXT:    [[ADD_1:%.*]] = add nsw i32 [[TMP68]], [[TMP67]]
+; AVX2-NEXT:    [[ARRAYIDX7_1:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV_NEXT]]
+; AVX2-NEXT:    store i32 [[ADD_1]], i32* [[ARRAYIDX7_1]], align 4
+; AVX2-NEXT:    br label [[FOR_INC_1]]
+; AVX2:       for.inc.1:
+; AVX2-NEXT:    [[INDVARS_IV_NEXT_1:%.*]] = or i64 [[INDVARS_IV]], 2
+; AVX2-NEXT:    [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_1]]
+; AVX2-NEXT:    [[TMP69:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4
+; AVX2-NEXT:    [[CMP1_2:%.*]] = icmp slt i32 [[TMP69]], 100
+; AVX2-NEXT:    br i1 [[CMP1_2]], label [[IF_THEN_2:%.*]], label [[FOR_INC_2:%.*]]
+; AVX2:       if.then.2:
+; AVX2-NEXT:    [[ARRAYIDX3_2:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV_NEXT_1]]
+; AVX2-NEXT:    [[TMP70:%.*]] = load i32, i32* [[ARRAYIDX3_2]], align 4
+; AVX2-NEXT:    [[ADD_2:%.*]] = add nsw i32 [[TMP70]], [[TMP69]]
+; AVX2-NEXT:    [[ARRAYIDX7_2:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV_NEXT_1]]
+; AVX2-NEXT:    store i32 [[ADD_2]], i32* [[ARRAYIDX7_2]], align 4
+; AVX2-NEXT:    br label [[FOR_INC_2]]
+; AVX2:       for.inc.2:
+; AVX2-NEXT:    [[INDVARS_IV_NEXT_2:%.*]] = or i64 [[INDVARS_IV]], 3
+; AVX2-NEXT:    [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_2]]
+; AVX2-NEXT:    [[TMP71:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4
+; AVX2-NEXT:    [[CMP1_3:%.*]] = icmp slt i32 [[TMP71]], 100
+; AVX2-NEXT:    br i1 [[CMP1_3]], label [[IF_THEN_3:%.*]], label [[FOR_INC_3]]
+; AVX2:       if.then.3:
+; AVX2-NEXT:    [[ARRAYIDX3_3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV_NEXT_2]]
+; AVX2-NEXT:    [[TMP72:%.*]] = load i32, i32* [[ARRAYIDX3_3]], align 4
+; AVX2-NEXT:    [[ADD_3:%.*]] = add nsw i32 [[TMP72]], [[TMP71]]
+; AVX2-NEXT:    [[ARRAYIDX7_3:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV_NEXT_2]]
+; AVX2-NEXT:    store i32 [[ADD_3]], i32* [[ARRAYIDX7_3]], align 4
+; AVX2-NEXT:    br label [[FOR_INC_3]]
+; AVX2:       for.inc.3:
+; AVX2-NEXT:    [[INDVARS_IV_NEXT_3]] = add nuw nsw i64 [[INDVARS_IV]], 4
+; AVX2-NEXT:    [[EXITCOND_3:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_3]], 10000
+; AVX2-NEXT:    br i1 [[EXITCOND_3]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop !10
+;
+; AVX512-LABEL: @foo1(
+; AVX512-NEXT:  entry:
+; AVX512-NEXT:    [[SCEVGEP:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 10000
+; AVX512-NEXT:    [[SCEVGEP11:%.*]] = getelementptr i32, i32* [[TRIGGER:%.*]], i64 10000
+; AVX512-NEXT:    [[SCEVGEP14:%.*]] = getelementptr i32, i32* [[B:%.*]], i64 10000
+; AVX512-NEXT:    [[BOUND0:%.*]] = icmp ugt i32* [[SCEVGEP11]], [[A]]
+; AVX512-NEXT:    [[BOUND1:%.*]] = icmp ugt i32* [[SCEVGEP]], [[TRIGGER]]
+; AVX512-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; AVX512-NEXT:    [[BOUND016:%.*]] = icmp ugt i32* [[SCEVGEP14]], [[A]]
+; AVX512-NEXT:    [[BOUND117:%.*]] = icmp ugt i32* [[SCEVGEP]], [[B]]
+; AVX512-NEXT:    [[FOUND_CONFLICT18:%.*]] = and i1 [[BOUND016]], [[BOUND117]]
+; AVX512-NEXT:    [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT18]]
+; AVX512-NEXT:    br i1 [[CONFLICT_RDX]], label [[FOR_BODY_PREHEADER:%.*]], label [[VECTOR_BODY:%.*]]
+; AVX512:       vector.body:
+; AVX512-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT_1:%.*]], [[VECTOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
+; AVX512-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDEX]]
+; AVX512-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <16 x i32>*
+; AVX512-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i32>, <16 x i32>* [[TMP1]], align 4, !alias.scope !0
+; AVX512-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 16
+; AVX512-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <16 x i32>*
+; AVX512-NEXT:    [[WIDE_LOAD22:%.*]] = load <16 x i32>, <16 x i32>* [[TMP3]], align 4, !alias.scope !0
+; AVX512-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 32
+; AVX512-NEXT:    [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <16 x i32>*
+; AVX512-NEXT:    [[WIDE_LOAD23:%.*]] = load <16 x i32>, <16 x i32>* [[TMP5]], align 4, !alias.scope !0
+; AVX512-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 48
+; AVX512-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <16 x i32>*
+; AVX512-NEXT:    [[WIDE_LOAD24:%.*]] = load <16 x i32>, <16 x i32>* [[TMP7]], align 4, !alias.scope !0
+; AVX512-NEXT:    [[TMP8:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX512-NEXT:    [[TMP9:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD22]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX512-NEXT:    [[TMP10:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD23]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX512-NEXT:    [[TMP11:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD24]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX512-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]]
+; AVX512-NEXT:    [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <16 x i32>*
+; AVX512-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP13]], i32 4, <16 x i1> [[TMP8]], <16 x i32> undef), !alias.scope !3
+; AVX512-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP12]], i64 16
+; AVX512-NEXT:    [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <16 x i32>*
+; AVX512-NEXT:    [[WIDE_MASKED_LOAD25:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* nonnull [[TMP15]], i32 4, <16 x i1> [[TMP9]], <16 x i32> undef), !alias.scope !3
+; AVX512-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP12]], i64 32
+; AVX512-NEXT:    [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <16 x i32>*
+; AVX512-NEXT:    [[WIDE_MASKED_LOAD26:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* nonnull [[TMP17]], i32 4, <16 x i1> [[TMP10]], <16 x i32> undef), !alias.scope !3
+; AVX512-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[TMP12]], i64 48
+; AVX512-NEXT:    [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <16 x i32>*
+; AVX512-NEXT:    [[WIDE_MASKED_LOAD27:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* nonnull [[TMP19]], i32 4, <16 x i1> [[TMP11]], <16 x i32> undef), !alias.scope !3
+; AVX512-NEXT:    [[TMP20:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]]
+; AVX512-NEXT:    [[TMP21:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD25]], [[WIDE_LOAD22]]
+; AVX512-NEXT:    [[TMP22:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD26]], [[WIDE_LOAD23]]
+; AVX512-NEXT:    [[TMP23:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD27]], [[WIDE_LOAD24]]
+; AVX512-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDEX]]
+; AVX512-NEXT:    [[TMP25:%.*]] = bitcast i32* [[TMP24]] to <16 x i32>*
+; AVX512-NEXT:    call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> [[TMP20]], <16 x i32>* [[TMP25]], i32 4, <16 x i1> [[TMP8]]), !alias.scope !5, !noalias !7
+; AVX512-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[TMP24]], i64 16
+; AVX512-NEXT:    [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <16 x i32>*
+; AVX512-NEXT:    call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> [[TMP21]], <16 x i32>* [[TMP27]], i32 4, <16 x i1> [[TMP9]]), !alias.scope !5, !noalias !7
+; AVX512-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[TMP24]], i64 32
+; AVX512-NEXT:    [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <16 x i32>*
+; AVX512-NEXT:    call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> [[TMP22]], <16 x i32>* [[TMP29]], i32 4, <16 x i1> [[TMP10]]), !alias.scope !5, !noalias !7
+; AVX512-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TMP24]], i64 48
+; AVX512-NEXT:    [[TMP31:%.*]] = bitcast i32* [[TMP30]] to <16 x i32>*
+; AVX512-NEXT:    call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> [[TMP23]], <16 x i32>* [[TMP31]], i32 4, <16 x i1> [[TMP11]]), !alias.scope !5, !noalias !7
+; AVX512-NEXT:    [[INDEX_NEXT:%.*]] = or i64 [[INDEX]], 64
+; AVX512-NEXT:    [[TMP32:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDEX_NEXT]]
+; AVX512-NEXT:    [[TMP33:%.*]] = bitcast i32* [[TMP32]] to <16 x i32>*
+; AVX512-NEXT:    [[WIDE_LOAD_1:%.*]] = load <16 x i32>, <16 x i32>* [[TMP33]], align 4, !alias.scope !0
+; AVX512-NEXT:    [[TMP34:%.*]] = getelementptr inbounds i32, i32* [[TMP32]], i64 16
+; AVX512-NEXT:    [[TMP35:%.*]] = bitcast i32* [[TMP34]] to <16 x i32>*
+; AVX512-NEXT:    [[WIDE_LOAD22_1:%.*]] = load <16 x i32>, <16 x i32>* [[TMP35]], align 4, !alias.scope !0
+; AVX512-NEXT:    [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[TMP32]], i64 32
+; AVX512-NEXT:    [[TMP37:%.*]] = bitcast i32* [[TMP36]] to <16 x i32>*
+; AVX512-NEXT:    [[WIDE_LOAD23_1:%.*]] = load <16 x i32>, <16 x i32>* [[TMP37]], align 4, !alias.scope !0
+; AVX512-NEXT:    [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[TMP32]], i64 48
+; AVX512-NEXT:    [[TMP39:%.*]] = bitcast i32* [[TMP38]] to <16 x i32>*
+; AVX512-NEXT:    [[WIDE_LOAD24_1:%.*]] = load <16 x i32>, <16 x i32>* [[TMP39]], align 4, !alias.scope !0
+; AVX512-NEXT:    [[TMP40:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD_1]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX512-NEXT:    [[TMP41:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD22_1]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX512-NEXT:    [[TMP42:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD23_1]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX512-NEXT:    [[TMP43:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD24_1]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX512-NEXT:    [[TMP44:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX_NEXT]]
+; AVX512-NEXT:    [[TMP45:%.*]] = bitcast i32* [[TMP44]] to <16 x i32>*
+; AVX512-NEXT:    [[WIDE_MASKED_LOAD_1:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* nonnull [[TMP45]], i32 4, <16 x i1> [[TMP40]], <16 x i32> undef), !alias.scope !3
+; AVX512-NEXT:    [[TMP46:%.*]] = getelementptr inbounds i32, i32* [[TMP44]], i64 16
+; AVX512-NEXT:    [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <16 x i32>*
+; AVX512-NEXT:    [[WIDE_MASKED_LOAD25_1:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* nonnull [[TMP47]], i32 4, <16 x i1> [[TMP41]], <16 x i32> undef), !alias.scope !3
+; AVX512-NEXT:    [[TMP48:%.*]] = getelementptr inbounds i32, i32* [[TMP44]], i64 32
+; AVX512-NEXT:    [[TMP49:%.*]] = bitcast i32* [[TMP48]] to <16 x i32>*
+; AVX512-NEXT:    [[WIDE_MASKED_LOAD26_1:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* nonnull [[TMP49]], i32 4, <16 x i1> [[TMP42]], <16 x i32> undef), !alias.scope !3
+; AVX512-NEXT:    [[TMP50:%.*]] = getelementptr inbounds i32, i32* [[TMP44]], i64 48
+; AVX512-NEXT:    [[TMP51:%.*]] = bitcast i32* [[TMP50]] to <16 x i32>*
+; AVX512-NEXT:    [[WIDE_MASKED_LOAD27_1:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* nonnull [[TMP51]], i32 4, <16 x i1> [[TMP43]], <16 x i32> undef), !alias.scope !3
+; AVX512-NEXT:    [[TMP52:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD_1]], [[WIDE_LOAD_1]]
+; AVX512-NEXT:    [[TMP53:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD25_1]], [[WIDE_LOAD22_1]]
+; AVX512-NEXT:    [[TMP54:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD26_1]], [[WIDE_LOAD23_1]]
+; AVX512-NEXT:    [[TMP55:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD27_1]], [[WIDE_LOAD24_1]]
+; AVX512-NEXT:    [[TMP56:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDEX_NEXT]]
+; AVX512-NEXT:    [[TMP57:%.*]] = bitcast i32* [[TMP56]] to <16 x i32>*
+; AVX512-NEXT:    call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> [[TMP52]], <16 x i32>* [[TMP57]], i32 4, <16 x i1> [[TMP40]]), !alias.scope !5, !noalias !7
+; AVX512-NEXT:    [[TMP58:%.*]] = getelementptr inbounds i32, i32* [[TMP56]], i64 16
+; AVX512-NEXT:    [[TMP59:%.*]] = bitcast i32* [[TMP58]] to <16 x i32>*
+; AVX512-NEXT:    call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> [[TMP53]], <16 x i32>* [[TMP59]], i32 4, <16 x i1> [[TMP41]]), !alias.scope !5, !noalias !7
+; AVX512-NEXT:    [[TMP60:%.*]] = getelementptr inbounds i32, i32* [[TMP56]], i64 32
+; AVX512-NEXT:    [[TMP61:%.*]] = bitcast i32* [[TMP60]] to <16 x i32>*
+; AVX512-NEXT:    call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> [[TMP54]], <16 x i32>* [[TMP61]], i32 4, <16 x i1> [[TMP42]]), !alias.scope !5, !noalias !7
+; AVX512-NEXT:    [[TMP62:%.*]] = getelementptr inbounds i32, i32* [[TMP56]], i64 48
+; AVX512-NEXT:    [[TMP63:%.*]] = bitcast i32* [[TMP62]] to <16 x i32>*
+; AVX512-NEXT:    call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> [[TMP55]], <16 x i32>* [[TMP63]], i32 4, <16 x i1> [[TMP43]]), !alias.scope !5, !noalias !7
+; AVX512-NEXT:    [[INDEX_NEXT_1]] = add nuw nsw i64 [[INDEX]], 128
+; AVX512-NEXT:    [[TMP64:%.*]] = icmp eq i64 [[INDEX_NEXT_1]], 9984
+; AVX512-NEXT:    br i1 [[TMP64]], label [[FOR_BODY_PREHEADER]], label [[VECTOR_BODY]], !llvm.loop !8
+; AVX512:       for.body.preheader:
+; AVX512-NEXT:    [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ 9984, [[VECTOR_BODY]] ]
+; AVX512-NEXT:    br label [[FOR_BODY:%.*]]
+; AVX512:       for.body:
+; AVX512-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT_3:%.*]], [[FOR_INC_3:%.*]] ]
+; AVX512-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]]
+; AVX512-NEXT:    [[TMP65:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; AVX512-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP65]], 100
+; AVX512-NEXT:    br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC:%.*]]
+; AVX512:       if.then:
+; AVX512-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]]
+; AVX512-NEXT:    [[TMP66:%.*]] = load i32, i32* [[ARRAYIDX3]], align 4
+; AVX512-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP66]], [[TMP65]]
+; AVX512-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
+; AVX512-NEXT:    store i32 [[ADD]], i32* [[ARRAYIDX7]], align 4
+; AVX512-NEXT:    br label [[FOR_INC]]
+; AVX512:       for.inc:
+; AVX512-NEXT:    [[INDVARS_IV_NEXT:%.*]] = or i64 [[INDVARS_IV]], 1
+; AVX512-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT]]
+; AVX512-NEXT:    [[TMP67:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4
+; AVX512-NEXT:    [[CMP1_1:%.*]] = icmp slt i32 [[TMP67]], 100
+; AVX512-NEXT:    br i1 [[CMP1_1]], label [[IF_THEN_1:%.*]], label [[FOR_INC_1:%.*]]
+; AVX512:       for.end:
+; AVX512-NEXT:    ret void
+; AVX512:       if.then.1:
+; AVX512-NEXT:    [[ARRAYIDX3_1:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV_NEXT]]
+; AVX512-NEXT:    [[TMP68:%.*]] = load i32, i32* [[ARRAYIDX3_1]], align 4
+; AVX512-NEXT:    [[ADD_1:%.*]] = add nsw i32 [[TMP68]], [[TMP67]]
+; AVX512-NEXT:    [[ARRAYIDX7_1:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV_NEXT]]
+; AVX512-NEXT:    store i32 [[ADD_1]], i32* [[ARRAYIDX7_1]], align 4
+; AVX512-NEXT:    br label [[FOR_INC_1]]
+; AVX512:       for.inc.1:
+; AVX512-NEXT:    [[INDVARS_IV_NEXT_1:%.*]] = or i64 [[INDVARS_IV]], 2
+; AVX512-NEXT:    [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_1]]
+; AVX512-NEXT:    [[TMP69:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4
+; AVX512-NEXT:    [[CMP1_2:%.*]] = icmp slt i32 [[TMP69]], 100
+; AVX512-NEXT:    br i1 [[CMP1_2]], label [[IF_THEN_2:%.*]], label [[FOR_INC_2:%.*]]
+; AVX512:       if.then.2:
+; AVX512-NEXT:    [[ARRAYIDX3_2:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV_NEXT_1]]
+; AVX512-NEXT:    [[TMP70:%.*]] = load i32, i32* [[ARRAYIDX3_2]], align 4
+; AVX512-NEXT:    [[ADD_2:%.*]] = add nsw i32 [[TMP70]], [[TMP69]]
+; AVX512-NEXT:    [[ARRAYIDX7_2:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV_NEXT_1]]
+; AVX512-NEXT:    store i32 [[ADD_2]], i32* [[ARRAYIDX7_2]], align 4
+; AVX512-NEXT:    br label [[FOR_INC_2]]
+; AVX512:       for.inc.2:
+; AVX512-NEXT:    [[INDVARS_IV_NEXT_2:%.*]] = or i64 [[INDVARS_IV]], 3
+; AVX512-NEXT:    [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_2]]
+; AVX512-NEXT:    [[TMP71:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4
+; AVX512-NEXT:    [[CMP1_3:%.*]] = icmp slt i32 [[TMP71]], 100
+; AVX512-NEXT:    br i1 [[CMP1_3]], label [[IF_THEN_3:%.*]], label [[FOR_INC_3]]
+; AVX512:       if.then.3:
+; AVX512-NEXT:    [[ARRAYIDX3_3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV_NEXT_2]]
+; AVX512-NEXT:    [[TMP72:%.*]] = load i32, i32* [[ARRAYIDX3_3]], align 4
+; AVX512-NEXT:    [[ADD_3:%.*]] = add nsw i32 [[TMP72]], [[TMP71]]
+; AVX512-NEXT:    [[ARRAYIDX7_3:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV_NEXT_2]]
+; AVX512-NEXT:    store i32 [[ADD_3]], i32* [[ARRAYIDX7_3]], align 4
+; AVX512-NEXT:    br label [[FOR_INC_3]]
+; AVX512:       for.inc.3:
+; AVX512-NEXT:    [[INDVARS_IV_NEXT_3]] = add nuw nsw i64 [[INDVARS_IV]], 4
+; AVX512-NEXT:    [[EXITCOND_3:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_3]], 10000
+; AVX512-NEXT:    br i1 [[EXITCOND_3]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop !10
+;
+entry:
+  %A.addr = alloca i32*, align 8
+  %B.addr = alloca i32*, align 8
+  %trigger.addr = alloca i32*, align 8
+  %i = alloca i32, align 4
+  store i32* %A, i32** %A.addr, align 8
+  store i32* %B, i32** %B.addr, align 8
+  store i32* %trigger, i32** %trigger.addr, align 8
+  store i32 0, i32* %i, align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %0 = load i32, i32* %i, align 4
+  %cmp = icmp slt i32 %0, 10000
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %1 = load i32, i32* %i, align 4
+  %idxprom = sext i32 %1 to i64
+  %2 = load i32*, i32** %trigger.addr, align 8
+  %arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom
+  %3 = load i32, i32* %arrayidx, align 4
+  %cmp1 = icmp slt i32 %3, 100
+  br i1 %cmp1, label %if.then, label %if.end
+
+if.then:                                          ; preds = %for.body
+  %4 = load i32, i32* %i, align 4
+  %idxprom2 = sext i32 %4 to i64
+  %5 = load i32*, i32** %B.addr, align 8
+  %arrayidx3 = getelementptr inbounds i32, i32* %5, i64 %idxprom2
+  %6 = load i32, i32* %arrayidx3, align 4
+  %7 = load i32, i32* %i, align 4
+  %idxprom4 = sext i32 %7 to i64
+  %8 = load i32*, i32** %trigger.addr, align 8
+  %arrayidx5 = getelementptr inbounds i32, i32* %8, i64 %idxprom4
+  %9 = load i32, i32* %arrayidx5, align 4
+  %add = add nsw i32 %6, %9
+  %10 = load i32, i32* %i, align 4
+  %idxprom6 = sext i32 %10 to i64
+  %11 = load i32*, i32** %A.addr, align 8
+  %arrayidx7 = getelementptr inbounds i32, i32* %11, i64 %idxprom6
+  store i32 %add, i32* %arrayidx7, align 4
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %for.body
+  br label %for.inc
+
+for.inc:                                          ; preds = %if.end
+  %12 = load i32, i32* %i, align 4
+  %inc = add nsw i32 %12, 1
+  store i32 %inc, i32* %i, align 4
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}
+
+; The same as @foo1, but all the pointers are address space 1 pointers.
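+; Note: the checks below are expected to differ from @foo1 mainly in the
+; address-space mangling of the masked intrinsics, e.g.
+; @llvm.masked.load.v8i32.p1v8i32 / @llvm.masked.store.v8i32.p1v8i32
+; instead of the p0 variants used for @foo1.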
+
+; Function Attrs: nounwind uwtable
+define void @foo1_addrspace1(i32 addrspace(1)* %A, i32 addrspace(1)* %B, i32 addrspace(1)* %trigger) {
+; AVX1-LABEL: @foo1_addrspace1(
+; AVX1-NEXT:  entry:
+; AVX1-NEXT:    [[SCEVGEP:%.*]] = getelementptr i32, i32 addrspace(1)* [[A:%.*]], i64 10000
+; AVX1-NEXT:    [[SCEVGEP11:%.*]] = getelementptr i32, i32 addrspace(1)* [[TRIGGER:%.*]], i64 10000
+; AVX1-NEXT:    [[SCEVGEP14:%.*]] = getelementptr i32, i32 addrspace(1)* [[B:%.*]], i64 10000
+; AVX1-NEXT:    [[BOUND0:%.*]] = icmp ugt i32 addrspace(1)* [[SCEVGEP11]], [[A]]
+; AVX1-NEXT:    [[BOUND1:%.*]] = icmp ugt i32 addrspace(1)* [[SCEVGEP]], [[TRIGGER]]
+; AVX1-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; AVX1-NEXT:    [[BOUND016:%.*]] = icmp ugt i32 addrspace(1)* [[SCEVGEP14]], [[A]]
+; AVX1-NEXT:    [[BOUND117:%.*]] = icmp ugt i32 addrspace(1)* [[SCEVGEP]], [[B]]
+; AVX1-NEXT:    [[FOUND_CONFLICT18:%.*]] = and i1 [[BOUND016]], [[BOUND117]]
+; AVX1-NEXT:    [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT18]]
+; AVX1-NEXT:    br i1 [[CONFLICT_RDX]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY:%.*]]
+; AVX1:       vector.body:
+; AVX1-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT_1:%.*]], [[VECTOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
+; AVX1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[INDEX]]
+; AVX1-NEXT:    [[TMP1:%.*]] = bitcast i32 addrspace(1)* [[TMP0]] to <8 x i32> addrspace(1)*
+; AVX1-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP1]], align 4, !alias.scope !11
+; AVX1-NEXT:    [[TMP2:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX1-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[INDEX]]
+; AVX1-NEXT:    [[TMP4:%.*]] = bitcast i32 addrspace(1)* [[TMP3]] to <8 x i32> addrspace(1)*
+; AVX1-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP4]], i32 4, <8 x i1> [[TMP2]], <8 x i32> undef), !alias.scope !14
+; AVX1-NEXT:    [[TMP5:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]]
+; AVX1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[INDEX]]
+; AVX1-NEXT:    [[TMP7:%.*]] = bitcast i32 addrspace(1)* [[TMP6]] to <8 x i32> addrspace(1)*
+; AVX1-NEXT:    call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP5]], <8 x i32> addrspace(1)* [[TMP7]], i32 4, <8 x i1> [[TMP2]]), !alias.scope !16, !noalias !18
+; AVX1-NEXT:    [[INDEX_NEXT:%.*]] = or i64 [[INDEX]], 8
+; AVX1-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[INDEX_NEXT]]
+; AVX1-NEXT:    [[TMP9:%.*]] = bitcast i32 addrspace(1)* [[TMP8]] to <8 x i32> addrspace(1)*
+; AVX1-NEXT:    [[WIDE_LOAD_1:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP9]], align 4, !alias.scope !11
+; AVX1-NEXT:    [[TMP10:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD_1]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX1-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[INDEX_NEXT]]
+; AVX1-NEXT:    [[TMP12:%.*]] = bitcast i32 addrspace(1)* [[TMP11]] to <8 x i32> addrspace(1)*
+; AVX1-NEXT:    [[WIDE_MASKED_LOAD_1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP12]], i32 4, <8 x i1> [[TMP10]], <8 x i32> undef), !alias.scope !14
+; AVX1-NEXT:    [[TMP13:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD_1]], [[WIDE_LOAD_1]]
+; AVX1-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[INDEX_NEXT]]
+; AVX1-NEXT:    [[TMP15:%.*]] = bitcast i32 addrspace(1)* [[TMP14]] to <8 x i32> addrspace(1)*
+; AVX1-NEXT:    call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP13]], <8 x i32> addrspace(1)* [[TMP15]], i32 4, <8 x i1> [[TMP10]]), !alias.scope !16, !noalias !18
+; AVX1-NEXT:    [[INDEX_NEXT_1]] = add nuw nsw i64 [[INDEX]], 16
+; AVX1-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT_1]], 10000
+; AVX1-NEXT:    br i1 [[TMP16]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop !19
+; AVX1:       for.body:
+; AVX1-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_1:%.*]], [[FOR_INC_1:%.*]] ], [ 0, [[ENTRY]] ]
+; AVX1-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[INDVARS_IV]]
+; AVX1-NEXT:    [[TMP17:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX]], align 4
+; AVX1-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP17]], 100
+; AVX1-NEXT:    br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC:%.*]]
+; AVX1:       if.then:
+; AVX1-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[INDVARS_IV]]
+; AVX1-NEXT:    [[TMP18:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX3]], align 4
+; AVX1-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP18]], [[TMP17]]
+; AVX1-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[INDVARS_IV]]
+; AVX1-NEXT:    store i32 [[ADD]], i32 addrspace(1)* [[ARRAYIDX7]], align 4
+; AVX1-NEXT:    br label [[FOR_INC]]
+; AVX1:       for.inc:
+; AVX1-NEXT:    [[INDVARS_IV_NEXT:%.*]] = or i64 [[INDVARS_IV]], 1
+; AVX1-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[INDVARS_IV_NEXT]]
+; AVX1-NEXT:    [[TMP19:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX_1]], align 4
+; AVX1-NEXT:    [[CMP1_1:%.*]] = icmp slt i32 [[TMP19]], 100
+; AVX1-NEXT:    br i1 [[CMP1_1]], label [[IF_THEN_1:%.*]], label [[FOR_INC_1]]
+; AVX1:       for.end:
+; AVX1-NEXT:    ret void
+; AVX1:       if.then.1:
+; AVX1-NEXT:    [[ARRAYIDX3_1:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[INDVARS_IV_NEXT]]
+; AVX1-NEXT:    [[TMP20:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX3_1]], align 4
+; AVX1-NEXT:    [[ADD_1:%.*]] = add nsw i32 [[TMP20]], [[TMP19]]
+; AVX1-NEXT:    [[ARRAYIDX7_1:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[INDVARS_IV_NEXT]]
+; AVX1-NEXT:    store i32 [[ADD_1]], i32 addrspace(1)* [[ARRAYIDX7_1]], align 4
+; AVX1-NEXT:    br label [[FOR_INC_1]]
+; AVX1:       for.inc.1:
+; AVX1-NEXT:    [[INDVARS_IV_NEXT_1]] = add nuw nsw i64 [[INDVARS_IV]], 2
+; AVX1-NEXT:    [[EXITCOND_1:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_1]], 10000
+; AVX1-NEXT:    br i1 [[EXITCOND_1]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !20
+;
+; AVX2-LABEL: @foo1_addrspace1(
+; AVX2-NEXT:  entry:
+; AVX2-NEXT:    [[SCEVGEP:%.*]] = getelementptr i32, i32 addrspace(1)* [[A:%.*]], i64 10000
+; AVX2-NEXT:    [[SCEVGEP11:%.*]] = getelementptr i32, i32 addrspace(1)* [[TRIGGER:%.*]], i64 10000
+; AVX2-NEXT:    [[SCEVGEP14:%.*]] = getelementptr i32, i32 addrspace(1)* [[B:%.*]], i64 10000
+; AVX2-NEXT:    [[BOUND0:%.*]] = icmp ugt i32 addrspace(1)* [[SCEVGEP11]], [[A]]
+; AVX2-NEXT:    [[BOUND1:%.*]] = icmp ugt i32 addrspace(1)* [[SCEVGEP]], [[TRIGGER]]
+; AVX2-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; AVX2-NEXT:    [[BOUND016:%.*]] = icmp ugt i32 addrspace(1)* [[SCEVGEP14]], [[A]]
+; AVX2-NEXT:    [[BOUND117:%.*]] = icmp ugt i32 addrspace(1)* [[SCEVGEP]], [[B]]
+; AVX2-NEXT:    [[FOUND_CONFLICT18:%.*]] = and i1 [[BOUND016]], [[BOUND117]]
+; AVX2-NEXT:    [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT18]]
+; AVX2-NEXT:    br i1 [[CONFLICT_RDX]], label [[FOR_BODY_PREHEADER:%.*]], label [[VECTOR_BODY:%.*]]
+; AVX2:       vector.body:
+; AVX2-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT_1:%.*]], [[VECTOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
+; AVX2-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[INDEX]]
+; AVX2-NEXT:    [[TMP1:%.*]] = bitcast i32 addrspace(1)* [[TMP0]] to <8 x i32> addrspace(1)*
+; AVX2-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP1]], align 4, !alias.scope !11
+; AVX2-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP0]], i64 8
+; AVX2-NEXT:    [[TMP3:%.*]] = bitcast i32 addrspace(1)* [[TMP2]] to <8 x i32> addrspace(1)*
+; AVX2-NEXT:    [[WIDE_LOAD22:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP3]], align 4, !alias.scope !11
+; AVX2-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP0]], i64 16
+; AVX2-NEXT:    [[TMP5:%.*]] = bitcast i32 addrspace(1)* [[TMP4]] to <8 x i32> addrspace(1)*
+; AVX2-NEXT:    [[WIDE_LOAD23:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP5]], align 4, !alias.scope !11
+; AVX2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP0]], i64 24
+; AVX2-NEXT:    [[TMP7:%.*]] = bitcast i32 addrspace(1)* [[TMP6]] to <8 x i32> addrspace(1)*
+; AVX2-NEXT:    [[WIDE_LOAD24:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP7]], align 4, !alias.scope !11
+; AVX2-NEXT:    [[TMP8:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX2-NEXT:    [[TMP9:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD22]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX2-NEXT:    [[TMP10:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD23]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX2-NEXT:    [[TMP11:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD24]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX2-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[INDEX]]
+; AVX2-NEXT:    [[TMP13:%.*]] = bitcast i32 addrspace(1)* [[TMP12]] to <8 x i32> addrspace(1)*
+; AVX2-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP13]], i32 4, <8 x i1> [[TMP8]], <8 x i32> undef), !alias.scope !14
+; AVX2-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP12]], i64 8
+; AVX2-NEXT:    [[TMP15:%.*]] = bitcast i32 addrspace(1)* [[TMP14]] to <8 x i32> addrspace(1)*
+; AVX2-NEXT:    [[WIDE_MASKED_LOAD25:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP15]], i32 4, <8 x i1> [[TMP9]], <8 x i32> undef), !alias.scope !14
+; AVX2-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP12]], i64 16
+; AVX2-NEXT:    [[TMP17:%.*]] = bitcast i32 addrspace(1)* [[TMP16]] to <8 x i32> addrspace(1)*
+; AVX2-NEXT:    [[WIDE_MASKED_LOAD26:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP17]], i32 4, <8 x i1> [[TMP10]], <8 x i32> undef), !alias.scope !14
+; AVX2-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP12]], i64 24
+; AVX2-NEXT:    [[TMP19:%.*]] = bitcast i32 addrspace(1)* [[TMP18]] to <8 x i32> addrspace(1)*
+; AVX2-NEXT:    [[WIDE_MASKED_LOAD27:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP19]], i32 4, <8 x i1> [[TMP11]], <8 x i32> undef), !alias.scope !14
+; AVX2-NEXT:    [[TMP20:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]]
+; AVX2-NEXT:    [[TMP21:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD25]], [[WIDE_LOAD22]]
+; AVX2-NEXT:    [[TMP22:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD26]], [[WIDE_LOAD23]]
+; AVX2-NEXT:    [[TMP23:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD27]], [[WIDE_LOAD24]]
+; AVX2-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[INDEX]]
+; AVX2-NEXT:    [[TMP25:%.*]] = bitcast i32 addrspace(1)* [[TMP24]] to <8 x i32> addrspace(1)*
+; AVX2-NEXT:    call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP20]], <8 x i32> addrspace(1)* [[TMP25]], i32 4, <8 x i1> [[TMP8]]), !alias.scope !16, !noalias !18
+; AVX2-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP24]], i64 8
+; AVX2-NEXT:    [[TMP27:%.*]] = bitcast i32 addrspace(1)* [[TMP26]] to <8 x i32> addrspace(1)*
+; AVX2-NEXT:    call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP21]], <8 x i32> addrspace(1)* [[TMP27]], i32 4, <8 x i1> [[TMP9]]), !alias.scope !16, !noalias !18
+; AVX2-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP24]], i64 16
+; AVX2-NEXT:    [[TMP29:%.*]] = bitcast i32 addrspace(1)* [[TMP28]] to <8 x i32> addrspace(1)*
+; AVX2-NEXT:    call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP22]], <8 x i32> addrspace(1)* [[TMP29]], i32 4, <8 x i1> [[TMP10]]), !alias.scope !16, !noalias !18
+; AVX2-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP24]], i64 24
+; AVX2-NEXT:    [[TMP31:%.*]] = bitcast i32 addrspace(1)* [[TMP30]] to <8 x i32> addrspace(1)*
+; AVX2-NEXT:    call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP23]], <8 x i32> addrspace(1)* [[TMP31]], i32 4, <8 x i1> [[TMP11]]), !alias.scope !16, !noalias !18
+; AVX2-NEXT:    [[INDEX_NEXT:%.*]] = or i64 [[INDEX]], 32
+; AVX2-NEXT:    [[TMP32:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[INDEX_NEXT]]
+; AVX2-NEXT:    [[TMP33:%.*]] = bitcast i32 addrspace(1)* [[TMP32]] to <8 x i32> addrspace(1)*
+; AVX2-NEXT:    [[WIDE_LOAD_1:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP33]], align 4, !alias.scope !11
+; AVX2-NEXT:    [[TMP34:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP32]], i64 8
+; AVX2-NEXT:    [[TMP35:%.*]] = bitcast i32 addrspace(1)* [[TMP34]] to <8 x i32> addrspace(1)*
+; AVX2-NEXT:    [[WIDE_LOAD22_1:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP35]], align 4, !alias.scope !11
+; AVX2-NEXT:    [[TMP36:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP32]], i64 16
+; AVX2-NEXT:    [[TMP37:%.*]] = bitcast i32 addrspace(1)* [[TMP36]] to <8 x i32> addrspace(1)*
+; AVX2-NEXT:    [[WIDE_LOAD23_1:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP37]], align 4, !alias.scope !11
+; AVX2-NEXT:    [[TMP38:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP32]], i64 24
+; AVX2-NEXT:    [[TMP39:%.*]] = bitcast i32 addrspace(1)* [[TMP38]] to <8 x i32> addrspace(1)*
+; AVX2-NEXT:    [[WIDE_LOAD24_1:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP39]], align 4, !alias.scope !11
+; AVX2-NEXT:    [[TMP40:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD_1]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX2-NEXT:    [[TMP41:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD22_1]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX2-NEXT:    [[TMP42:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD23_1]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX2-NEXT:    [[TMP43:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD24_1]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX2-NEXT:    [[TMP44:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[INDEX_NEXT]]
+; AVX2-NEXT:    [[TMP45:%.*]] = bitcast i32 addrspace(1)* [[TMP44]] to <8 x i32> addrspace(1)*
+; AVX2-NEXT:    [[WIDE_MASKED_LOAD_1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP45]], i32 4, <8 x i1> [[TMP40]], <8 x i32> undef), !alias.scope !14
+; AVX2-NEXT:    [[TMP46:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP44]], i64 8
+; AVX2-NEXT:    [[TMP47:%.*]] = bitcast i32 addrspace(1)* [[TMP46]] to <8 x i32> addrspace(1)*
+; AVX2-NEXT:    [[WIDE_MASKED_LOAD25_1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP47]], i32 4, <8 x i1> [[TMP41]], <8 x i32> undef), !alias.scope !14
+; AVX2-NEXT:    [[TMP48:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP44]], i64 16
+; AVX2-NEXT:    [[TMP49:%.*]] = bitcast i32 addrspace(1)* [[TMP48]] to <8 x i32> addrspace(1)*
+; AVX2-NEXT:    [[WIDE_MASKED_LOAD26_1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP49]], i32 4, <8 x i1> [[TMP42]], <8 x i32> undef), !alias.scope !14
+; AVX2-NEXT:    [[TMP50:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP44]], i64 24
+; AVX2-NEXT:    [[TMP51:%.*]] = bitcast i32 addrspace(1)* [[TMP50]] to <8 x i32> addrspace(1)*
+; AVX2-NEXT:    [[WIDE_MASKED_LOAD27_1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP51]], i32 4, <8 x i1> [[TMP43]], <8 x i32> undef), !alias.scope !14
+; AVX2-NEXT:    [[TMP52:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD_1]], [[WIDE_LOAD_1]]
+; AVX2-NEXT:    [[TMP53:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD25_1]], [[WIDE_LOAD22_1]]
+; AVX2-NEXT:    [[TMP54:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD26_1]], [[WIDE_LOAD23_1]]
+; AVX2-NEXT:    [[TMP55:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD27_1]], [[WIDE_LOAD24_1]]
+; AVX2-NEXT:    [[TMP56:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[INDEX_NEXT]]
+; AVX2-NEXT:    [[TMP57:%.*]] = bitcast i32 addrspace(1)* [[TMP56]] to <8 x i32> addrspace(1)*
+; AVX2-NEXT:    call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP52]], <8 x i32> addrspace(1)* [[TMP57]], i32 4, <8 x i1> [[TMP40]]), !alias.scope !16, !noalias !18
+; AVX2-NEXT:    [[TMP58:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP56]], i64 8
+; AVX2-NEXT:    [[TMP59:%.*]] = bitcast i32 addrspace(1)* [[TMP58]] to <8 x i32> addrspace(1)*
+; AVX2-NEXT:    call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP53]], <8 x i32> addrspace(1)* [[TMP59]], i32 4, <8 x i1> [[TMP41]]), !alias.scope !16, !noalias !18
+; AVX2-NEXT:    [[TMP60:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP56]], i64 16
+; AVX2-NEXT:    [[TMP61:%.*]] = bitcast i32 addrspace(1)* [[TMP60]] to <8 x i32> addrspace(1)*
+; AVX2-NEXT:    call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP54]], <8 x i32> addrspace(1)* [[TMP61]], i32 4, <8 x i1> [[TMP42]]), !alias.scope !16, !noalias !18
+; AVX2-NEXT:    [[TMP62:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP56]], i64 24
+; AVX2-NEXT:    [[TMP63:%.*]] = bitcast i32 addrspace(1)* [[TMP62]] to <8 x i32> addrspace(1)*
+; AVX2-NEXT:    call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP55]], <8 x i32> addrspace(1)* [[TMP63]], i32 4, <8 x i1> [[TMP43]]), !alias.scope !16, !noalias !18
+; AVX2-NEXT:    [[INDEX_NEXT_1]] = add nuw nsw i64 [[INDEX]], 64
+; AVX2-NEXT:    [[TMP64:%.*]] = icmp eq i64 [[INDEX_NEXT_1]], 9984
+; AVX2-NEXT:    br i1 [[TMP64]], label [[FOR_BODY_PREHEADER]], label [[VECTOR_BODY]], !llvm.loop !19
+; AVX2:       for.body.preheader:
+; AVX2-NEXT:    [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ 9984, [[VECTOR_BODY]] ]
+; AVX2-NEXT:    br label [[FOR_BODY:%.*]]
+; AVX2:       for.body:
+; AVX2-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT_3:%.*]], [[FOR_INC_3:%.*]] ]
+; AVX2-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[INDVARS_IV]]
+; AVX2-NEXT:    [[TMP65:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX]], align 4
+; AVX2-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP65]], 100
+; AVX2-NEXT:    br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC:%.*]]
+; AVX2:       if.then:
+; AVX2-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[INDVARS_IV]]
+; AVX2-NEXT:    [[TMP66:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX3]], align 4
+; AVX2-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP66]], [[TMP65]]
+; AVX2-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[INDVARS_IV]]
+; AVX2-NEXT:    store i32 [[ADD]], i32 addrspace(1)* [[ARRAYIDX7]], align 4
+; AVX2-NEXT:    br label [[FOR_INC]]
+; AVX2:       for.inc:
+; AVX2-NEXT:    [[INDVARS_IV_NEXT:%.*]] = or i64 [[INDVARS_IV]], 1
+; AVX2-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[INDVARS_IV_NEXT]]
+; AVX2-NEXT:    [[TMP67:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX_1]], align 4
+; AVX2-NEXT:    [[CMP1_1:%.*]] = icmp slt i32 [[TMP67]], 100
+; AVX2-NEXT:    br i1 [[CMP1_1]], label [[IF_THEN_1:%.*]], label [[FOR_INC_1:%.*]]
+; AVX2:       for.end:
+; AVX2-NEXT:    ret void
+; AVX2:       if.then.1:
+; AVX2-NEXT:    [[ARRAYIDX3_1:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[INDVARS_IV_NEXT]]
+; AVX2-NEXT:    [[TMP68:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX3_1]], align 4
+; AVX2-NEXT:    [[ADD_1:%.*]] = add nsw i32 [[TMP68]], [[TMP67]]
+; AVX2-NEXT:    [[ARRAYIDX7_1:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[INDVARS_IV_NEXT]]
+; AVX2-NEXT:    store i32 [[ADD_1]], i32 addrspace(1)* [[ARRAYIDX7_1]], align 4
+; AVX2-NEXT:    br label [[FOR_INC_1]]
+; AVX2:       for.inc.1:
+; AVX2-NEXT:    [[INDVARS_IV_NEXT_1:%.*]] = or i64 [[INDVARS_IV]], 2
+; AVX2-NEXT:    [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_1]]
+; AVX2-NEXT:    [[TMP69:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX_2]], align 4
+; AVX2-NEXT:    [[CMP1_2:%.*]] = icmp slt i32 [[TMP69]], 100
+; AVX2-NEXT:    br i1 [[CMP1_2]], label [[IF_THEN_2:%.*]], label [[FOR_INC_2:%.*]]
+; AVX2:       if.then.2:
+; AVX2-NEXT:    [[ARRAYIDX3_2:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[INDVARS_IV_NEXT_1]]
+; AVX2-NEXT:    [[TMP70:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX3_2]], align 4
+; AVX2-NEXT:    [[ADD_2:%.*]] = add nsw i32 [[TMP70]], [[TMP69]]
+; AVX2-NEXT:    [[ARRAYIDX7_2:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[INDVARS_IV_NEXT_1]]
+; AVX2-NEXT:    store i32 [[ADD_2]], i32 addrspace(1)* [[ARRAYIDX7_2]], align 4
+; AVX2-NEXT:    br label [[FOR_INC_2]]
+; AVX2:       for.inc.2:
+; AVX2-NEXT:    [[INDVARS_IV_NEXT_2:%.*]] = or i64 [[INDVARS_IV]], 3
+; AVX2-NEXT:    [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_2]]
+; AVX2-NEXT:    [[TMP71:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX_3]], align 4
+; AVX2-NEXT:    [[CMP1_3:%.*]] = icmp slt i32 [[TMP71]], 100
+; AVX2-NEXT:    br i1 [[CMP1_3]], label [[IF_THEN_3:%.*]], label [[FOR_INC_3]]
+; AVX2:       if.then.3:
+; AVX2-NEXT:    [[ARRAYIDX3_3:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[INDVARS_IV_NEXT_2]]
+; AVX2-NEXT:    [[TMP72:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX3_3]], align 4
+; AVX2-NEXT:    [[ADD_3:%.*]] = add nsw i32 [[TMP72]], [[TMP71]]
+; AVX2-NEXT:    [[ARRAYIDX7_3:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[INDVARS_IV_NEXT_2]]
+; AVX2-NEXT:    store i32 [[ADD_3]], i32 addrspace(1)* [[ARRAYIDX7_3]], align 4
+; AVX2-NEXT:    br label [[FOR_INC_3]]
+; AVX2:       for.inc.3:
+; AVX2-NEXT:    [[INDVARS_IV_NEXT_3]] = add nuw nsw i64 [[INDVARS_IV]], 4
+; AVX2-NEXT:    [[EXITCOND_3:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_3]], 10000
+; AVX2-NEXT:    br i1 [[EXITCOND_3]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop !20
+;
+; AVX512-LABEL: @foo1_addrspace1(
+; AVX512-NEXT:  entry:
+; AVX512-NEXT:    [[SCEVGEP:%.*]] = getelementptr i32, i32 addrspace(1)* [[A:%.*]], i64 10000
+; AVX512-NEXT:    [[SCEVGEP11:%.*]] = getelementptr i32, i32 addrspace(1)* [[TRIGGER:%.*]], i64 10000
+; AVX512-NEXT:    [[SCEVGEP14:%.*]] = getelementptr i32, i32 addrspace(1)* [[B:%.*]], i64 10000
+; AVX512-NEXT:    [[BOUND0:%.*]] = icmp ugt i32 addrspace(1)* [[SCEVGEP11]], [[A]]
+; AVX512-NEXT:    [[BOUND1:%.*]] = icmp ugt i32 addrspace(1)* [[SCEVGEP]], [[TRIGGER]]
+; AVX512-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; AVX512-NEXT:    [[BOUND016:%.*]] = icmp ugt i32 addrspace(1)* [[SCEVGEP14]], [[A]]
+; AVX512-NEXT:    [[BOUND117:%.*]] = icmp ugt i32 addrspace(1)* [[SCEVGEP]], [[B]]
+; AVX512-NEXT:    [[FOUND_CONFLICT18:%.*]] = and i1 [[BOUND016]], [[BOUND117]]
+; AVX512-NEXT:    [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT18]]
+; AVX512-NEXT:    br i1 [[CONFLICT_RDX]], label [[FOR_BODY_PREHEADER:%.*]], label [[VECTOR_BODY:%.*]]
+; AVX512:       vector.body:
+; AVX512-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT_1:%.*]], [[VECTOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
+; AVX512-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[INDEX]]
+; AVX512-NEXT:    [[TMP1:%.*]] = bitcast i32 addrspace(1)* [[TMP0]] to <16 x i32> addrspace(1)*
+; AVX512-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i32>, <16 x i32> addrspace(1)* [[TMP1]], align 4, !alias.scope !11
+; AVX512-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP0]], i64 16
+; AVX512-NEXT:    [[TMP3:%.*]] = bitcast i32 addrspace(1)* [[TMP2]] to <16 x i32> addrspace(1)*
+; AVX512-NEXT:    [[WIDE_LOAD22:%.*]] = load <16 x i32>, <16 x i32> addrspace(1)* [[TMP3]], align 4, !alias.scope !11
+; AVX512-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP0]], i64 32
+; AVX512-NEXT:    [[TMP5:%.*]] = bitcast i32 addrspace(1)* [[TMP4]] to <16 x i32> addrspace(1)*
+; AVX512-NEXT:    [[WIDE_LOAD23:%.*]] = load <16 x i32>, <16 x i32> addrspace(1)* [[TMP5]], align 4, !alias.scope !11
+; AVX512-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP0]], i64 48
+; AVX512-NEXT:    [[TMP7:%.*]] = bitcast i32 addrspace(1)* [[TMP6]] to <16 x i32> addrspace(1)*
+; AVX512-NEXT:    [[WIDE_LOAD24:%.*]] = load <16 x i32>, <16 x i32> addrspace(1)* [[TMP7]], align 4, !alias.scope !11
+; AVX512-NEXT:    [[TMP8:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX512-NEXT:    [[TMP9:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD22]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX512-NEXT:    [[TMP10:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD23]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX512-NEXT:    [[TMP11:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD24]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX512-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[INDEX]]
+; AVX512-NEXT:    [[TMP13:%.*]] = bitcast i32 addrspace(1)* [[TMP12]] to <16 x i32> addrspace(1)*
+; AVX512-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x i32> addrspace(1)* [[TMP13]], i32 4, <16 x i1> [[TMP8]], <16 x i32> undef), !alias.scope !14
+; AVX512-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP12]], i64 16
+; AVX512-NEXT:    [[TMP15:%.*]] = bitcast i32 addrspace(1)* [[TMP14]] to <16 x i32> addrspace(1)*
+; AVX512-NEXT:    [[WIDE_MASKED_LOAD25:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x i32> addrspace(1)* [[TMP15]], i32 4, <16 x i1> [[TMP9]], <16 x i32> undef), !alias.scope !14
+; AVX512-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP12]], i64 32
+; AVX512-NEXT:    [[TMP17:%.*]] = bitcast i32 addrspace(1)* [[TMP16]] to <16 x i32> addrspace(1)*
+; AVX512-NEXT:    [[WIDE_MASKED_LOAD26:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x i32> addrspace(1)* [[TMP17]], i32 4, <16 x i1> [[TMP10]], <16 x i32> undef), !alias.scope !14
+; AVX512-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP12]], i64 48
+; AVX512-NEXT:    [[TMP19:%.*]] = bitcast i32 addrspace(1)* [[TMP18]] to <16 x i32> addrspace(1)*
+; AVX512-NEXT:    [[WIDE_MASKED_LOAD27:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x i32> addrspace(1)* [[TMP19]], i32 4, <16 x i1> [[TMP11]], <16 x i32> undef), !alias.scope !14
+; AVX512-NEXT:    [[TMP20:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]]
+; AVX512-NEXT:    [[TMP21:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD25]], [[WIDE_LOAD22]]
+; AVX512-NEXT:    [[TMP22:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD26]], [[WIDE_LOAD23]]
+; AVX512-NEXT:    [[TMP23:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD27]], [[WIDE_LOAD24]]
+; AVX512-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[INDEX]]
+; AVX512-NEXT:    [[TMP25:%.*]] = bitcast i32 addrspace(1)* [[TMP24]] to <16 x i32> addrspace(1)*
+; AVX512-NEXT:    call void @llvm.masked.store.v16i32.p1v16i32(<16 x i32> [[TMP20]], <16 x i32> addrspace(1)* [[TMP25]], i32 4, <16 x i1> [[TMP8]]), !alias.scope !16, !noalias !18
+; AVX512-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP24]], i64 16
+; AVX512-NEXT:    [[TMP27:%.*]] = bitcast i32 addrspace(1)* [[TMP26]] to <16 x i32> addrspace(1)*
+; AVX512-NEXT:    call void @llvm.masked.store.v16i32.p1v16i32(<16 x i32> [[TMP21]], <16 x i32> addrspace(1)* [[TMP27]], i32 4, <16 x i1> [[TMP9]]), !alias.scope !16, !noalias !18
+; AVX512-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP24]], i64 32
+; AVX512-NEXT:    [[TMP29:%.*]] = bitcast i32 addrspace(1)* [[TMP28]] to <16 x i32> addrspace(1)*
+; AVX512-NEXT:    call void @llvm.masked.store.v16i32.p1v16i32(<16 x i32> [[TMP22]], <16 x i32> addrspace(1)* [[TMP29]], i32 4, <16 x i1> [[TMP10]]), !alias.scope !16, !noalias !18
+; AVX512-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP24]], i64 48
+; AVX512-NEXT:    [[TMP31:%.*]] = bitcast i32 addrspace(1)* [[TMP30]] to <16 x i32> addrspace(1)*
+; AVX512-NEXT:    call void @llvm.masked.store.v16i32.p1v16i32(<16 x i32> [[TMP23]], <16 x i32> addrspace(1)* [[TMP31]], i32 4, <16 x i1> [[TMP11]]), !alias.scope !16, !noalias !18
+; AVX512-NEXT:    [[INDEX_NEXT:%.*]] = or i64 [[INDEX]], 64
+; AVX512-NEXT:    [[TMP32:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[INDEX_NEXT]]
+; AVX512-NEXT:    [[TMP33:%.*]] = bitcast i32 addrspace(1)* [[TMP32]] to <16 x i32> addrspace(1)*
+; AVX512-NEXT:    [[WIDE_LOAD_1:%.*]] = load <16 x i32>, <16 x i32> addrspace(1)* [[TMP33]], align 4, !alias.scope !11
+; AVX512-NEXT:    [[TMP34:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP32]], i64 16
+; AVX512-NEXT:    [[TMP35:%.*]] = bitcast i32 addrspace(1)* [[TMP34]] to <16 x i32> addrspace(1)*
+; AVX512-NEXT:    [[WIDE_LOAD22_1:%.*]] = load <16 x i32>, <16 x i32> addrspace(1)* [[TMP35]], align 4, !alias.scope !11
+; AVX512-NEXT:    [[TMP36:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP32]], i64 32
+; AVX512-NEXT:    [[TMP37:%.*]] = bitcast i32 addrspace(1)* [[TMP36]] to <16 x i32> addrspace(1)*
+; AVX512-NEXT:    [[WIDE_LOAD23_1:%.*]] = load <16 x i32>, <16 x i32> addrspace(1)* [[TMP37]], align 4, !alias.scope !11
+; AVX512-NEXT:    [[TMP38:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP32]], i64 48
+; AVX512-NEXT:    [[TMP39:%.*]] = bitcast i32 addrspace(1)* [[TMP38]] to <16 x i32> addrspace(1)*
+; AVX512-NEXT:    [[WIDE_LOAD24_1:%.*]] = load <16 x i32>, <16 x i32> addrspace(1)* [[TMP39]], align 4, !alias.scope !11
+; AVX512-NEXT:    [[TMP40:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD_1]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX512-NEXT:    [[TMP41:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD22_1]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX512-NEXT:    [[TMP42:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD23_1]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX512-NEXT:    [[TMP43:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD24_1]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX512-NEXT:    [[TMP44:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[INDEX_NEXT]]
+; AVX512-NEXT:    [[TMP45:%.*]] = bitcast i32 addrspace(1)* [[TMP44]] to <16 x i32> addrspace(1)*
+; AVX512-NEXT:    [[WIDE_MASKED_LOAD_1:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x i32> addrspace(1)* [[TMP45]], i32 4, <16 x i1> [[TMP40]], <16 x i32> undef), !alias.scope !14
+; AVX512-NEXT:    [[TMP46:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP44]], i64 16
+; AVX512-NEXT:    [[TMP47:%.*]] = bitcast i32 addrspace(1)* [[TMP46]] to <16 x i32> addrspace(1)*
+; AVX512-NEXT:    [[WIDE_MASKED_LOAD25_1:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x i32> addrspace(1)* [[TMP47]], i32 4, <16 x i1> [[TMP41]], <16 x i32> undef), !alias.scope !14
+; AVX512-NEXT:    [[TMP48:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP44]], i64 32
+; AVX512-NEXT:    [[TMP49:%.*]] = bitcast i32 addrspace(1)* [[TMP48]] to <16 x i32> addrspace(1)*
+; AVX512-NEXT:    [[WIDE_MASKED_LOAD26_1:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x i32> addrspace(1)* [[TMP49]], i32 4, <16 x i1> [[TMP42]], <16 x i32> undef), !alias.scope !14
+; AVX512-NEXT:    [[TMP50:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP44]], i64 48
+; AVX512-NEXT:    [[TMP51:%.*]] = bitcast i32 addrspace(1)* [[TMP50]] to <16 x i32> addrspace(1)*
+; AVX512-NEXT:    [[WIDE_MASKED_LOAD27_1:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x i32> addrspace(1)* [[TMP51]], i32 4, <16 x i1> [[TMP43]], <16 x i32> undef), !alias.scope !14
+; AVX512-NEXT:    [[TMP52:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD_1]], [[WIDE_LOAD_1]]
+; AVX512-NEXT:    [[TMP53:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD25_1]], [[WIDE_LOAD22_1]]
+; AVX512-NEXT:    [[TMP54:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD26_1]], [[WIDE_LOAD23_1]]
+; AVX512-NEXT:    [[TMP55:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD27_1]], [[WIDE_LOAD24_1]]
+; AVX512-NEXT:    [[TMP56:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[INDEX_NEXT]]
+; AVX512-NEXT:    [[TMP57:%.*]] = bitcast i32 addrspace(1)* [[TMP56]] to <16 x i32> addrspace(1)*
+; AVX512-NEXT:    call void @llvm.masked.store.v16i32.p1v16i32(<16 x i32> [[TMP52]], <16 x i32> addrspace(1)* [[TMP57]], i32 4, <16 x i1> [[TMP40]]), !alias.scope !16, !noalias !18
+; AVX512-NEXT:    [[TMP58:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP56]], i64 16
+; AVX512-NEXT:    [[TMP59:%.*]] = bitcast i32 addrspace(1)* [[TMP58]] to <16 x i32> addrspace(1)*
+; AVX512-NEXT:    call void @llvm.masked.store.v16i32.p1v16i32(<16 x i32> [[TMP53]], <16 x i32> addrspace(1)* [[TMP59]], i32 4, <16 x i1> [[TMP41]]), !alias.scope !16, !noalias !18
+; AVX512-NEXT:    [[TMP60:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP56]], i64 32
+; AVX512-NEXT:    [[TMP61:%.*]] = bitcast i32 addrspace(1)* [[TMP60]] to <16 x i32> addrspace(1)*
+; AVX512-NEXT:    call void @llvm.masked.store.v16i32.p1v16i32(<16 x i32> [[TMP54]], <16 x i32> addrspace(1)* [[TMP61]], i32 4, <16 x i1> [[TMP42]]), !alias.scope !16, !noalias !18
+; AVX512-NEXT:    [[TMP62:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP56]], i64 48
+; AVX512-NEXT:    [[TMP63:%.*]] = bitcast i32 addrspace(1)* [[TMP62]] to <16 x i32> addrspace(1)*
+; AVX512-NEXT:    call void @llvm.masked.store.v16i32.p1v16i32(<16 x i32> [[TMP55]], <16 x i32> addrspace(1)* [[TMP63]], i32 4, <16 x i1> [[TMP43]]), !alias.scope !16, !noalias !18
+; AVX512-NEXT:    [[INDEX_NEXT_1]] = add nuw nsw i64 [[INDEX]], 128
+; AVX512-NEXT:    [[TMP64:%.*]] = icmp eq i64 [[INDEX_NEXT_1]], 9984
+; AVX512-NEXT:    br i1 [[TMP64]], label [[FOR_BODY_PREHEADER]], label [[VECTOR_BODY]], !llvm.loop !19
+; AVX512:       for.body.preheader:
+; AVX512-NEXT:    [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ 9984, [[VECTOR_BODY]] ]
+; AVX512-NEXT:    br label [[FOR_BODY:%.*]]
+; AVX512:       for.body:
+; AVX512-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT_3:%.*]], [[FOR_INC_3:%.*]] ]
+; AVX512-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[INDVARS_IV]]
+; AVX512-NEXT:    [[TMP65:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX]], align 4
+; AVX512-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP65]], 100
+; AVX512-NEXT:    br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC:%.*]]
+; AVX512:       if.then:
+; AVX512-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[INDVARS_IV]]
+; AVX512-NEXT:    [[TMP66:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX3]], align 4
+; AVX512-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP66]], [[TMP65]]
+; AVX512-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[INDVARS_IV]]
+; AVX512-NEXT:    store i32 [[ADD]], i32 addrspace(1)* [[ARRAYIDX7]], align 4
+; AVX512-NEXT:    br label [[FOR_INC]]
+; AVX512:       for.inc:
+; AVX512-NEXT:    [[INDVARS_IV_NEXT:%.*]] = or i64 [[INDVARS_IV]], 1
+; AVX512-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[INDVARS_IV_NEXT]]
+; AVX512-NEXT:    [[TMP67:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX_1]], align 4
+; AVX512-NEXT:    [[CMP1_1:%.*]] = icmp slt i32 [[TMP67]], 100
+; AVX512-NEXT:    br i1 [[CMP1_1]], label [[IF_THEN_1:%.*]], label [[FOR_INC_1:%.*]]
+; AVX512:       for.end:
+; AVX512-NEXT:    ret void
+; AVX512:       if.then.1:
+; AVX512-NEXT:    [[ARRAYIDX3_1:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[INDVARS_IV_NEXT]]
+; AVX512-NEXT:    [[TMP68:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX3_1]], align 4
+; AVX512-NEXT:    [[ADD_1:%.*]] = add nsw i32 [[TMP68]], [[TMP67]]
+; AVX512-NEXT:    [[ARRAYIDX7_1:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[INDVARS_IV_NEXT]]
+; AVX512-NEXT:    store i32 [[ADD_1]], i32 addrspace(1)* [[ARRAYIDX7_1]], align 4
+; AVX512-NEXT:    br label [[FOR_INC_1]]
+; AVX512:       for.inc.1:
+; AVX512-NEXT:    [[INDVARS_IV_NEXT_1:%.*]] = or i64 [[INDVARS_IV]], 2
+; AVX512-NEXT:    [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_1]]
+; AVX512-NEXT:    [[TMP69:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX_2]], align 4
+; AVX512-NEXT:    [[CMP1_2:%.*]] = icmp slt i32 [[TMP69]], 100
+; AVX512-NEXT:    br i1 [[CMP1_2]], label [[IF_THEN_2:%.*]], label [[FOR_INC_2:%.*]]
+; AVX512:       if.then.2:
+; AVX512-NEXT:    [[ARRAYIDX3_2:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[INDVARS_IV_NEXT_1]]
+; AVX512-NEXT:    [[TMP70:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX3_2]], align 4
+; AVX512-NEXT:    [[ADD_2:%.*]] = add nsw i32 [[TMP70]], [[TMP69]]
+; AVX512-NEXT:    [[ARRAYIDX7_2:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[INDVARS_IV_NEXT_1]]
+; AVX512-NEXT:    store i32 [[ADD_2]], i32 addrspace(1)* [[ARRAYIDX7_2]], align 4
+; AVX512-NEXT:    br label [[FOR_INC_2]]
+; AVX512:       for.inc.2:
+; AVX512-NEXT:    [[INDVARS_IV_NEXT_2:%.*]] = or i64 [[INDVARS_IV]], 3
+; AVX512-NEXT:    [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_2]]
+; AVX512-NEXT:    [[TMP71:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX_3]], align 4
+; AVX512-NEXT:    [[CMP1_3:%.*]] = icmp slt i32 [[TMP71]], 100
+; AVX512-NEXT:    br i1 [[CMP1_3]], label [[IF_THEN_3:%.*]], label [[FOR_INC_3]]
+; AVX512:       if.then.3:
+; AVX512-NEXT:    [[ARRAYIDX3_3:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[INDVARS_IV_NEXT_2]]
+; AVX512-NEXT:    [[TMP72:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX3_3]], align 4
+; AVX512-NEXT:    [[ADD_3:%.*]] = add nsw i32 [[TMP72]], [[TMP71]]
+; AVX512-NEXT:    [[ARRAYIDX7_3:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[INDVARS_IV_NEXT_2]]
+; AVX512-NEXT:    store i32 [[ADD_3]], i32 addrspace(1)* [[ARRAYIDX7_3]], align 4
+; AVX512-NEXT:    br label [[FOR_INC_3]]
+; AVX512:       for.inc.3:
+; AVX512-NEXT:    [[INDVARS_IV_NEXT_3]] = add nuw nsw i64 [[INDVARS_IV]], 4
+; AVX512-NEXT:    [[EXITCOND_3:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_3]], 10000
+; AVX512-NEXT:    br i1 [[EXITCOND_3]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop !20
+;
+entry:
+  %A.addr = alloca i32 addrspace(1)*, align 8
+  %B.addr = alloca i32 addrspace(1)*, align 8
+  %trigger.addr = alloca i32 addrspace(1)*, align 8
+  %i = alloca i32, align 4
+  store i32 addrspace(1)* %A, i32 addrspace(1)** %A.addr, align 8
+  store i32 addrspace(1)* %B, i32 addrspace(1)** %B.addr, align 8
+  store i32 addrspace(1)* %trigger, i32 addrspace(1)** %trigger.addr, align 8
+  store i32 0, i32* %i, align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %0 = load i32, i32* %i, align 4
+  %cmp = icmp slt i32 %0, 10000
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %1 = load i32, i32* %i, align 4
+  %idxprom = sext i32 %1 to i64
+  %2 = load i32 addrspace(1)*, i32 addrspace(1)** %trigger.addr, align 8
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %2, i64 %idxprom
+  %3 = load i32, i32 addrspace(1)* %arrayidx, align 4
+  %cmp1 = icmp slt i32 %3, 100
+  br i1 %cmp1, label %if.then, label %if.end
+
+if.then:                                          ; preds = %for.body
+  %4 = load i32, i32* %i, align 4
+  %idxprom2 = sext i32 %4 to i64
+  %5 = load i32 addrspace(1)*, i32 addrspace(1)** %B.addr, align 8
+  %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %5, i64 %idxprom2
+  %6 = load i32, i32 addrspace(1)* %arrayidx3, align 4
+  %7 = load i32, i32* %i, align 4
+  %idxprom4 = sext i32 %7 to i64
+  %8 = load i32 addrspace(1)*, i32 addrspace(1)** %trigger.addr, align 8
+  %arrayidx5 = getelementptr inbounds i32, i32 addrspace(1)* %8, i64 %idxprom4
+  %9 = load i32, i32 addrspace(1)* %arrayidx5, align 4
+  %add = add nsw i32 %6, %9
+  %10 = load i32, i32* %i, align 4
+  %idxprom6 = sext i32 %10 to i64
+  %11 = load i32 addrspace(1)*, i32 addrspace(1)** %A.addr, align 8
+  %arrayidx7 = getelementptr inbounds i32, i32 addrspace(1)* %11, i64 %idxprom6
+  store i32 %add, i32 addrspace(1)* %arrayidx7, align 4
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %for.body
+  br label %for.inc
+
+for.inc:                                          ; preds = %if.end
+  %12 = load i32, i32* %i, align 4
+  %inc = add nsw i32 %12, 1
+  store i32 %inc, i32* %i, align 4
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}
+
+; The source code:
+;
+;void foo2(float *A, float *B, int *trigger) {
+;
+;  for (int i=0; i<10000; i++) {
+;    if (trigger[i] < 100) {
+;      A[i] = B[i] + trigger[i];
+;    }
+;  }
+;}
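+
+; Note: since A and B are float while trigger is i32, the vectorized body is
+; expected to use the floating-point masked intrinsics (e.g.
+; @llvm.masked.load.v8f32.p0v8f32 in the AVX1 checks below), with the loaded
+; trigger values converted to float before the add.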
+
+; Function Attrs: nounwind uwtable
+define void @foo2(float* %A, float* %B, i32* %trigger) {
+; AVX1-LABEL: @foo2(
+; AVX1-NEXT:  entry:
+; AVX1-NEXT:    [[SCEVGEP:%.*]] = getelementptr float, float* [[A:%.*]], i64 10000
+; AVX1-NEXT:    [[SCEVGEP11:%.*]] = getelementptr i32, i32* [[TRIGGER:%.*]], i64 10000
+; AVX1-NEXT:    [[SCEVGEP14:%.*]] = getelementptr float, float* [[B:%.*]], i64 10000
+; AVX1-NEXT:    [[TMP0:%.*]] = bitcast i32* [[SCEVGEP11]] to float*
+; AVX1-NEXT:    [[BOUND0:%.*]] = icmp ugt float* [[TMP0]], [[A]]
+; AVX1-NEXT:    [[TMP1:%.*]] = bitcast float* [[SCEVGEP]] to i32*
+; AVX1-NEXT:    [[BOUND1:%.*]] = icmp ugt i32* [[TMP1]], [[TRIGGER]]
+; AVX1-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; AVX1-NEXT:    [[BOUND016:%.*]] = icmp ugt float* [[SCEVGEP14]], [[A]]
+; AVX1-NEXT:    [[BOUND117:%.*]] = icmp ugt float* [[SCEVGEP]], [[B]]
+; AVX1-NEXT:    [[FOUND_CONFLICT18:%.*]] = and i1 [[BOUND016]], [[BOUND117]]
+; AVX1-NEXT:    [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT18]]
+; AVX1-NEXT:    br i1 [[CONFLICT_RDX]], label [[FOR_BODY_PREHEADER:%.*]], label [[VECTOR_BODY:%.*]]
+; AVX1:       vector.body:
+; AVX1-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
+; AVX1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDEX]]
+; AVX1-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <8 x i32>*
+; AVX1-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP3]], align 4, !alias.scope !21
+; AVX1-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 8
+; AVX1-NEXT:    [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <8 x i32>*
+; AVX1-NEXT:    [[WIDE_LOAD22:%.*]] = load <8 x i32>, <8 x i32>* [[TMP5]], align 4, !alias.scope !21
+; AVX1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 16
+; AVX1-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <8 x i32>*
+; AVX1-NEXT:    [[WIDE_LOAD23:%.*]] = load <8 x i32>, <8 x i32>* [[TMP7]], align 4, !alias.scope !21
+; AVX1-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 24
+; AVX1-NEXT:    [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <8 x i32>*
+; AVX1-NEXT:    [[WIDE_LOAD24:%.*]] = load <8 x i32>, <8 x i32>* [[TMP9]], align 4, !alias.scope !21
+; AVX1-NEXT:    [[TMP10:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX1-NEXT:    [[TMP11:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD22]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX1-NEXT:    [[TMP12:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD23]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX1-NEXT:    [[TMP13:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD24]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX1-NEXT:    [[TMP14:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDEX]]
+; AVX1-NEXT:    [[TMP15:%.*]] = bitcast float* [[TMP14]] to <8 x float>*
+; AVX1-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP15]], i32 4, <8 x i1> [[TMP10]], <8 x float> undef), !alias.scope !24
+; AVX1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds float, float* [[TMP14]], i64 8
+; AVX1-NEXT:    [[TMP17:%.*]] = bitcast float* [[TMP16]] to <8 x float>*
+; AVX1-NEXT:    [[WIDE_MASKED_LOAD25:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* nonnull [[TMP17]], i32 4, <8 x i1> [[TMP11]], <8 x float> undef), !alias.scope !24
+; AVX1-NEXT:    [[TMP18:%.*]] = getelementptr inbounds float, float* [[TMP14]], i64 16
+; AVX1-NEXT:    [[TMP19:%.*]] = bitcast float* [[TMP18]] to <8 x float>*
+; AVX1-NEXT:    [[WIDE_MASKED_LOAD26:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* nonnull [[TMP19]], i32 4, <8 x i1> [[TMP12]], <8 x float> undef), !alias.scope !24
+; AVX1-NEXT:    [[TMP20:%.*]] = getelementptr inbounds float, float* [[TMP14]], i64 24
+; AVX1-NEXT:    [[TMP21:%.*]] = bitcast float* [[TMP20]] to <8 x float>*
+; AVX1-NEXT:    [[WIDE_MASKED_LOAD27:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* nonnull [[TMP21]], i32 4, <8 x i1> [[TMP13]], <8 x float> undef), !alias.scope !24
+; AVX1-NEXT:    [[TMP22:%.*]] = sitofp <8 x i32> [[WIDE_LOAD]] to <8 x float>
+; AVX1-NEXT:    [[TMP23:%.*]] = sitofp <8 x i32> [[WIDE_LOAD22]] to <8 x float>
+; AVX1-NEXT:    [[TMP24:%.*]] = sitofp <8 x i32> [[WIDE_LOAD23]] to <8 x float>
+; AVX1-NEXT:    [[TMP25:%.*]] = sitofp <8 x i32> [[WIDE_LOAD24]] to <8 x float>
+; AVX1-NEXT:    [[TMP26:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD]], [[TMP22]]
+; AVX1-NEXT:    [[TMP27:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD25]], [[TMP23]]
+; AVX1-NEXT:    [[TMP28:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD26]], [[TMP24]]
+; AVX1-NEXT:    [[TMP29:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD27]], [[TMP25]]
+; AVX1-NEXT:    [[TMP30:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDEX]]
+; AVX1-NEXT:    [[TMP31:%.*]] = bitcast float* [[TMP30]] to <8 x float>*
+; AVX1-NEXT:    call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP26]], <8 x float>* [[TMP31]], i32 4, <8 x i1> [[TMP10]]), !alias.scope !26, !noalias !28
+; AVX1-NEXT:    [[TMP32:%.*]] = getelementptr inbounds float, float* [[TMP30]], i64 8
+; AVX1-NEXT:    [[TMP33:%.*]] = bitcast float* [[TMP32]] to <8 x float>*
+; AVX1-NEXT:    call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP27]], <8 x float>* [[TMP33]], i32 4, <8 x i1> [[TMP11]]), !alias.scope !26, !noalias !28
+; AVX1-NEXT:    [[TMP34:%.*]] = getelementptr inbounds float, float* [[TMP30]], i64 16
+; AVX1-NEXT:    [[TMP35:%.*]] = bitcast float* [[TMP34]] to <8 x float>*
+; AVX1-NEXT:    call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP28]], <8 x float>* [[TMP35]], i32 4, <8 x i1> [[TMP12]]), !alias.scope !26, !noalias !28
+; AVX1-NEXT:    [[TMP36:%.*]] = getelementptr inbounds float, float* [[TMP30]], i64 24
+; AVX1-NEXT:    [[TMP37:%.*]] = bitcast float* [[TMP36]] to <8 x float>*
+; AVX1-NEXT:    call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP29]], <8 x float>* [[TMP37]], i32 4, <8 x i1> [[TMP13]]), !alias.scope !26, !noalias !28
+; AVX1-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 32
+; AVX1-NEXT:    [[TMP38:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984
+; AVX1-NEXT:    br i1 [[TMP38]], label [[FOR_BODY_PREHEADER]], label [[VECTOR_BODY]], !llvm.loop !29
+; AVX1:       for.body.preheader:
+; AVX1-NEXT:    [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ 9984, [[VECTOR_BODY]] ]
+; AVX1-NEXT:    br label [[FOR_BODY:%.*]]
+; AVX1:       for.body:
+; AVX1-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT_1:%.*]], [[FOR_INC_1:%.*]] ]
+; AVX1-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]]
+; AVX1-NEXT:    [[TMP39:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; AVX1-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP39]], 100
+; AVX1-NEXT:    br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC:%.*]]
+; AVX1:       if.then:
+; AVX1-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV]]
+; AVX1-NEXT:    [[TMP40:%.*]] = load float, float* [[ARRAYIDX3]], align 4
+; AVX1-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP39]] to float
+; AVX1-NEXT:    [[ADD:%.*]] = fadd float [[TMP40]], [[CONV]]
+; AVX1-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]]
+; AVX1-NEXT:    store float [[ADD]], float* [[ARRAYIDX7]], align 4
+; AVX1-NEXT:    br label [[FOR_INC]]
+; AVX1:       for.inc:
+; AVX1-NEXT:    [[INDVARS_IV_NEXT:%.*]] = or i64 [[INDVARS_IV]], 1
+; AVX1-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT]]
+; AVX1-NEXT:    [[TMP41:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4
+; AVX1-NEXT:    [[CMP1_1:%.*]] = icmp slt i32 [[TMP41]], 100
+; AVX1-NEXT:    br i1 [[CMP1_1]], label [[IF_THEN_1:%.*]], label [[FOR_INC_1]]
+; AVX1:       for.end:
+; AVX1-NEXT:    ret void
+; AVX1:       if.then.1:
+; AVX1-NEXT:    [[ARRAYIDX3_1:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV_NEXT]]
+; AVX1-NEXT:    [[TMP42:%.*]] = load float, float* [[ARRAYIDX3_1]], align 4
+; AVX1-NEXT:    [[CONV_1:%.*]] = sitofp i32 [[TMP41]] to float
+; AVX1-NEXT:    [[ADD_1:%.*]] = fadd float [[TMP42]], [[CONV_1]]
+; AVX1-NEXT:    [[ARRAYIDX7_1:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV_NEXT]]
+; AVX1-NEXT:    store float [[ADD_1]], float* [[ARRAYIDX7_1]], align 4
+; AVX1-NEXT:    br label [[FOR_INC_1]]
+; AVX1:       for.inc.1:
+; AVX1-NEXT:    [[INDVARS_IV_NEXT_1]] = add nuw nsw i64 [[INDVARS_IV]], 2
+; AVX1-NEXT:    [[EXITCOND_1:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_1]], 10000
+; AVX1-NEXT:    br i1 [[EXITCOND_1]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop !30
+;
+; AVX2-LABEL: @foo2(
+; AVX2-NEXT:  entry:
+; AVX2-NEXT:    [[SCEVGEP:%.*]] = getelementptr float, float* [[A:%.*]], i64 10000
+; AVX2-NEXT:    [[SCEVGEP11:%.*]] = getelementptr i32, i32* [[TRIGGER:%.*]], i64 10000
+; AVX2-NEXT:    [[SCEVGEP14:%.*]] = getelementptr float, float* [[B:%.*]], i64 10000
+; AVX2-NEXT:    [[TMP0:%.*]] = bitcast i32* [[SCEVGEP11]] to float*
+; AVX2-NEXT:    [[BOUND0:%.*]] = icmp ugt float* [[TMP0]], [[A]]
+; AVX2-NEXT:    [[TMP1:%.*]] = bitcast float* [[SCEVGEP]] to i32*
+; AVX2-NEXT:    [[BOUND1:%.*]] = icmp ugt i32* [[TMP1]], [[TRIGGER]]
+; AVX2-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; AVX2-NEXT:    [[BOUND016:%.*]] = icmp ugt float* [[SCEVGEP14]], [[A]]
+; AVX2-NEXT:    [[BOUND117:%.*]] = icmp ugt float* [[SCEVGEP]], [[B]]
+; AVX2-NEXT:    [[FOUND_CONFLICT18:%.*]] = and i1 [[BOUND016]], [[BOUND117]]
+; AVX2-NEXT:    [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT18]]
+; AVX2-NEXT:    br i1 [[CONFLICT_RDX]], label [[FOR_BODY_PREHEADER:%.*]], label [[VECTOR_BODY:%.*]]
+; AVX2:       vector.body:
+; AVX2-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
+; AVX2-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDEX]]
+; AVX2-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <8 x i32>*
+; AVX2-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP3]], align 4, !alias.scope !21
+; AVX2-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 8
+; AVX2-NEXT:    [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <8 x i32>*
+; AVX2-NEXT:    [[WIDE_LOAD22:%.*]] = load <8 x i32>, <8 x i32>* [[TMP5]], align 4, !alias.scope !21
+; AVX2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 16
+; AVX2-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <8 x i32>*
+; AVX2-NEXT:    [[WIDE_LOAD23:%.*]] = load <8 x i32>, <8 x i32>* [[TMP7]], align 4, !alias.scope !21
+; AVX2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 24
+; AVX2-NEXT:    [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <8 x i32>*
+; AVX2-NEXT:    [[WIDE_LOAD24:%.*]] = load <8 x i32>, <8 x i32>* [[TMP9]], align 4, !alias.scope !21
+; AVX2-NEXT:    [[TMP10:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX2-NEXT:    [[TMP11:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD22]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX2-NEXT:    [[TMP12:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD23]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX2-NEXT:    [[TMP13:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD24]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX2-NEXT:    [[TMP14:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDEX]]
+; AVX2-NEXT:    [[TMP15:%.*]] = bitcast float* [[TMP14]] to <8 x float>*
+; AVX2-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP15]], i32 4, <8 x i1> [[TMP10]], <8 x float> undef), !alias.scope !24
+; AVX2-NEXT:    [[TMP16:%.*]] = getelementptr inbounds float, float* [[TMP14]], i64 8
+; AVX2-NEXT:    [[TMP17:%.*]] = bitcast float* [[TMP16]] to <8 x float>*
+; AVX2-NEXT:    [[WIDE_MASKED_LOAD25:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* nonnull [[TMP17]], i32 4, <8 x i1> [[TMP11]], <8 x float> undef), !alias.scope !24
+; AVX2-NEXT:    [[TMP18:%.*]] = getelementptr inbounds float, float* [[TMP14]], i64 16
+; AVX2-NEXT:    [[TMP19:%.*]] = bitcast float* [[TMP18]] to <8 x float>*
+; AVX2-NEXT:    [[WIDE_MASKED_LOAD26:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* nonnull [[TMP19]], i32 4, <8 x i1> [[TMP12]], <8 x float> undef), !alias.scope !24
+; AVX2-NEXT:    [[TMP20:%.*]] = getelementptr inbounds float, float* [[TMP14]], i64 24
+; AVX2-NEXT:    [[TMP21:%.*]] = bitcast float* [[TMP20]] to <8 x float>*
+; AVX2-NEXT:    [[WIDE_MASKED_LOAD27:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* nonnull [[TMP21]], i32 4, <8 x i1> [[TMP13]], <8 x float> undef), !alias.scope !24
+; AVX2-NEXT:    [[TMP22:%.*]] = sitofp <8 x i32> [[WIDE_LOAD]] to <8 x float>
+; AVX2-NEXT:    [[TMP23:%.*]] = sitofp <8 x i32> [[WIDE_LOAD22]] to <8 x float>
+; AVX2-NEXT:    [[TMP24:%.*]] = sitofp <8 x i32> [[WIDE_LOAD23]] to <8 x float>
+; AVX2-NEXT:    [[TMP25:%.*]] = sitofp <8 x i32> [[WIDE_LOAD24]] to <8 x float>
+; AVX2-NEXT:    [[TMP26:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD]], [[TMP22]]
+; AVX2-NEXT:    [[TMP27:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD25]], [[TMP23]]
+; AVX2-NEXT:    [[TMP28:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD26]], [[TMP24]]
+; AVX2-NEXT:    [[TMP29:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD27]], [[TMP25]]
+; AVX2-NEXT:    [[TMP30:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDEX]]
+; AVX2-NEXT:    [[TMP31:%.*]] = bitcast float* [[TMP30]] to <8 x float>*
+; AVX2-NEXT:    call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP26]], <8 x float>* [[TMP31]], i32 4, <8 x i1> [[TMP10]]), !alias.scope !26, !noalias !28
+; AVX2-NEXT:    [[TMP32:%.*]] = getelementptr inbounds float, float* [[TMP30]], i64 8
+; AVX2-NEXT:    [[TMP33:%.*]] = bitcast float* [[TMP32]] to <8 x float>*
+; AVX2-NEXT:    call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP27]], <8 x float>* [[TMP33]], i32 4, <8 x i1> [[TMP11]]), !alias.scope !26, !noalias !28
+; AVX2-NEXT:    [[TMP34:%.*]] = getelementptr inbounds float, float* [[TMP30]], i64 16
+; AVX2-NEXT:    [[TMP35:%.*]] = bitcast float* [[TMP34]] to <8 x float>*
+; AVX2-NEXT:    call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP28]], <8 x float>* [[TMP35]], i32 4, <8 x i1> [[TMP12]]), !alias.scope !26, !noalias !28
+; AVX2-NEXT:    [[TMP36:%.*]] = getelementptr inbounds float, float* [[TMP30]], i64 24
+; AVX2-NEXT:    [[TMP37:%.*]] = bitcast float* [[TMP36]] to <8 x float>*
+; AVX2-NEXT:    call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP29]], <8 x float>* [[TMP37]], i32 4, <8 x i1> [[TMP13]]), !alias.scope !26, !noalias !28
+; AVX2-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 32
+; AVX2-NEXT:    [[TMP38:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984
+; AVX2-NEXT:    br i1 [[TMP38]], label [[FOR_BODY_PREHEADER]], label [[VECTOR_BODY]], !llvm.loop !29
+; AVX2:       for.body.preheader:
+; AVX2-NEXT:    [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ 9984, [[VECTOR_BODY]] ]
+; AVX2-NEXT:    br label [[FOR_BODY:%.*]]
+; AVX2:       for.body:
+; AVX2-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT_3:%.*]], [[FOR_INC_3:%.*]] ]
+; AVX2-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]]
+; AVX2-NEXT:    [[TMP39:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; AVX2-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP39]], 100
+; AVX2-NEXT:    br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC:%.*]]
+; AVX2:       if.then:
+; AVX2-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV]]
+; AVX2-NEXT:    [[TMP40:%.*]] = load float, float* [[ARRAYIDX3]], align 4
+; AVX2-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP39]] to float
+; AVX2-NEXT:    [[ADD:%.*]] = fadd float [[TMP40]], [[CONV]]
+; AVX2-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]]
+; AVX2-NEXT:    store float [[ADD]], float* [[ARRAYIDX7]], align 4
+; AVX2-NEXT:    br label [[FOR_INC]]
+; AVX2:       for.inc:
+; AVX2-NEXT:    [[INDVARS_IV_NEXT:%.*]] = or i64 [[INDVARS_IV]], 1
+; AVX2-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT]]
+; AVX2-NEXT:    [[TMP41:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4
+; AVX2-NEXT:    [[CMP1_1:%.*]] = icmp slt i32 [[TMP41]], 100
+; AVX2-NEXT:    br i1 [[CMP1_1]], label [[IF_THEN_1:%.*]], label [[FOR_INC_1:%.*]]
+; AVX2:       for.end:
+; AVX2-NEXT:    ret void
+; AVX2:       if.then.1:
+; AVX2-NEXT:    [[ARRAYIDX3_1:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV_NEXT]]
+; AVX2-NEXT:    [[TMP42:%.*]] = load float, float* [[ARRAYIDX3_1]], align 4
+; AVX2-NEXT:    [[CONV_1:%.*]] = sitofp i32 [[TMP41]] to float
+; AVX2-NEXT:    [[ADD_1:%.*]] = fadd float [[TMP42]], [[CONV_1]]
+; AVX2-NEXT:    [[ARRAYIDX7_1:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV_NEXT]]
+; AVX2-NEXT:    store float [[ADD_1]], float* [[ARRAYIDX7_1]], align 4
+; AVX2-NEXT:    br label [[FOR_INC_1]]
+; AVX2:       for.inc.1:
+; AVX2-NEXT:    [[INDVARS_IV_NEXT_1:%.*]] = or i64 [[INDVARS_IV]], 2
+; AVX2-NEXT:    [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_1]]
+; AVX2-NEXT:    [[TMP43:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4
+; AVX2-NEXT:    [[CMP1_2:%.*]] = icmp slt i32 [[TMP43]], 100
+; AVX2-NEXT:    br i1 [[CMP1_2]], label [[IF_THEN_2:%.*]], label [[FOR_INC_2:%.*]]
+; AVX2:       if.then.2:
+; AVX2-NEXT:    [[ARRAYIDX3_2:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV_NEXT_1]]
+; AVX2-NEXT:    [[TMP44:%.*]] = load float, float* [[ARRAYIDX3_2]], align 4
+; AVX2-NEXT:    [[CONV_2:%.*]] = sitofp i32 [[TMP43]] to float
+; AVX2-NEXT:    [[ADD_2:%.*]] = fadd float [[TMP44]], [[CONV_2]]
+; AVX2-NEXT:    [[ARRAYIDX7_2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV_NEXT_1]]
+; AVX2-NEXT:    store float [[ADD_2]], float* [[ARRAYIDX7_2]], align 4
+; AVX2-NEXT:    br label [[FOR_INC_2]]
+; AVX2:       for.inc.2:
+; AVX2-NEXT:    [[INDVARS_IV_NEXT_2:%.*]] = or i64 [[INDVARS_IV]], 3
+; AVX2-NEXT:    [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_2]]
+; AVX2-NEXT:    [[TMP45:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4
+; AVX2-NEXT:    [[CMP1_3:%.*]] = icmp slt i32 [[TMP45]], 100
+; AVX2-NEXT:    br i1 [[CMP1_3]], label [[IF_THEN_3:%.*]], label [[FOR_INC_3]]
+; AVX2:       if.then.3:
+; AVX2-NEXT:    [[ARRAYIDX3_3:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV_NEXT_2]]
+; AVX2-NEXT:    [[TMP46:%.*]] = load float, float* [[ARRAYIDX3_3]], align 4
+; AVX2-NEXT:    [[CONV_3:%.*]] = sitofp i32 [[TMP45]] to float
+; AVX2-NEXT:    [[ADD_3:%.*]] = fadd float [[TMP46]], [[CONV_3]]
+; AVX2-NEXT:    [[ARRAYIDX7_3:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV_NEXT_2]]
+; AVX2-NEXT:    store float [[ADD_3]], float* [[ARRAYIDX7_3]], align 4
+; AVX2-NEXT:    br label [[FOR_INC_3]]
+; AVX2:       for.inc.3:
+; AVX2-NEXT:    [[INDVARS_IV_NEXT_3]] = add nuw nsw i64 [[INDVARS_IV]], 4
+; AVX2-NEXT:    [[EXITCOND_3:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_3]], 10000
+; AVX2-NEXT:    br i1 [[EXITCOND_3]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop !30
+;
+; AVX512-LABEL: @foo2(
+; AVX512-NEXT:  entry:
+; AVX512-NEXT:    [[SCEVGEP:%.*]] = getelementptr float, float* [[A:%.*]], i64 10000
+; AVX512-NEXT:    [[SCEVGEP11:%.*]] = getelementptr i32, i32* [[TRIGGER:%.*]], i64 10000
+; AVX512-NEXT:    [[SCEVGEP14:%.*]] = getelementptr float, float* [[B:%.*]], i64 10000
+; AVX512-NEXT:    [[TMP0:%.*]] = bitcast i32* [[SCEVGEP11]] to float*
+; AVX512-NEXT:    [[BOUND0:%.*]] = icmp ugt float* [[TMP0]], [[A]]
+; AVX512-NEXT:    [[TMP1:%.*]] = bitcast float* [[SCEVGEP]] to i32*
+; AVX512-NEXT:    [[BOUND1:%.*]] = icmp ugt i32* [[TMP1]], [[TRIGGER]]
+; AVX512-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; AVX512-NEXT:    [[BOUND016:%.*]] = icmp ugt float* [[SCEVGEP14]], [[A]]
+; AVX512-NEXT:    [[BOUND117:%.*]] = icmp ugt float* [[SCEVGEP]], [[B]]
+; AVX512-NEXT:    [[FOUND_CONFLICT18:%.*]] = and i1 [[BOUND016]], [[BOUND117]]
+; AVX512-NEXT:    [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT18]]
+; AVX512-NEXT:    br i1 [[CONFLICT_RDX]], label [[FOR_BODY_PREHEADER:%.*]], label [[VECTOR_BODY:%.*]]
+; AVX512:       vector.body:
+; AVX512-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
+; AVX512-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDEX]]
+; AVX512-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <16 x i32>*
+; AVX512-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i32>, <16 x i32>* [[TMP3]], align 4, !alias.scope !21
+; AVX512-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 16
+; AVX512-NEXT:    [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <16 x i32>*
+; AVX512-NEXT:    [[WIDE_LOAD22:%.*]] = load <16 x i32>, <16 x i32>* [[TMP5]], align 4, !alias.scope !21
+; AVX512-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 32
+; AVX512-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <16 x i32>*
+; AVX512-NEXT:    [[WIDE_LOAD23:%.*]] = load <16 x i32>, <16 x i32>* [[TMP7]], align 4, !alias.scope !21
+; AVX512-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 48
+; AVX512-NEXT:    [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <16 x i32>*
+; AVX512-NEXT:    [[WIDE_LOAD24:%.*]] = load <16 x i32>, <16 x i32>* [[TMP9]], align 4, !alias.scope !21
+; AVX512-NEXT:    [[TMP10:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX512-NEXT:    [[TMP11:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD22]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX512-NEXT:    [[TMP12:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD23]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX512-NEXT:    [[TMP13:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD24]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX512-NEXT:    [[TMP14:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDEX]]
+; AVX512-NEXT:    [[TMP15:%.*]] = bitcast float* [[TMP14]] to <16 x float>*
+; AVX512-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* [[TMP15]], i32 4, <16 x i1> [[TMP10]], <16 x float> undef), !alias.scope !24
+; AVX512-NEXT:    [[TMP16:%.*]] = getelementptr inbounds float, float* [[TMP14]], i64 16
+; AVX512-NEXT:    [[TMP17:%.*]] = bitcast float* [[TMP16]] to <16 x float>*
+; AVX512-NEXT:    [[WIDE_MASKED_LOAD25:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* nonnull [[TMP17]], i32 4, <16 x i1> [[TMP11]], <16 x float> undef), !alias.scope !24
+; AVX512-NEXT:    [[TMP18:%.*]] = getelementptr inbounds float, float* [[TMP14]], i64 32
+; AVX512-NEXT:    [[TMP19:%.*]] = bitcast float* [[TMP18]] to <16 x float>*
+; AVX512-NEXT:    [[WIDE_MASKED_LOAD26:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* nonnull [[TMP19]], i32 4, <16 x i1> [[TMP12]], <16 x float> undef), !alias.scope !24
+; AVX512-NEXT:    [[TMP20:%.*]] = getelementptr inbounds float, float* [[TMP14]], i64 48
+; AVX512-NEXT:    [[TMP21:%.*]] = bitcast float* [[TMP20]] to <16 x float>*
+; AVX512-NEXT:    [[WIDE_MASKED_LOAD27:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* nonnull [[TMP21]], i32 4, <16 x i1> [[TMP13]], <16 x float> undef), !alias.scope !24
+; AVX512-NEXT:    [[TMP22:%.*]] = sitofp <16 x i32> [[WIDE_LOAD]] to <16 x float>
+; AVX512-NEXT:    [[TMP23:%.*]] = sitofp <16 x i32> [[WIDE_LOAD22]] to <16 x float>
+; AVX512-NEXT:    [[TMP24:%.*]] = sitofp <16 x i32> [[WIDE_LOAD23]] to <16 x float>
+; AVX512-NEXT:    [[TMP25:%.*]] = sitofp <16 x i32> [[WIDE_LOAD24]] to <16 x float>
+; AVX512-NEXT:    [[TMP26:%.*]] = fadd <16 x float> [[WIDE_MASKED_LOAD]], [[TMP22]]
+; AVX512-NEXT:    [[TMP27:%.*]] = fadd <16 x float> [[WIDE_MASKED_LOAD25]], [[TMP23]]
+; AVX512-NEXT:    [[TMP28:%.*]] = fadd <16 x float> [[WIDE_MASKED_LOAD26]], [[TMP24]]
+; AVX512-NEXT:    [[TMP29:%.*]] = fadd <16 x float> [[WIDE_MASKED_LOAD27]], [[TMP25]]
+; AVX512-NEXT:    [[TMP30:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDEX]]
+; AVX512-NEXT:    [[TMP31:%.*]] = bitcast float* [[TMP30]] to <16 x float>*
+; AVX512-NEXT:    call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP26]], <16 x float>* [[TMP31]], i32 4, <16 x i1> [[TMP10]]), !alias.scope !26, !noalias !28
+; AVX512-NEXT:    [[TMP32:%.*]] = getelementptr inbounds float, float* [[TMP30]], i64 16
+; AVX512-NEXT:    [[TMP33:%.*]] = bitcast float* [[TMP32]] to <16 x float>*
+; AVX512-NEXT:    call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP27]], <16 x float>* [[TMP33]], i32 4, <16 x i1> [[TMP11]]), !alias.scope !26, !noalias !28
+; AVX512-NEXT:    [[TMP34:%.*]] = getelementptr inbounds float, float* [[TMP30]], i64 32
+; AVX512-NEXT:    [[TMP35:%.*]] = bitcast float* [[TMP34]] to <16 x float>*
+; AVX512-NEXT:    call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP28]], <16 x float>* [[TMP35]], i32 4, <16 x i1> [[TMP12]]), !alias.scope !26, !noalias !28
+; AVX512-NEXT:    [[TMP36:%.*]] = getelementptr inbounds float, float* [[TMP30]], i64 48
+; AVX512-NEXT:    [[TMP37:%.*]] = bitcast float* [[TMP36]] to <16 x float>*
+; AVX512-NEXT:    call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP29]], <16 x float>* [[TMP37]], i32 4, <16 x i1> [[TMP13]]), !alias.scope !26, !noalias !28
+; AVX512-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 64
+; AVX512-NEXT:    [[TMP38:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984
+; AVX512-NEXT:    br i1 [[TMP38]], label [[FOR_BODY_PREHEADER]], label [[VECTOR_BODY]], !llvm.loop !29
+; AVX512:       for.body.preheader:
+; AVX512-NEXT:    [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ 9984, [[VECTOR_BODY]] ]
+; AVX512-NEXT:    br label [[FOR_BODY:%.*]]
+; AVX512:       for.body:
+; AVX512-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT_3:%.*]], [[FOR_INC_3:%.*]] ]
+; AVX512-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]]
+; AVX512-NEXT:    [[TMP39:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; AVX512-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP39]], 100
+; AVX512-NEXT:    br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC:%.*]]
+; AVX512:       if.then:
+; AVX512-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV]]
+; AVX512-NEXT:    [[TMP40:%.*]] = load float, float* [[ARRAYIDX3]], align 4
+; AVX512-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP39]] to float
+; AVX512-NEXT:    [[ADD:%.*]] = fadd float [[TMP40]], [[CONV]]
+; AVX512-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]]
+; AVX512-NEXT:    store float [[ADD]], float* [[ARRAYIDX7]], align 4
+; AVX512-NEXT:    br label [[FOR_INC]]
+; AVX512:       for.inc:
+; AVX512-NEXT:    [[INDVARS_IV_NEXT:%.*]] = or i64 [[INDVARS_IV]], 1
+; AVX512-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT]]
+; AVX512-NEXT:    [[TMP41:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4
+; AVX512-NEXT:    [[CMP1_1:%.*]] = icmp slt i32 [[TMP41]], 100
+; AVX512-NEXT:    br i1 [[CMP1_1]], label [[IF_THEN_1:%.*]], label [[FOR_INC_1:%.*]]
+; AVX512:       for.end:
+; AVX512-NEXT:    ret void
+; AVX512:       if.then.1:
+; AVX512-NEXT:    [[ARRAYIDX3_1:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV_NEXT]]
+; AVX512-NEXT:    [[TMP42:%.*]] = load float, float* [[ARRAYIDX3_1]], align 4
+; AVX512-NEXT:    [[CONV_1:%.*]] = sitofp i32 [[TMP41]] to float
+; AVX512-NEXT:    [[ADD_1:%.*]] = fadd float [[TMP42]], [[CONV_1]]
+; AVX512-NEXT:    [[ARRAYIDX7_1:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV_NEXT]]
+; AVX512-NEXT:    store float [[ADD_1]], float* [[ARRAYIDX7_1]], align 4
+; AVX512-NEXT:    br label [[FOR_INC_1]]
+; AVX512:       for.inc.1:
+; AVX512-NEXT:    [[INDVARS_IV_NEXT_1:%.*]] = or i64 [[INDVARS_IV]], 2
+; AVX512-NEXT:    [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_1]]
+; AVX512-NEXT:    [[TMP43:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4
+; AVX512-NEXT:    [[CMP1_2:%.*]] = icmp slt i32 [[TMP43]], 100
+; AVX512-NEXT:    br i1 [[CMP1_2]], label [[IF_THEN_2:%.*]], label [[FOR_INC_2:%.*]]
+; AVX512:       if.then.2:
+; AVX512-NEXT:    [[ARRAYIDX3_2:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV_NEXT_1]]
+; AVX512-NEXT:    [[TMP44:%.*]] = load float, float* [[ARRAYIDX3_2]], align 4
+; AVX512-NEXT:    [[CONV_2:%.*]] = sitofp i32 [[TMP43]] to float
+; AVX512-NEXT:    [[ADD_2:%.*]] = fadd float [[TMP44]], [[CONV_2]]
+; AVX512-NEXT:    [[ARRAYIDX7_2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV_NEXT_1]]
+; AVX512-NEXT:    store float [[ADD_2]], float* [[ARRAYIDX7_2]], align 4
+; AVX512-NEXT:    br label [[FOR_INC_2]]
+; AVX512:       for.inc.2:
+; AVX512-NEXT:    [[INDVARS_IV_NEXT_2:%.*]] = or i64 [[INDVARS_IV]], 3
+; AVX512-NEXT:    [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_2]]
+; AVX512-NEXT:    [[TMP45:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4
+; AVX512-NEXT:    [[CMP1_3:%.*]] = icmp slt i32 [[TMP45]], 100
+; AVX512-NEXT:    br i1 [[CMP1_3]], label [[IF_THEN_3:%.*]], label [[FOR_INC_3]]
+; AVX512:       if.then.3:
+; AVX512-NEXT:    [[ARRAYIDX3_3:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV_NEXT_2]]
+; AVX512-NEXT:    [[TMP46:%.*]] = load float, float* [[ARRAYIDX3_3]], align 4
+; AVX512-NEXT:    [[CONV_3:%.*]] = sitofp i32 [[TMP45]] to float
+; AVX512-NEXT:    [[ADD_3:%.*]] = fadd float [[TMP46]], [[CONV_3]]
+; AVX512-NEXT:    [[ARRAYIDX7_3:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV_NEXT_2]]
+; AVX512-NEXT:    store float [[ADD_3]], float* [[ARRAYIDX7_3]], align 4
+; AVX512-NEXT:    br label [[FOR_INC_3]]
+; AVX512:       for.inc.3:
+; AVX512-NEXT:    [[INDVARS_IV_NEXT_3]] = add nuw nsw i64 [[INDVARS_IV]], 4
+; AVX512-NEXT:    [[EXITCOND_3:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_3]], 10000
+; AVX512-NEXT:    br i1 [[EXITCOND_3]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop !30
+;
+entry:
+  %A.addr = alloca float*, align 8
+  %B.addr = alloca float*, align 8
+  %trigger.addr = alloca i32*, align 8
+  %i = alloca i32, align 4
+  store float* %A, float** %A.addr, align 8
+  store float* %B, float** %B.addr, align 8
+  store i32* %trigger, i32** %trigger.addr, align 8
+  store i32 0, i32* %i, align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %0 = load i32, i32* %i, align 4
+  %cmp = icmp slt i32 %0, 10000
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %1 = load i32, i32* %i, align 4
+  %idxprom = sext i32 %1 to i64
+  %2 = load i32*, i32** %trigger.addr, align 8
+  %arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom
+  %3 = load i32, i32* %arrayidx, align 4
+  %cmp1 = icmp slt i32 %3, 100
+  br i1 %cmp1, label %if.then, label %if.end
+
+if.then:                                          ; preds = %for.body
+  %4 = load i32, i32* %i, align 4
+  %idxprom2 = sext i32 %4 to i64
+  %5 = load float*, float** %B.addr, align 8
+  %arrayidx3 = getelementptr inbounds float, float* %5, i64 %idxprom2
+  %6 = load float, float* %arrayidx3, align 4
+  %7 = load i32, i32* %i, align 4
+  %idxprom4 = sext i32 %7 to i64
+  %8 = load i32*, i32** %trigger.addr, align 8
+  %arrayidx5 = getelementptr inbounds i32, i32* %8, i64 %idxprom4
+  %9 = load i32, i32* %arrayidx5, align 4
+  %conv = sitofp i32 %9 to float
+  %add = fadd float %6, %conv
+  %10 = load i32, i32* %i, align 4
+  %idxprom6 = sext i32 %10 to i64
+  %11 = load float*, float** %A.addr, align 8
+  %arrayidx7 = getelementptr inbounds float, float* %11, i64 %idxprom6
+  store float %add, float* %arrayidx7, align 4
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %for.body
+  br label %for.inc
+
+for.inc:                                          ; preds = %if.end
+  %12 = load i32, i32* %i, align 4
+  %inc = add nsw i32 %12, 1
+  store i32 %inc, i32* %i, align 4
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}
+
+; The source code:
+;
+;void foo3(double *A, double *B, int *trigger) {
+;
+;  for (int i=0; i<10000; i++) {
+;    if (trigger[i] < 100) {
+;      A[i] = B[i] + trigger[i];
+;    }
+;  }
+;}
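+;
+; Note: same pattern as foo2, but with double elements: the guarded B[i] load
+; and A[i] store become @llvm.masked.load / @llvm.masked.store on <4 x double>
+; vectors for AVX1/AVX2 and <8 x double> vectors for AVX512, as the checks
+; below verify.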
+
+; Function Attrs: nounwind uwtable
+define void @foo3(double* %A, double* %B, i32* %trigger) #0 {
+; AVX1-LABEL: @foo3(
+; AVX1-NEXT:  entry:
+; AVX1-NEXT:    [[SCEVGEP:%.*]] = getelementptr double, double* [[A:%.*]], i64 10000
+; AVX1-NEXT:    [[SCEVGEP11:%.*]] = getelementptr i32, i32* [[TRIGGER:%.*]], i64 10000
+; AVX1-NEXT:    [[SCEVGEP14:%.*]] = getelementptr double, double* [[B:%.*]], i64 10000
+; AVX1-NEXT:    [[TMP0:%.*]] = bitcast i32* [[SCEVGEP11]] to double*
+; AVX1-NEXT:    [[BOUND0:%.*]] = icmp ugt double* [[TMP0]], [[A]]
+; AVX1-NEXT:    [[TMP1:%.*]] = bitcast double* [[SCEVGEP]] to i32*
+; AVX1-NEXT:    [[BOUND1:%.*]] = icmp ugt i32* [[TMP1]], [[TRIGGER]]
+; AVX1-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; AVX1-NEXT:    [[BOUND016:%.*]] = icmp ugt double* [[SCEVGEP14]], [[A]]
+; AVX1-NEXT:    [[BOUND117:%.*]] = icmp ugt double* [[SCEVGEP]], [[B]]
+; AVX1-NEXT:    [[FOUND_CONFLICT18:%.*]] = and i1 [[BOUND016]], [[BOUND117]]
+; AVX1-NEXT:    [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT18]]
+; AVX1-NEXT:    br i1 [[CONFLICT_RDX]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY:%.*]]
+; AVX1:       vector.body:
+; AVX1-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
+; AVX1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDEX]]
+; AVX1-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
+; AVX1-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4, !alias.scope !31
+; AVX1-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 4
+; AVX1-NEXT:    [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <4 x i32>*
+; AVX1-NEXT:    [[WIDE_LOAD22:%.*]] = load <4 x i32>, <4 x i32>* [[TMP5]], align 4, !alias.scope !31
+; AVX1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 8
+; AVX1-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
+; AVX1-NEXT:    [[WIDE_LOAD23:%.*]] = load <4 x i32>, <4 x i32>* [[TMP7]], align 4, !alias.scope !31
+; AVX1-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 12
+; AVX1-NEXT:    [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>*
+; AVX1-NEXT:    [[WIDE_LOAD24:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4, !alias.scope !31
+; AVX1-NEXT:    [[TMP10:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100>
+; AVX1-NEXT:    [[TMP11:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD22]], <i32 100, i32 100, i32 100, i32 100>
+; AVX1-NEXT:    [[TMP12:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD23]], <i32 100, i32 100, i32 100, i32 100>
+; AVX1-NEXT:    [[TMP13:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD24]], <i32 100, i32 100, i32 100, i32 100>
+; AVX1-NEXT:    [[TMP14:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[INDEX]]
+; AVX1-NEXT:    [[TMP15:%.*]] = bitcast double* [[TMP14]] to <4 x double>*
+; AVX1-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP15]], i32 8, <4 x i1> [[TMP10]], <4 x double> undef), !alias.scope !34
+; AVX1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds double, double* [[TMP14]], i64 4
+; AVX1-NEXT:    [[TMP17:%.*]] = bitcast double* [[TMP16]] to <4 x double>*
+; AVX1-NEXT:    [[WIDE_MASKED_LOAD25:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* nonnull [[TMP17]], i32 8, <4 x i1> [[TMP11]], <4 x double> undef), !alias.scope !34
+; AVX1-NEXT:    [[TMP18:%.*]] = getelementptr inbounds double, double* [[TMP14]], i64 8
+; AVX1-NEXT:    [[TMP19:%.*]] = bitcast double* [[TMP18]] to <4 x double>*
+; AVX1-NEXT:    [[WIDE_MASKED_LOAD26:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* nonnull [[TMP19]], i32 8, <4 x i1> [[TMP12]], <4 x double> undef), !alias.scope !34
+; AVX1-NEXT:    [[TMP20:%.*]] = getelementptr inbounds double, double* [[TMP14]], i64 12
+; AVX1-NEXT:    [[TMP21:%.*]] = bitcast double* [[TMP20]] to <4 x double>*
+; AVX1-NEXT:    [[WIDE_MASKED_LOAD27:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* nonnull [[TMP21]], i32 8, <4 x i1> [[TMP13]], <4 x double> undef), !alias.scope !34
+; AVX1-NEXT:    [[TMP22:%.*]] = sitofp <4 x i32> [[WIDE_LOAD]] to <4 x double>
+; AVX1-NEXT:    [[TMP23:%.*]] = sitofp <4 x i32> [[WIDE_LOAD22]] to <4 x double>
+; AVX1-NEXT:    [[TMP24:%.*]] = sitofp <4 x i32> [[WIDE_LOAD23]] to <4 x double>
+; AVX1-NEXT:    [[TMP25:%.*]] = sitofp <4 x i32> [[WIDE_LOAD24]] to <4 x double>
+; AVX1-NEXT:    [[TMP26:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD]], [[TMP22]]
+; AVX1-NEXT:    [[TMP27:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD25]], [[TMP23]]
+; AVX1-NEXT:    [[TMP28:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD26]], [[TMP24]]
+; AVX1-NEXT:    [[TMP29:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD27]], [[TMP25]]
+; AVX1-NEXT:    [[TMP30:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDEX]]
+; AVX1-NEXT:    [[TMP31:%.*]] = bitcast double* [[TMP30]] to <4 x double>*
+; AVX1-NEXT:    call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP26]], <4 x double>* [[TMP31]], i32 8, <4 x i1> [[TMP10]]), !alias.scope !36, !noalias !38
+; AVX1-NEXT:    [[TMP32:%.*]] = getelementptr inbounds double, double* [[TMP30]], i64 4
+; AVX1-NEXT:    [[TMP33:%.*]] = bitcast double* [[TMP32]] to <4 x double>*
+; AVX1-NEXT:    call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP27]], <4 x double>* [[TMP33]], i32 8, <4 x i1> [[TMP11]]), !alias.scope !36, !noalias !38
+; AVX1-NEXT:    [[TMP34:%.*]] = getelementptr inbounds double, double* [[TMP30]], i64 8
+; AVX1-NEXT:    [[TMP35:%.*]] = bitcast double* [[TMP34]] to <4 x double>*
+; AVX1-NEXT:    call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP28]], <4 x double>* [[TMP35]], i32 8, <4 x i1> [[TMP12]]), !alias.scope !36, !noalias !38
+; AVX1-NEXT:    [[TMP36:%.*]] = getelementptr inbounds double, double* [[TMP30]], i64 12
+; AVX1-NEXT:    [[TMP37:%.*]] = bitcast double* [[TMP36]] to <4 x double>*
+; AVX1-NEXT:    call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP29]], <4 x double>* [[TMP37]], i32 8, <4 x i1> [[TMP13]]), !alias.scope !36, !noalias !38
+; AVX1-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 16
+; AVX1-NEXT:    [[TMP38:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000
+; AVX1-NEXT:    br i1 [[TMP38]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop !39
+; AVX1:       for.body:
+; AVX1-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_1:%.*]], [[FOR_INC_1:%.*]] ], [ 0, [[ENTRY]] ]
+; AVX1-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]]
+; AVX1-NEXT:    [[TMP39:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; AVX1-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP39]], 100
+; AVX1-NEXT:    br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC:%.*]]
+; AVX1:       if.then:
+; AVX1-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[INDVARS_IV]]
+; AVX1-NEXT:    [[TMP40:%.*]] = load double, double* [[ARRAYIDX3]], align 8
+; AVX1-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP39]] to double
+; AVX1-NEXT:    [[ADD:%.*]] = fadd double [[TMP40]], [[CONV]]
+; AVX1-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDVARS_IV]]
+; AVX1-NEXT:    store double [[ADD]], double* [[ARRAYIDX7]], align 8
+; AVX1-NEXT:    br label [[FOR_INC]]
+; AVX1:       for.inc:
+; AVX1-NEXT:    [[INDVARS_IV_NEXT:%.*]] = or i64 [[INDVARS_IV]], 1
+; AVX1-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT]]
+; AVX1-NEXT:    [[TMP41:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4
+; AVX1-NEXT:    [[CMP1_1:%.*]] = icmp slt i32 [[TMP41]], 100
+; AVX1-NEXT:    br i1 [[CMP1_1]], label [[IF_THEN_1:%.*]], label [[FOR_INC_1]]
+; AVX1:       for.end:
+; AVX1-NEXT:    ret void
+; AVX1:       if.then.1:
+; AVX1-NEXT:    [[ARRAYIDX3_1:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[INDVARS_IV_NEXT]]
+; AVX1-NEXT:    [[TMP42:%.*]] = load double, double* [[ARRAYIDX3_1]], align 8
+; AVX1-NEXT:    [[CONV_1:%.*]] = sitofp i32 [[TMP41]] to double
+; AVX1-NEXT:    [[ADD_1:%.*]] = fadd double [[TMP42]], [[CONV_1]]
+; AVX1-NEXT:    [[ARRAYIDX7_1:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDVARS_IV_NEXT]]
+; AVX1-NEXT:    store double [[ADD_1]], double* [[ARRAYIDX7_1]], align 8
+; AVX1-NEXT:    br label [[FOR_INC_1]]
+; AVX1:       for.inc.1:
+; AVX1-NEXT:    [[INDVARS_IV_NEXT_1]] = add nuw nsw i64 [[INDVARS_IV]], 2
+; AVX1-NEXT:    [[EXITCOND_1:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_1]], 10000
+; AVX1-NEXT:    br i1 [[EXITCOND_1]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !40
+;
+; AVX2-LABEL: @foo3(
+; AVX2-NEXT:  entry:
+; AVX2-NEXT:    [[SCEVGEP:%.*]] = getelementptr double, double* [[A:%.*]], i64 10000
+; AVX2-NEXT:    [[SCEVGEP11:%.*]] = getelementptr i32, i32* [[TRIGGER:%.*]], i64 10000
+; AVX2-NEXT:    [[SCEVGEP14:%.*]] = getelementptr double, double* [[B:%.*]], i64 10000
+; AVX2-NEXT:    [[TMP0:%.*]] = bitcast i32* [[SCEVGEP11]] to double*
+; AVX2-NEXT:    [[BOUND0:%.*]] = icmp ugt double* [[TMP0]], [[A]]
+; AVX2-NEXT:    [[TMP1:%.*]] = bitcast double* [[SCEVGEP]] to i32*
+; AVX2-NEXT:    [[BOUND1:%.*]] = icmp ugt i32* [[TMP1]], [[TRIGGER]]
+; AVX2-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; AVX2-NEXT:    [[BOUND016:%.*]] = icmp ugt double* [[SCEVGEP14]], [[A]]
+; AVX2-NEXT:    [[BOUND117:%.*]] = icmp ugt double* [[SCEVGEP]], [[B]]
+; AVX2-NEXT:    [[FOUND_CONFLICT18:%.*]] = and i1 [[BOUND016]], [[BOUND117]]
+; AVX2-NEXT:    [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT18]]
+; AVX2-NEXT:    br i1 [[CONFLICT_RDX]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY:%.*]]
+; AVX2:       vector.body:
+; AVX2-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
+; AVX2-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDEX]]
+; AVX2-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
+; AVX2-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4, !alias.scope !31
+; AVX2-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 4
+; AVX2-NEXT:    [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <4 x i32>*
+; AVX2-NEXT:    [[WIDE_LOAD22:%.*]] = load <4 x i32>, <4 x i32>* [[TMP5]], align 4, !alias.scope !31
+; AVX2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 8
+; AVX2-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
+; AVX2-NEXT:    [[WIDE_LOAD23:%.*]] = load <4 x i32>, <4 x i32>* [[TMP7]], align 4, !alias.scope !31
+; AVX2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 12
+; AVX2-NEXT:    [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>*
+; AVX2-NEXT:    [[WIDE_LOAD24:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4, !alias.scope !31
+; AVX2-NEXT:    [[TMP10:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100>
+; AVX2-NEXT:    [[TMP11:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD22]], <i32 100, i32 100, i32 100, i32 100>
+; AVX2-NEXT:    [[TMP12:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD23]], <i32 100, i32 100, i32 100, i32 100>
+; AVX2-NEXT:    [[TMP13:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD24]], <i32 100, i32 100, i32 100, i32 100>
+; AVX2-NEXT:    [[TMP14:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[INDEX]]
+; AVX2-NEXT:    [[TMP15:%.*]] = bitcast double* [[TMP14]] to <4 x double>*
+; AVX2-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP15]], i32 8, <4 x i1> [[TMP10]], <4 x double> undef), !alias.scope !34
+; AVX2-NEXT:    [[TMP16:%.*]] = getelementptr inbounds double, double* [[TMP14]], i64 4
+; AVX2-NEXT:    [[TMP17:%.*]] = bitcast double* [[TMP16]] to <4 x double>*
+; AVX2-NEXT:    [[WIDE_MASKED_LOAD25:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* nonnull [[TMP17]], i32 8, <4 x i1> [[TMP11]], <4 x double> undef), !alias.scope !34
+; AVX2-NEXT:    [[TMP18:%.*]] = getelementptr inbounds double, double* [[TMP14]], i64 8
+; AVX2-NEXT:    [[TMP19:%.*]] = bitcast double* [[TMP18]] to <4 x double>*
+; AVX2-NEXT:    [[WIDE_MASKED_LOAD26:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* nonnull [[TMP19]], i32 8, <4 x i1> [[TMP12]], <4 x double> undef), !alias.scope !34
+; AVX2-NEXT:    [[TMP20:%.*]] = getelementptr inbounds double, double* [[TMP14]], i64 12
+; AVX2-NEXT:    [[TMP21:%.*]] = bitcast double* [[TMP20]] to <4 x double>*
+; AVX2-NEXT:    [[WIDE_MASKED_LOAD27:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* nonnull [[TMP21]], i32 8, <4 x i1> [[TMP13]], <4 x double> undef), !alias.scope !34
+; AVX2-NEXT:    [[TMP22:%.*]] = sitofp <4 x i32> [[WIDE_LOAD]] to <4 x double>
+; AVX2-NEXT:    [[TMP23:%.*]] = sitofp <4 x i32> [[WIDE_LOAD22]] to <4 x double>
+; AVX2-NEXT:    [[TMP24:%.*]] = sitofp <4 x i32> [[WIDE_LOAD23]] to <4 x double>
+; AVX2-NEXT:    [[TMP25:%.*]] = sitofp <4 x i32> [[WIDE_LOAD24]] to <4 x double>
+; AVX2-NEXT:    [[TMP26:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD]], [[TMP22]]
+; AVX2-NEXT:    [[TMP27:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD25]], [[TMP23]]
+; AVX2-NEXT:    [[TMP28:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD26]], [[TMP24]]
+; AVX2-NEXT:    [[TMP29:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD27]], [[TMP25]]
+; AVX2-NEXT:    [[TMP30:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDEX]]
+; AVX2-NEXT:    [[TMP31:%.*]] = bitcast double* [[TMP30]] to <4 x double>*
+; AVX2-NEXT:    call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP26]], <4 x double>* [[TMP31]], i32 8, <4 x i1> [[TMP10]]), !alias.scope !36, !noalias !38
+; AVX2-NEXT:    [[TMP32:%.*]] = getelementptr inbounds double, double* [[TMP30]], i64 4
+; AVX2-NEXT:    [[TMP33:%.*]] = bitcast double* [[TMP32]] to <4 x double>*
+; AVX2-NEXT:    call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP27]], <4 x double>* [[TMP33]], i32 8, <4 x i1> [[TMP11]]), !alias.scope !36, !noalias !38
+; AVX2-NEXT:    [[TMP34:%.*]] = getelementptr inbounds double, double* [[TMP30]], i64 8
+; AVX2-NEXT:    [[TMP35:%.*]] = bitcast double* [[TMP34]] to <4 x double>*
+; AVX2-NEXT:    call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP28]], <4 x double>* [[TMP35]], i32 8, <4 x i1> [[TMP12]]), !alias.scope !36, !noalias !38
+; AVX2-NEXT:    [[TMP36:%.*]] = getelementptr inbounds double, double* [[TMP30]], i64 12
+; AVX2-NEXT:    [[TMP37:%.*]] = bitcast double* [[TMP36]] to <4 x double>*
+; AVX2-NEXT:    call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP29]], <4 x double>* [[TMP37]], i32 8, <4 x i1> [[TMP13]]), !alias.scope !36, !noalias !38
+; AVX2-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 16
+; AVX2-NEXT:    [[TMP38:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000
+; AVX2-NEXT:    br i1 [[TMP38]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop !39
+; AVX2:       for.body:
+; AVX2-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_3:%.*]], [[FOR_INC_3:%.*]] ], [ 0, [[ENTRY]] ]
+; AVX2-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]]
+; AVX2-NEXT:    [[TMP39:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; AVX2-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP39]], 100
+; AVX2-NEXT:    br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC:%.*]]
+; AVX2:       if.then:
+; AVX2-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[INDVARS_IV]]
+; AVX2-NEXT:    [[TMP40:%.*]] = load double, double* [[ARRAYIDX3]], align 8
+; AVX2-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP39]] to double
+; AVX2-NEXT:    [[ADD:%.*]] = fadd double [[TMP40]], [[CONV]]
+; AVX2-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDVARS_IV]]
+; AVX2-NEXT:    store double [[ADD]], double* [[ARRAYIDX7]], align 8
+; AVX2-NEXT:    br label [[FOR_INC]]
+; AVX2:       for.inc:
+; AVX2-NEXT:    [[INDVARS_IV_NEXT:%.*]] = or i64 [[INDVARS_IV]], 1
+; AVX2-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT]]
+; AVX2-NEXT:    [[TMP41:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4
+; AVX2-NEXT:    [[CMP1_1:%.*]] = icmp slt i32 [[TMP41]], 100
+; AVX2-NEXT:    br i1 [[CMP1_1]], label [[IF_THEN_1:%.*]], label [[FOR_INC_1:%.*]]
+; AVX2:       for.end:
+; AVX2-NEXT:    ret void
+; AVX2:       if.then.1:
+; AVX2-NEXT:    [[ARRAYIDX3_1:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[INDVARS_IV_NEXT]]
+; AVX2-NEXT:    [[TMP42:%.*]] = load double, double* [[ARRAYIDX3_1]], align 8
+; AVX2-NEXT:    [[CONV_1:%.*]] = sitofp i32 [[TMP41]] to double
+; AVX2-NEXT:    [[ADD_1:%.*]] = fadd double [[TMP42]], [[CONV_1]]
+; AVX2-NEXT:    [[ARRAYIDX7_1:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDVARS_IV_NEXT]]
+; AVX2-NEXT:    store double [[ADD_1]], double* [[ARRAYIDX7_1]], align 8
+; AVX2-NEXT:    br label [[FOR_INC_1]]
+; AVX2:       for.inc.1:
+; AVX2-NEXT:    [[INDVARS_IV_NEXT_1:%.*]] = or i64 [[INDVARS_IV]], 2
+; AVX2-NEXT:    [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_1]]
+; AVX2-NEXT:    [[TMP43:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4
+; AVX2-NEXT:    [[CMP1_2:%.*]] = icmp slt i32 [[TMP43]], 100
+; AVX2-NEXT:    br i1 [[CMP1_2]], label [[IF_THEN_2:%.*]], label [[FOR_INC_2:%.*]]
+; AVX2:       if.then.2:
+; AVX2-NEXT:    [[ARRAYIDX3_2:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[INDVARS_IV_NEXT_1]]
+; AVX2-NEXT:    [[TMP44:%.*]] = load double, double* [[ARRAYIDX3_2]], align 8
+; AVX2-NEXT:    [[CONV_2:%.*]] = sitofp i32 [[TMP43]] to double
+; AVX2-NEXT:    [[ADD_2:%.*]] = fadd double [[TMP44]], [[CONV_2]]
+; AVX2-NEXT:    [[ARRAYIDX7_2:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDVARS_IV_NEXT_1]]
+; AVX2-NEXT:    store double [[ADD_2]], double* [[ARRAYIDX7_2]], align 8
+; AVX2-NEXT:    br label [[FOR_INC_2]]
+; AVX2:       for.inc.2:
+; AVX2-NEXT:    [[INDVARS_IV_NEXT_2:%.*]] = or i64 [[INDVARS_IV]], 3
+; AVX2-NEXT:    [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_2]]
+; AVX2-NEXT:    [[TMP45:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4
+; AVX2-NEXT:    [[CMP1_3:%.*]] = icmp slt i32 [[TMP45]], 100
+; AVX2-NEXT:    br i1 [[CMP1_3]], label [[IF_THEN_3:%.*]], label [[FOR_INC_3]]
+; AVX2:       if.then.3:
+; AVX2-NEXT:    [[ARRAYIDX3_3:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[INDVARS_IV_NEXT_2]]
+; AVX2-NEXT:    [[TMP46:%.*]] = load double, double* [[ARRAYIDX3_3]], align 8
+; AVX2-NEXT:    [[CONV_3:%.*]] = sitofp i32 [[TMP45]] to double
+; AVX2-NEXT:    [[ADD_3:%.*]] = fadd double [[TMP46]], [[CONV_3]]
+; AVX2-NEXT:    [[ARRAYIDX7_3:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDVARS_IV_NEXT_2]]
+; AVX2-NEXT:    store double [[ADD_3]], double* [[ARRAYIDX7_3]], align 8
+; AVX2-NEXT:    br label [[FOR_INC_3]]
+; AVX2:       for.inc.3:
+; AVX2-NEXT:    [[INDVARS_IV_NEXT_3]] = add nuw nsw i64 [[INDVARS_IV]], 4
+; AVX2-NEXT:    [[EXITCOND_3:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_3]], 10000
+; AVX2-NEXT:    br i1 [[EXITCOND_3]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !40
+;
+; AVX512-LABEL: @foo3(
+; AVX512-NEXT:  entry:
+; AVX512-NEXT:    [[SCEVGEP:%.*]] = getelementptr double, double* [[A:%.*]], i64 10000
+; AVX512-NEXT:    [[SCEVGEP11:%.*]] = getelementptr i32, i32* [[TRIGGER:%.*]], i64 10000
+; AVX512-NEXT:    [[SCEVGEP14:%.*]] = getelementptr double, double* [[B:%.*]], i64 10000
+; AVX512-NEXT:    [[TMP0:%.*]] = bitcast i32* [[SCEVGEP11]] to double*
+; AVX512-NEXT:    [[BOUND0:%.*]] = icmp ugt double* [[TMP0]], [[A]]
+; AVX512-NEXT:    [[TMP1:%.*]] = bitcast double* [[SCEVGEP]] to i32*
+; AVX512-NEXT:    [[BOUND1:%.*]] = icmp ugt i32* [[TMP1]], [[TRIGGER]]
+; AVX512-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; AVX512-NEXT:    [[BOUND016:%.*]] = icmp ugt double* [[SCEVGEP14]], [[A]]
+; AVX512-NEXT:    [[BOUND117:%.*]] = icmp ugt double* [[SCEVGEP]], [[B]]
+; AVX512-NEXT:    [[FOUND_CONFLICT18:%.*]] = and i1 [[BOUND016]], [[BOUND117]]
+; AVX512-NEXT:    [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT18]]
+; AVX512-NEXT:    br i1 [[CONFLICT_RDX]], label [[FOR_BODY_PREHEADER:%.*]], label [[VECTOR_BODY:%.*]]
+; AVX512:       vector.body:
+; AVX512-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
+; AVX512-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDEX]]
+; AVX512-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <8 x i32>*
+; AVX512-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP3]], align 4, !alias.scope !31
+; AVX512-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 8
+; AVX512-NEXT:    [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <8 x i32>*
+; AVX512-NEXT:    [[WIDE_LOAD22:%.*]] = load <8 x i32>, <8 x i32>* [[TMP5]], align 4, !alias.scope !31
+; AVX512-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 16
+; AVX512-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <8 x i32>*
+; AVX512-NEXT:    [[WIDE_LOAD23:%.*]] = load <8 x i32>, <8 x i32>* [[TMP7]], align 4, !alias.scope !31
+; AVX512-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 24
+; AVX512-NEXT:    [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <8 x i32>*
+; AVX512-NEXT:    [[WIDE_LOAD24:%.*]] = load <8 x i32>, <8 x i32>* [[TMP9]], align 4, !alias.scope !31
+; AVX512-NEXT:    [[TMP10:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX512-NEXT:    [[TMP11:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD22]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX512-NEXT:    [[TMP12:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD23]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX512-NEXT:    [[TMP13:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD24]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX512-NEXT:    [[TMP14:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[INDEX]]
+; AVX512-NEXT:    [[TMP15:%.*]] = bitcast double* [[TMP14]] to <8 x double>*
+; AVX512-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP15]], i32 8, <8 x i1> [[TMP10]], <8 x double> undef), !alias.scope !34
+; AVX512-NEXT:    [[TMP16:%.*]] = getelementptr inbounds double, double* [[TMP14]], i64 8
+; AVX512-NEXT:    [[TMP17:%.*]] = bitcast double* [[TMP16]] to <8 x double>*
+; AVX512-NEXT:    [[WIDE_MASKED_LOAD25:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* nonnull [[TMP17]], i32 8, <8 x i1> [[TMP11]], <8 x double> undef), !alias.scope !34
+; AVX512-NEXT:    [[TMP18:%.*]] = getelementptr inbounds double, double* [[TMP14]], i64 16
+; AVX512-NEXT:    [[TMP19:%.*]] = bitcast double* [[TMP18]] to <8 x double>*
+; AVX512-NEXT:    [[WIDE_MASKED_LOAD26:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* nonnull [[TMP19]], i32 8, <8 x i1> [[TMP12]], <8 x double> undef), !alias.scope !34
+; AVX512-NEXT:    [[TMP20:%.*]] = getelementptr inbounds double, double* [[TMP14]], i64 24
+; AVX512-NEXT:    [[TMP21:%.*]] = bitcast double* [[TMP20]] to <8 x double>*
+; AVX512-NEXT:    [[WIDE_MASKED_LOAD27:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* nonnull [[TMP21]], i32 8, <8 x i1> [[TMP13]], <8 x double> undef), !alias.scope !34
+; AVX512-NEXT:    [[TMP22:%.*]] = sitofp <8 x i32> [[WIDE_LOAD]] to <8 x double>
+; AVX512-NEXT:    [[TMP23:%.*]] = sitofp <8 x i32> [[WIDE_LOAD22]] to <8 x double>
+; AVX512-NEXT:    [[TMP24:%.*]] = sitofp <8 x i32> [[WIDE_LOAD23]] to <8 x double>
+; AVX512-NEXT:    [[TMP25:%.*]] = sitofp <8 x i32> [[WIDE_LOAD24]] to <8 x double>
+; AVX512-NEXT:    [[TMP26:%.*]] = fadd <8 x double> [[WIDE_MASKED_LOAD]], [[TMP22]]
+; AVX512-NEXT:    [[TMP27:%.*]] = fadd <8 x double> [[WIDE_MASKED_LOAD25]], [[TMP23]]
+; AVX512-NEXT:    [[TMP28:%.*]] = fadd <8 x double> [[WIDE_MASKED_LOAD26]], [[TMP24]]
+; AVX512-NEXT:    [[TMP29:%.*]] = fadd <8 x double> [[WIDE_MASKED_LOAD27]], [[TMP25]]
+; AVX512-NEXT:    [[TMP30:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDEX]]
+; AVX512-NEXT:    [[TMP31:%.*]] = bitcast double* [[TMP30]] to <8 x double>*
+; AVX512-NEXT:    call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[TMP26]], <8 x double>* [[TMP31]], i32 8, <8 x i1> [[TMP10]]), !alias.scope !36, !noalias !38
+; AVX512-NEXT:    [[TMP32:%.*]] = getelementptr inbounds double, double* [[TMP30]], i64 8
+; AVX512-NEXT:    [[TMP33:%.*]] = bitcast double* [[TMP32]] to <8 x double>*
+; AVX512-NEXT:    call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[TMP27]], <8 x double>* [[TMP33]], i32 8, <8 x i1> [[TMP11]]), !alias.scope !36, !noalias !38
+; AVX512-NEXT:    [[TMP34:%.*]] = getelementptr inbounds double, double* [[TMP30]], i64 16
+; AVX512-NEXT:    [[TMP35:%.*]] = bitcast double* [[TMP34]] to <8 x double>*
+; AVX512-NEXT:    call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[TMP28]], <8 x double>* [[TMP35]], i32 8, <8 x i1> [[TMP12]]), !alias.scope !36, !noalias !38
+; AVX512-NEXT:    [[TMP36:%.*]] = getelementptr inbounds double, double* [[TMP30]], i64 24
+; AVX512-NEXT:    [[TMP37:%.*]] = bitcast double* [[TMP36]] to <8 x double>*
+; AVX512-NEXT:    call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[TMP29]], <8 x double>* [[TMP37]], i32 8, <8 x i1> [[TMP13]]), !alias.scope !36, !noalias !38
+; AVX512-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 32
+; AVX512-NEXT:    [[TMP38:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984
+; AVX512-NEXT:    br i1 [[TMP38]], label [[FOR_BODY_PREHEADER]], label [[VECTOR_BODY]], !llvm.loop !39
+; AVX512:       for.body.preheader:
+; AVX512-NEXT:    [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ 9984, [[VECTOR_BODY]] ]
+; AVX512-NEXT:    br label [[FOR_BODY:%.*]]
+; AVX512:       for.body:
+; AVX512-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT_3:%.*]], [[FOR_INC_3:%.*]] ]
+; AVX512-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]]
+; AVX512-NEXT:    [[TMP39:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; AVX512-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP39]], 100
+; AVX512-NEXT:    br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC:%.*]]
+; AVX512:       if.then:
+; AVX512-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[INDVARS_IV]]
+; AVX512-NEXT:    [[TMP40:%.*]] = load double, double* [[ARRAYIDX3]], align 8
+; AVX512-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP39]] to double
+; AVX512-NEXT:    [[ADD:%.*]] = fadd double [[TMP40]], [[CONV]]
+; AVX512-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDVARS_IV]]
+; AVX512-NEXT:    store double [[ADD]], double* [[ARRAYIDX7]], align 8
+; AVX512-NEXT:    br label [[FOR_INC]]
+; AVX512:       for.inc:
+; AVX512-NEXT:    [[INDVARS_IV_NEXT:%.*]] = or i64 [[INDVARS_IV]], 1
+; AVX512-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT]]
+; AVX512-NEXT:    [[TMP41:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4
+; AVX512-NEXT:    [[CMP1_1:%.*]] = icmp slt i32 [[TMP41]], 100
+; AVX512-NEXT:    br i1 [[CMP1_1]], label [[IF_THEN_1:%.*]], label [[FOR_INC_1:%.*]]
+; AVX512:       for.end:
+; AVX512-NEXT:    ret void
+; AVX512:       if.then.1:
+; AVX512-NEXT:    [[ARRAYIDX3_1:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[INDVARS_IV_NEXT]]
+; AVX512-NEXT:    [[TMP42:%.*]] = load double, double* [[ARRAYIDX3_1]], align 8
+; AVX512-NEXT:    [[CONV_1:%.*]] = sitofp i32 [[TMP41]] to double
+; AVX512-NEXT:    [[ADD_1:%.*]] = fadd double [[TMP42]], [[CONV_1]]
+; AVX512-NEXT:    [[ARRAYIDX7_1:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDVARS_IV_NEXT]]
+; AVX512-NEXT:    store double [[ADD_1]], double* [[ARRAYIDX7_1]], align 8
+; AVX512-NEXT:    br label [[FOR_INC_1]]
+; AVX512:       for.inc.1:
+; AVX512-NEXT:    [[INDVARS_IV_NEXT_1:%.*]] = or i64 [[INDVARS_IV]], 2
+; AVX512-NEXT:    [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_1]]
+; AVX512-NEXT:    [[TMP43:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4
+; AVX512-NEXT:    [[CMP1_2:%.*]] = icmp slt i32 [[TMP43]], 100
+; AVX512-NEXT:    br i1 [[CMP1_2]], label [[IF_THEN_2:%.*]], label [[FOR_INC_2:%.*]]
+; AVX512:       if.then.2:
+; AVX512-NEXT:    [[ARRAYIDX3_2:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[INDVARS_IV_NEXT_1]]
+; AVX512-NEXT:    [[TMP44:%.*]] = load double, double* [[ARRAYIDX3_2]], align 8
+; AVX512-NEXT:    [[CONV_2:%.*]] = sitofp i32 [[TMP43]] to double
+; AVX512-NEXT:    [[ADD_2:%.*]] = fadd double [[TMP44]], [[CONV_2]]
+; AVX512-NEXT:    [[ARRAYIDX7_2:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDVARS_IV_NEXT_1]]
+; AVX512-NEXT:    store double [[ADD_2]], double* [[ARRAYIDX7_2]], align 8
+; AVX512-NEXT:    br label [[FOR_INC_2]]
+; AVX512:       for.inc.2:
+; AVX512-NEXT:    [[INDVARS_IV_NEXT_2:%.*]] = or i64 [[INDVARS_IV]], 3
+; AVX512-NEXT:    [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_2]]
+; AVX512-NEXT:    [[TMP45:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4
+; AVX512-NEXT:    [[CMP1_3:%.*]] = icmp slt i32 [[TMP45]], 100
+; AVX512-NEXT:    br i1 [[CMP1_3]], label [[IF_THEN_3:%.*]], label [[FOR_INC_3]]
+; AVX512:       if.then.3:
+; AVX512-NEXT:    [[ARRAYIDX3_3:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[INDVARS_IV_NEXT_2]]
+; AVX512-NEXT:    [[TMP46:%.*]] = load double, double* [[ARRAYIDX3_3]], align 8
+; AVX512-NEXT:    [[CONV_3:%.*]] = sitofp i32 [[TMP45]] to double
+; AVX512-NEXT:    [[ADD_3:%.*]] = fadd double [[TMP46]], [[CONV_3]]
+; AVX512-NEXT:    [[ARRAYIDX7_3:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDVARS_IV_NEXT_2]]
+; AVX512-NEXT:    store double [[ADD_3]], double* [[ARRAYIDX7_3]], align 8
+; AVX512-NEXT:    br label [[FOR_INC_3]]
+; AVX512:       for.inc.3:
+; AVX512-NEXT:    [[INDVARS_IV_NEXT_3]] = add nuw nsw i64 [[INDVARS_IV]], 4
+; AVX512-NEXT:    [[EXITCOND_3:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_3]], 10000
+; AVX512-NEXT:    br i1 [[EXITCOND_3]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop !40
+;
+entry:
+  %A.addr = alloca double*, align 8
+  %B.addr = alloca double*, align 8
+  %trigger.addr = alloca i32*, align 8
+  %i = alloca i32, align 4
+  store double* %A, double** %A.addr, align 8
+  store double* %B, double** %B.addr, align 8
+  store i32* %trigger, i32** %trigger.addr, align 8
+  store i32 0, i32* %i, align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %0 = load i32, i32* %i, align 4
+  %cmp = icmp slt i32 %0, 10000
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %1 = load i32, i32* %i, align 4
+  %idxprom = sext i32 %1 to i64
+  %2 = load i32*, i32** %trigger.addr, align 8
+  %arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom
+  %3 = load i32, i32* %arrayidx, align 4
+  %cmp1 = icmp slt i32 %3, 100
+  br i1 %cmp1, label %if.then, label %if.end
+
+if.then:                                          ; preds = %for.body
+  %4 = load i32, i32* %i, align 4
+  %idxprom2 = sext i32 %4 to i64
+  %5 = load double*, double** %B.addr, align 8
+  %arrayidx3 = getelementptr inbounds double, double* %5, i64 %idxprom2
+  %6 = load double, double* %arrayidx3, align 8
+  %7 = load i32, i32* %i, align 4
+  %idxprom4 = sext i32 %7 to i64
+  %8 = load i32*, i32** %trigger.addr, align 8
+  %arrayidx5 = getelementptr inbounds i32, i32* %8, i64 %idxprom4
+  %9 = load i32, i32* %arrayidx5, align 4
+  %conv = sitofp i32 %9 to double
+  %add = fadd double %6, %conv
+  %10 = load i32, i32* %i, align 4
+  %idxprom6 = sext i32 %10 to i64
+  %11 = load double*, double** %A.addr, align 8
+  %arrayidx7 = getelementptr inbounds double, double* %11, i64 %idxprom6
+  store double %add, double* %arrayidx7, align 8
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %for.body
+  br label %for.inc
+
+for.inc:                                          ; preds = %if.end
+  %12 = load i32, i32* %i, align 4
+  %inc = add nsw i32 %12, 1
+  store i32 %inc, i32* %i, align 4
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}
+
+; The source code:
+;
+;void foo4(double *A, double *B, int *trigger) {
+;
+;  for (int i=0; i<10000; i += 16) {
+;    if (trigger[i] < 100) {
+;          A[i] = B[i*2] + trigger[i]; << non-consecutive access
+;    }
+;  }
+;}
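+;
+; Because the outer loop strides i by 16 and B is indexed with i*2, none of
+; the accesses are unit-stride: AVX1 and AVX2 leave the loop scalar below,
+; while the AVX512 run vectorizes it using masked gathers for the trigger and
+; B loads and a masked scatter for the A store.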
+
+; Function Attrs: nounwind uwtable
+define void @foo4(double* %A, double* %B, i32* %trigger)  {
+; AVX1-LABEL: @foo4(
+; AVX1-NEXT:  entry:
+; AVX1-NEXT:    br label [[FOR_BODY:%.*]]
+; AVX1:       for.body:
+; AVX1-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT_1:%.*]], [[FOR_INC_1:%.*]] ]
+; AVX1-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], i64 [[INDVARS_IV]]
+; AVX1-NEXT:    [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; AVX1-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP0]], 100
+; AVX1-NEXT:    br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC:%.*]]
+; AVX1:       if.then:
+; AVX1-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 1
+; AVX1-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[B:%.*]], i64 [[TMP1]]
+; AVX1-NEXT:    [[TMP2:%.*]] = load double, double* [[ARRAYIDX3]], align 8
+; AVX1-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP0]] to double
+; AVX1-NEXT:    [[ADD:%.*]] = fadd double [[TMP2]], [[CONV]]
+; AVX1-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 [[INDVARS_IV]]
+; AVX1-NEXT:    store double [[ADD]], double* [[ARRAYIDX7]], align 8
+; AVX1-NEXT:    br label [[FOR_INC]]
+; AVX1:       for.inc:
+; AVX1-NEXT:    [[INDVARS_IV_NEXT:%.*]] = or i64 [[INDVARS_IV]], 16
+; AVX1-NEXT:    [[CMP:%.*]] = icmp ult i64 [[INDVARS_IV_NEXT]], 10000
+; AVX1-NEXT:    br i1 [[CMP]], label [[FOR_BODY_1:%.*]], label [[FOR_END:%.*]]
+; AVX1:       for.end:
+; AVX1-NEXT:    ret void
+; AVX1:       for.body.1:
+; AVX1-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT]]
+; AVX1-NEXT:    [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4
+; AVX1-NEXT:    [[CMP1_1:%.*]] = icmp slt i32 [[TMP3]], 100
+; AVX1-NEXT:    br i1 [[CMP1_1]], label [[IF_THEN_1:%.*]], label [[FOR_INC_1]]
+; AVX1:       if.then.1:
+; AVX1-NEXT:    [[TMP4:%.*]] = shl nuw nsw i64 [[INDVARS_IV_NEXT]], 1
+; AVX1-NEXT:    [[ARRAYIDX3_1:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[TMP4]]
+; AVX1-NEXT:    [[TMP5:%.*]] = load double, double* [[ARRAYIDX3_1]], align 8
+; AVX1-NEXT:    [[CONV_1:%.*]] = sitofp i32 [[TMP3]] to double
+; AVX1-NEXT:    [[ADD_1:%.*]] = fadd double [[TMP5]], [[CONV_1]]
+; AVX1-NEXT:    [[ARRAYIDX7_1:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDVARS_IV_NEXT]]
+; AVX1-NEXT:    store double [[ADD_1]], double* [[ARRAYIDX7_1]], align 8
+; AVX1-NEXT:    br label [[FOR_INC_1]]
+; AVX1:       for.inc.1:
+; AVX1-NEXT:    [[INDVARS_IV_NEXT_1]] = add nuw nsw i64 [[INDVARS_IV]], 32
+; AVX1-NEXT:    br label [[FOR_BODY]]
+;
+; AVX2-LABEL: @foo4(
+; AVX2-NEXT:  entry:
+; AVX2-NEXT:    br label [[FOR_BODY:%.*]]
+; AVX2:       for.body:
+; AVX2-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT_3:%.*]], [[FOR_INC_3:%.*]] ]
+; AVX2-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], i64 [[INDVARS_IV]]
+; AVX2-NEXT:    [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; AVX2-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP0]], 100
+; AVX2-NEXT:    br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC:%.*]]
+; AVX2:       if.then:
+; AVX2-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 1
+; AVX2-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[B:%.*]], i64 [[TMP1]]
+; AVX2-NEXT:    [[TMP2:%.*]] = load double, double* [[ARRAYIDX3]], align 8
+; AVX2-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP0]] to double
+; AVX2-NEXT:    [[ADD:%.*]] = fadd double [[TMP2]], [[CONV]]
+; AVX2-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 [[INDVARS_IV]]
+; AVX2-NEXT:    store double [[ADD]], double* [[ARRAYIDX7]], align 8
+; AVX2-NEXT:    br label [[FOR_INC]]
+; AVX2:       for.inc:
+; AVX2-NEXT:    [[INDVARS_IV_NEXT:%.*]] = or i64 [[INDVARS_IV]], 16
+; AVX2-NEXT:    [[CMP:%.*]] = icmp ult i64 [[INDVARS_IV_NEXT]], 10000
+; AVX2-NEXT:    br i1 [[CMP]], label [[FOR_BODY_1:%.*]], label [[FOR_END:%.*]]
+; AVX2:       for.end:
+; AVX2-NEXT:    ret void
+; AVX2:       for.body.1:
+; AVX2-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT]]
+; AVX2-NEXT:    [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4
+; AVX2-NEXT:    [[CMP1_1:%.*]] = icmp slt i32 [[TMP3]], 100
+; AVX2-NEXT:    br i1 [[CMP1_1]], label [[IF_THEN_1:%.*]], label [[FOR_INC_1:%.*]]
+; AVX2:       if.then.1:
+; AVX2-NEXT:    [[TMP4:%.*]] = shl nuw nsw i64 [[INDVARS_IV_NEXT]], 1
+; AVX2-NEXT:    [[ARRAYIDX3_1:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[TMP4]]
+; AVX2-NEXT:    [[TMP5:%.*]] = load double, double* [[ARRAYIDX3_1]], align 8
+; AVX2-NEXT:    [[CONV_1:%.*]] = sitofp i32 [[TMP3]] to double
+; AVX2-NEXT:    [[ADD_1:%.*]] = fadd double [[TMP5]], [[CONV_1]]
+; AVX2-NEXT:    [[ARRAYIDX7_1:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDVARS_IV_NEXT]]
+; AVX2-NEXT:    store double [[ADD_1]], double* [[ARRAYIDX7_1]], align 8
+; AVX2-NEXT:    br label [[FOR_INC_1]]
+; AVX2:       for.inc.1:
+; AVX2-NEXT:    [[INDVARS_IV_NEXT_1:%.*]] = or i64 [[INDVARS_IV]], 32
+; AVX2-NEXT:    [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_1]]
+; AVX2-NEXT:    [[TMP6:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4
+; AVX2-NEXT:    [[CMP1_2:%.*]] = icmp slt i32 [[TMP6]], 100
+; AVX2-NEXT:    br i1 [[CMP1_2]], label [[IF_THEN_2:%.*]], label [[FOR_INC_2:%.*]]
+; AVX2:       if.then.2:
+; AVX2-NEXT:    [[TMP7:%.*]] = shl nuw nsw i64 [[INDVARS_IV_NEXT_1]], 1
+; AVX2-NEXT:    [[ARRAYIDX3_2:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[TMP7]]
+; AVX2-NEXT:    [[TMP8:%.*]] = load double, double* [[ARRAYIDX3_2]], align 8
+; AVX2-NEXT:    [[CONV_2:%.*]] = sitofp i32 [[TMP6]] to double
+; AVX2-NEXT:    [[ADD_2:%.*]] = fadd double [[TMP8]], [[CONV_2]]
+; AVX2-NEXT:    [[ARRAYIDX7_2:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDVARS_IV_NEXT_1]]
+; AVX2-NEXT:    store double [[ADD_2]], double* [[ARRAYIDX7_2]], align 8
+; AVX2-NEXT:    br label [[FOR_INC_2]]
+; AVX2:       for.inc.2:
+; AVX2-NEXT:    [[INDVARS_IV_NEXT_2:%.*]] = or i64 [[INDVARS_IV]], 48
+; AVX2-NEXT:    [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_2]]
+; AVX2-NEXT:    [[TMP9:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4
+; AVX2-NEXT:    [[CMP1_3:%.*]] = icmp slt i32 [[TMP9]], 100
+; AVX2-NEXT:    br i1 [[CMP1_3]], label [[IF_THEN_3:%.*]], label [[FOR_INC_3]]
+; AVX2:       if.then.3:
+; AVX2-NEXT:    [[TMP10:%.*]] = shl nuw nsw i64 [[INDVARS_IV_NEXT_2]], 1
+; AVX2-NEXT:    [[ARRAYIDX3_3:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[TMP10]]
+; AVX2-NEXT:    [[TMP11:%.*]] = load double, double* [[ARRAYIDX3_3]], align 8
+; AVX2-NEXT:    [[CONV_3:%.*]] = sitofp i32 [[TMP9]] to double
+; AVX2-NEXT:    [[ADD_3:%.*]] = fadd double [[TMP11]], [[CONV_3]]
+; AVX2-NEXT:    [[ARRAYIDX7_3:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDVARS_IV_NEXT_2]]
+; AVX2-NEXT:    store double [[ADD_3]], double* [[ARRAYIDX7_3]], align 8
+; AVX2-NEXT:    br label [[FOR_INC_3]]
+; AVX2:       for.inc.3:
+; AVX2-NEXT:    [[INDVARS_IV_NEXT_3]] = add nuw nsw i64 [[INDVARS_IV]], 64
+; AVX2-NEXT:    br label [[FOR_BODY]]
+;
+; AVX512-LABEL: @foo4(
+; AVX512-NEXT:  entry:
+; AVX512-NEXT:    [[SCEVGEP:%.*]] = getelementptr double, double* [[A:%.*]], i64 9985
+; AVX512-NEXT:    [[SCEVGEP12:%.*]] = getelementptr i32, i32* [[TRIGGER:%.*]], i64 9985
+; AVX512-NEXT:    [[SCEVGEP15:%.*]] = getelementptr double, double* [[B:%.*]], i64 19969
+; AVX512-NEXT:    [[TMP0:%.*]] = bitcast i32* [[SCEVGEP12]] to double*
+; AVX512-NEXT:    [[BOUND0:%.*]] = icmp ugt double* [[TMP0]], [[A]]
+; AVX512-NEXT:    [[TMP1:%.*]] = bitcast double* [[SCEVGEP]] to i32*
+; AVX512-NEXT:    [[BOUND1:%.*]] = icmp ugt i32* [[TMP1]], [[TRIGGER]]
+; AVX512-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; AVX512-NEXT:    [[BOUND017:%.*]] = icmp ugt double* [[SCEVGEP15]], [[A]]
+; AVX512-NEXT:    [[BOUND118:%.*]] = icmp ugt double* [[SCEVGEP]], [[B]]
+; AVX512-NEXT:    [[FOUND_CONFLICT19:%.*]] = and i1 [[BOUND017]], [[BOUND118]]
+; AVX512-NEXT:    [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT19]]
+; AVX512-NEXT:    br i1 [[CONFLICT_RDX]], label [[FOR_BODY_PREHEADER:%.*]], label [[VECTOR_BODY:%.*]]
+; AVX512:       vector.body:
+; AVX512-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT_2:%.*]], [[VECTOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
+; AVX512-NEXT:    [[VEC_IND:%.*]] = phi <8 x i64> [ [[VEC_IND_NEXT_2:%.*]], [[VECTOR_BODY]] ], [ <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112>, [[ENTRY]] ]
+; AVX512-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <8 x i64> [[VEC_IND]]
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> [[TMP2]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef), !alias.scope !41
+; AVX512-NEXT:    [[TMP3:%.*]] = icmp slt <8 x i32> [[WIDE_MASKED_GATHER]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX512-NEXT:    [[TMP4:%.*]] = shl nuw nsw <8 x i64> [[VEC_IND]], <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
+; AVX512-NEXT:    [[TMP5:%.*]] = getelementptr inbounds double, double* [[B]], <8 x i64> [[TMP4]]
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER20:%.*]] = call <8 x double> @llvm.masked.gather.v8f64.v8p0f64(<8 x double*> [[TMP5]], i32 8, <8 x i1> [[TMP3]], <8 x double> undef), !alias.scope !44
+; AVX512-NEXT:    [[TMP6:%.*]] = sitofp <8 x i32> [[WIDE_MASKED_GATHER]] to <8 x double>
+; AVX512-NEXT:    [[TMP7:%.*]] = fadd <8 x double> [[WIDE_MASKED_GATHER20]], [[TMP6]]
+; AVX512-NEXT:    [[TMP8:%.*]] = getelementptr inbounds double, double* [[A]], <8 x i64> [[VEC_IND]]
+; AVX512-NEXT:    call void @llvm.masked.scatter.v8f64.v8p0f64(<8 x double> [[TMP7]], <8 x double*> [[TMP8]], i32 8, <8 x i1> [[TMP3]]), !alias.scope !46, !noalias !48
+; AVX512-NEXT:    [[VEC_IND_NEXT:%.*]] = add <8 x i64> [[VEC_IND]], <i64 128, i64 128, i64 128, i64 128, i64 128, i64 128, i64 128, i64 128>
+; AVX512-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <8 x i64> [[VEC_IND_NEXT]]
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER_1:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> [[TMP9]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef), !alias.scope !41
+; AVX512-NEXT:    [[TMP10:%.*]] = icmp slt <8 x i32> [[WIDE_MASKED_GATHER_1]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX512-NEXT:    [[TMP11:%.*]] = shl nuw nsw <8 x i64> [[VEC_IND_NEXT]], <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
+; AVX512-NEXT:    [[TMP12:%.*]] = getelementptr inbounds double, double* [[B]], <8 x i64> [[TMP11]]
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER20_1:%.*]] = call <8 x double> @llvm.masked.gather.v8f64.v8p0f64(<8 x double*> [[TMP12]], i32 8, <8 x i1> [[TMP10]], <8 x double> undef), !alias.scope !44
+; AVX512-NEXT:    [[TMP13:%.*]] = sitofp <8 x i32> [[WIDE_MASKED_GATHER_1]] to <8 x double>
+; AVX512-NEXT:    [[TMP14:%.*]] = fadd <8 x double> [[WIDE_MASKED_GATHER20_1]], [[TMP13]]
+; AVX512-NEXT:    [[TMP15:%.*]] = getelementptr inbounds double, double* [[A]], <8 x i64> [[VEC_IND_NEXT]]
+; AVX512-NEXT:    call void @llvm.masked.scatter.v8f64.v8p0f64(<8 x double> [[TMP14]], <8 x double*> [[TMP15]], i32 8, <8 x i1> [[TMP10]]), !alias.scope !46, !noalias !48
+; AVX512-NEXT:    [[VEC_IND_NEXT_1:%.*]] = add <8 x i64> [[VEC_IND]], <i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256>
+; AVX512-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <8 x i64> [[VEC_IND_NEXT_1]]
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER_2:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> [[TMP16]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef), !alias.scope !41
+; AVX512-NEXT:    [[TMP17:%.*]] = icmp slt <8 x i32> [[WIDE_MASKED_GATHER_2]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX512-NEXT:    [[TMP18:%.*]] = shl nuw nsw <8 x i64> [[VEC_IND_NEXT_1]], <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
+; AVX512-NEXT:    [[TMP19:%.*]] = getelementptr inbounds double, double* [[B]], <8 x i64> [[TMP18]]
+; AVX512-NEXT:    [[WIDE_MASKED_GATHER20_2:%.*]] = call <8 x double> @llvm.masked.gather.v8f64.v8p0f64(<8 x double*> [[TMP19]], i32 8, <8 x i1> [[TMP17]], <8 x double> undef), !alias.scope !44
+; AVX512-NEXT:    [[TMP20:%.*]] = sitofp <8 x i32> [[WIDE_MASKED_GATHER_2]] to <8 x double>
+; AVX512-NEXT:    [[TMP21:%.*]] = fadd <8 x double> [[WIDE_MASKED_GATHER20_2]], [[TMP20]]
+; AVX512-NEXT:    [[TMP22:%.*]] = getelementptr inbounds double, double* [[A]], <8 x i64> [[VEC_IND_NEXT_1]]
+; AVX512-NEXT:    call void @llvm.masked.scatter.v8f64.v8p0f64(<8 x double> [[TMP21]], <8 x double*> [[TMP22]], i32 8, <8 x i1> [[TMP17]]), !alias.scope !46, !noalias !48
+; AVX512-NEXT:    [[INDEX_NEXT_2]] = add nuw nsw i64 [[INDEX]], 24
+; AVX512-NEXT:    [[VEC_IND_NEXT_2]] = add <8 x i64> [[VEC_IND]], <i64 384, i64 384, i64 384, i64 384, i64 384, i64 384, i64 384, i64 384>
+; AVX512-NEXT:    [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT_2]], 624
+; AVX512-NEXT:    br i1 [[TMP23]], label [[FOR_BODY_PREHEADER]], label [[VECTOR_BODY]], !llvm.loop !49
+; AVX512:       for.body.preheader:
+; AVX512-NEXT:    [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ 9984, [[VECTOR_BODY]] ]
+; AVX512-NEXT:    [[TMP24:%.*]] = sub nuw nsw i64 9999, [[INDVARS_IV_PH]]
+; AVX512-NEXT:    br label [[FOR_BODY_PROL:%.*]]
+; AVX512:       for.body.prol:
+; AVX512-NEXT:    [[INDVARS_IV_PROL:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_PROL:%.*]], [[FOR_INC_PROL:%.*]] ], [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER]] ]
+; AVX512-NEXT:    [[PROL_ITER:%.*]] = phi i64 [ [[PROL_ITER_SUB:%.*]], [[FOR_INC_PROL]] ], [ 1, [[FOR_BODY_PREHEADER]] ]
+; AVX512-NEXT:    [[ARRAYIDX_PROL:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_PROL]]
+; AVX512-NEXT:    [[TMP25:%.*]] = load i32, i32* [[ARRAYIDX_PROL]], align 4
+; AVX512-NEXT:    [[CMP1_PROL:%.*]] = icmp slt i32 [[TMP25]], 100
+; AVX512-NEXT:    br i1 [[CMP1_PROL]], label [[IF_THEN_PROL:%.*]], label [[FOR_INC_PROL]]
+; AVX512:       if.then.prol:
+; AVX512-NEXT:    [[TMP26:%.*]] = shl nuw nsw i64 [[INDVARS_IV_PROL]], 1
+; AVX512-NEXT:    [[ARRAYIDX3_PROL:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[TMP26]]
+; AVX512-NEXT:    [[TMP27:%.*]] = load double, double* [[ARRAYIDX3_PROL]], align 8
+; AVX512-NEXT:    [[CONV_PROL:%.*]] = sitofp i32 [[TMP25]] to double
+; AVX512-NEXT:    [[ADD_PROL:%.*]] = fadd double [[TMP27]], [[CONV_PROL]]
+; AVX512-NEXT:    [[ARRAYIDX7_PROL:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDVARS_IV_PROL]]
+; AVX512-NEXT:    store double [[ADD_PROL]], double* [[ARRAYIDX7_PROL]], align 8
+; AVX512-NEXT:    br label [[FOR_INC_PROL]]
+; AVX512:       for.inc.prol:
+; AVX512-NEXT:    [[INDVARS_IV_NEXT_PROL]] = add nuw nsw i64 [[INDVARS_IV_PROL]], 16
+; AVX512-NEXT:    [[PROL_ITER_SUB]] = add i64 [[PROL_ITER]], -1
+; AVX512-NEXT:    [[PROL_ITER_CMP:%.*]] = icmp eq i64 [[PROL_ITER_SUB]], 0
+; AVX512-NEXT:    br i1 [[PROL_ITER_CMP]], label [[FOR_BODY_PROL_LOOPEXIT:%.*]], label [[FOR_BODY_PROL]], !llvm.loop !50
+; AVX512:       for.body.prol.loopexit:
+; AVX512-NEXT:    [[DOTMASK:%.*]] = and i64 [[TMP24]], 9984
+; AVX512-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[DOTMASK]], 0
+; AVX512-NEXT:    br i1 [[TMP28]], label [[FOR_END:%.*]], label [[FOR_BODY:%.*]]
+; AVX512:       for.body:
+; AVX512-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_3:%.*]], [[FOR_INC_3:%.*]] ], [ [[INDVARS_IV_NEXT_PROL]], [[FOR_BODY_PROL_LOOPEXIT]] ]
+; AVX512-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]]
+; AVX512-NEXT:    [[TMP29:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; AVX512-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP29]], 100
+; AVX512-NEXT:    br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC:%.*]]
+; AVX512:       if.then:
+; AVX512-NEXT:    [[TMP30:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 1
+; AVX512-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[TMP30]]
+; AVX512-NEXT:    [[TMP31:%.*]] = load double, double* [[ARRAYIDX3]], align 8
+; AVX512-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP29]] to double
+; AVX512-NEXT:    [[ADD:%.*]] = fadd double [[TMP31]], [[CONV]]
+; AVX512-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDVARS_IV]]
+; AVX512-NEXT:    store double [[ADD]], double* [[ARRAYIDX7]], align 8
+; AVX512-NEXT:    br label [[FOR_INC]]
+; AVX512:       for.inc:
+; AVX512-NEXT:    [[INDVARS_IV_NEXT:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 16
+; AVX512-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT]]
+; AVX512-NEXT:    [[TMP32:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4
+; AVX512-NEXT:    [[CMP1_1:%.*]] = icmp slt i32 [[TMP32]], 100
+; AVX512-NEXT:    br i1 [[CMP1_1]], label [[IF_THEN_1:%.*]], label [[FOR_INC_1:%.*]]
+; AVX512:       for.end:
+; AVX512-NEXT:    ret void
+; AVX512:       if.then.1:
+; AVX512-NEXT:    [[TMP33:%.*]] = shl nuw nsw i64 [[INDVARS_IV_NEXT]], 1
+; AVX512-NEXT:    [[ARRAYIDX3_1:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[TMP33]]
+; AVX512-NEXT:    [[TMP34:%.*]] = load double, double* [[ARRAYIDX3_1]], align 8
+; AVX512-NEXT:    [[CONV_1:%.*]] = sitofp i32 [[TMP32]] to double
+; AVX512-NEXT:    [[ADD_1:%.*]] = fadd double [[TMP34]], [[CONV_1]]
+; AVX512-NEXT:    [[ARRAYIDX7_1:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDVARS_IV_NEXT]]
+; AVX512-NEXT:    store double [[ADD_1]], double* [[ARRAYIDX7_1]], align 8
+; AVX512-NEXT:    br label [[FOR_INC_1]]
+; AVX512:       for.inc.1:
+; AVX512-NEXT:    [[INDVARS_IV_NEXT_1:%.*]] = add nsw i64 [[INDVARS_IV]], 32
+; AVX512-NEXT:    [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_1]]
+; AVX512-NEXT:    [[TMP35:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4
+; AVX512-NEXT:    [[CMP1_2:%.*]] = icmp slt i32 [[TMP35]], 100
+; AVX512-NEXT:    br i1 [[CMP1_2]], label [[IF_THEN_2:%.*]], label [[FOR_INC_2:%.*]]
+; AVX512:       if.then.2:
+; AVX512-NEXT:    [[TMP36:%.*]] = shl nuw nsw i64 [[INDVARS_IV_NEXT_1]], 1
+; AVX512-NEXT:    [[ARRAYIDX3_2:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[TMP36]]
+; AVX512-NEXT:    [[TMP37:%.*]] = load double, double* [[ARRAYIDX3_2]], align 8
+; AVX512-NEXT:    [[CONV_2:%.*]] = sitofp i32 [[TMP35]] to double
+; AVX512-NEXT:    [[ADD_2:%.*]] = fadd double [[TMP37]], [[CONV_2]]
+; AVX512-NEXT:    [[ARRAYIDX7_2:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDVARS_IV_NEXT_1]]
+; AVX512-NEXT:    store double [[ADD_2]], double* [[ARRAYIDX7_2]], align 8
+; AVX512-NEXT:    br label [[FOR_INC_2]]
+; AVX512:       for.inc.2:
+; AVX512-NEXT:    [[INDVARS_IV_NEXT_2:%.*]] = add nsw i64 [[INDVARS_IV]], 48
+; AVX512-NEXT:    [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_2]]
+; AVX512-NEXT:    [[TMP38:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4
+; AVX512-NEXT:    [[CMP1_3:%.*]] = icmp slt i32 [[TMP38]], 100
+; AVX512-NEXT:    br i1 [[CMP1_3]], label [[IF_THEN_3:%.*]], label [[FOR_INC_3]]
+; AVX512:       if.then.3:
+; AVX512-NEXT:    [[TMP39:%.*]] = shl nuw nsw i64 [[INDVARS_IV_NEXT_2]], 1
+; AVX512-NEXT:    [[ARRAYIDX3_3:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[TMP39]]
+; AVX512-NEXT:    [[TMP40:%.*]] = load double, double* [[ARRAYIDX3_3]], align 8
+; AVX512-NEXT:    [[CONV_3:%.*]] = sitofp i32 [[TMP38]] to double
+; AVX512-NEXT:    [[ADD_3:%.*]] = fadd double [[TMP40]], [[CONV_3]]
+; AVX512-NEXT:    [[ARRAYIDX7_3:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDVARS_IV_NEXT_2]]
+; AVX512-NEXT:    store double [[ADD_3]], double* [[ARRAYIDX7_3]], align 8
+; AVX512-NEXT:    br label [[FOR_INC_3]]
+; AVX512:       for.inc.3:
+; AVX512-NEXT:    [[INDVARS_IV_NEXT_3]] = add nsw i64 [[INDVARS_IV]], 64
+; AVX512-NEXT:    [[CMP_3:%.*]] = icmp ult i64 [[INDVARS_IV_NEXT_3]], 10000
+; AVX512-NEXT:    br i1 [[CMP_3]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop !52
+;
+entry:
+  %A.addr = alloca double*, align 8
+  %B.addr = alloca double*, align 8
+  %trigger.addr = alloca i32*, align 8
+  %i = alloca i32, align 4
+  store double* %A, double** %A.addr, align 8
+  store double* %B, double** %B.addr, align 8
+  store i32* %trigger, i32** %trigger.addr, align 8
+  store i32 0, i32* %i, align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %0 = load i32, i32* %i, align 4
+  %cmp = icmp slt i32 %0, 10000
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %1 = load i32, i32* %i, align 4
+  %idxprom = sext i32 %1 to i64
+  %2 = load i32*, i32** %trigger.addr, align 8
+  %arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom
+  %3 = load i32, i32* %arrayidx, align 4
+  %cmp1 = icmp slt i32 %3, 100
+  br i1 %cmp1, label %if.then, label %if.end
+
+if.then:                                          ; preds = %for.body
+  %4 = load i32, i32* %i, align 4
+  %mul = mul nsw i32 %4, 2
+  %idxprom2 = sext i32 %mul to i64
+  %5 = load double*, double** %B.addr, align 8
+  %arrayidx3 = getelementptr inbounds double, double* %5, i64 %idxprom2
+  %6 = load double, double* %arrayidx3, align 8
+  %7 = load i32, i32* %i, align 4
+  %idxprom4 = sext i32 %7 to i64
+  %8 = load i32*, i32** %trigger.addr, align 8
+  %arrayidx5 = getelementptr inbounds i32, i32* %8, i64 %idxprom4
+  %9 = load i32, i32* %arrayidx5, align 4
+  %conv = sitofp i32 %9 to double
+  %add = fadd double %6, %conv
+  %10 = load i32, i32* %i, align 4
+  %idxprom6 = sext i32 %10 to i64
+  %11 = load double*, double** %A.addr, align 8
+  %arrayidx7 = getelementptr inbounds double, double* %11, i64 %idxprom6
+  store double %add, double* %arrayidx7, align 8
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %for.body
+  br label %for.inc
+
+for.inc:                                          ; preds = %if.end
+  %12 = load i32, i32* %i, align 4
+  %inc = add nsw i32 %12, 16
+  store i32 %inc, i32* %i, align 4
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}
+
+ at a = common global [1 x i32*] zeroinitializer, align 8
+ at c = common global i32* null, align 8
+
+; The loop here should not be vectorized due to a trapping
+; constant expression (the sdiv stored below can divide by zero).
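+;
+; A C-level sketch of the kind of source this corresponds to (illustrative
+; only; it assumes the @a and @c globals declared above, and the exact
+; original source is not part of this test):
+;
+;void foo5(int *A, int *B, int *trigger) {
+;  for (int i=0; i<10000; i++) {
+;    if (trigger[i] < 100) {
+;      A[i] = 1 / (&a[1] == &c);  // constant expression; divides by zero
+;    }                            // (traps) whenever the compare is false
+;  }
+;}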
+; Function Attrs: nounwind uwtable
+define void @foo5(i32* %A, i32* %B, i32* %trigger) {
+; AVX1-LABEL: @foo5(
+; AVX1-NEXT:  entry:
+; AVX1-NEXT:    br label [[FOR_BODY:%.*]]
+; AVX1:       for.body:
+; AVX1-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT_1:%.*]], [[FOR_INC_1:%.*]] ]
+; AVX1-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], i64 [[INDVARS_IV]]
+; AVX1-NEXT:    [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; AVX1-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP0]], 100
+; AVX1-NEXT:    br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC:%.*]]
+; AVX1:       if.then:
+; AVX1-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDVARS_IV]]
+; AVX1-NEXT:    store i32 sdiv (i32 1, i32 zext (i1 icmp eq (i32** getelementptr inbounds ([1 x i32*], [1 x i32*]* @a, i64 1, i64 0), i32** @c) to i32)), i32* [[ARRAYIDX7]], align 4
+; AVX1-NEXT:    br label [[FOR_INC]]
+; AVX1:       for.inc:
+; AVX1-NEXT:    [[INDVARS_IV_NEXT:%.*]] = or i64 [[INDVARS_IV]], 1
+; AVX1-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT]]
+; AVX1-NEXT:    [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4
+; AVX1-NEXT:    [[CMP1_1:%.*]] = icmp slt i32 [[TMP1]], 100
+; AVX1-NEXT:    br i1 [[CMP1_1]], label [[IF_THEN_1:%.*]], label [[FOR_INC_1]]
+; AVX1:       for.end:
+; AVX1-NEXT:    ret void
+; AVX1:       if.then.1:
+; AVX1-NEXT:    [[ARRAYIDX7_1:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV_NEXT]]
+; AVX1-NEXT:    store i32 sdiv (i32 1, i32 zext (i1 icmp eq (i32** getelementptr inbounds ([1 x i32*], [1 x i32*]* @a, i64 1, i64 0), i32** @c) to i32)), i32* [[ARRAYIDX7_1]], align 4
+; AVX1-NEXT:    br label [[FOR_INC_1]]
+; AVX1:       for.inc.1:
+; AVX1-NEXT:    [[INDVARS_IV_NEXT_1]] = add nuw nsw i64 [[INDVARS_IV]], 2
+; AVX1-NEXT:    [[EXITCOND_1:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_1]], 10000
+; AVX1-NEXT:    br i1 [[EXITCOND_1]], label [[FOR_END:%.*]], label [[FOR_BODY]]
+;
+; AVX2-LABEL: @foo5(
+; AVX2-NEXT:  entry:
+; AVX2-NEXT:    br label [[FOR_BODY:%.*]]
+; AVX2:       for.body:
+; AVX2-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT_4:%.*]], [[FOR_INC_4:%.*]] ]
+; AVX2-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], i64 [[INDVARS_IV]]
+; AVX2-NEXT:    [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; AVX2-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP0]], 100
+; AVX2-NEXT:    br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC:%.*]]
+; AVX2:       if.then:
+; AVX2-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDVARS_IV]]
+; AVX2-NEXT:    store i32 sdiv (i32 1, i32 zext (i1 icmp eq (i32** getelementptr inbounds ([1 x i32*], [1 x i32*]* @a, i64 1, i64 0), i32** @c) to i32)), i32* [[ARRAYIDX7]], align 4
+; AVX2-NEXT:    br label [[FOR_INC]]
+; AVX2:       for.inc:
+; AVX2-NEXT:    [[INDVARS_IV_NEXT:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; AVX2-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT]]
+; AVX2-NEXT:    [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4
+; AVX2-NEXT:    [[CMP1_1:%.*]] = icmp slt i32 [[TMP1]], 100
+; AVX2-NEXT:    br i1 [[CMP1_1]], label [[IF_THEN_1:%.*]], label [[FOR_INC_1:%.*]]
+; AVX2:       for.end:
+; AVX2-NEXT:    ret void
+; AVX2:       if.then.1:
+; AVX2-NEXT:    [[ARRAYIDX7_1:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV_NEXT]]
+; AVX2-NEXT:    store i32 sdiv (i32 1, i32 zext (i1 icmp eq (i32** getelementptr inbounds ([1 x i32*], [1 x i32*]* @a, i64 1, i64 0), i32** @c) to i32)), i32* [[ARRAYIDX7_1]], align 4
+; AVX2-NEXT:    br label [[FOR_INC_1]]
+; AVX2:       for.inc.1:
+; AVX2-NEXT:    [[INDVARS_IV_NEXT_1:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 2
+; AVX2-NEXT:    [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_1]]
+; AVX2-NEXT:    [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4
+; AVX2-NEXT:    [[CMP1_2:%.*]] = icmp slt i32 [[TMP2]], 100
+; AVX2-NEXT:    br i1 [[CMP1_2]], label [[IF_THEN_2:%.*]], label [[FOR_INC_2:%.*]]
+; AVX2:       if.then.2:
+; AVX2-NEXT:    [[ARRAYIDX7_2:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV_NEXT_1]]
+; AVX2-NEXT:    store i32 sdiv (i32 1, i32 zext (i1 icmp eq (i32** getelementptr inbounds ([1 x i32*], [1 x i32*]* @a, i64 1, i64 0), i32** @c) to i32)), i32* [[ARRAYIDX7_2]], align 4
+; AVX2-NEXT:    br label [[FOR_INC_2]]
+; AVX2:       for.inc.2:
+; AVX2-NEXT:    [[INDVARS_IV_NEXT_2:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 3
+; AVX2-NEXT:    [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_2]]
+; AVX2-NEXT:    [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4
+; AVX2-NEXT:    [[CMP1_3:%.*]] = icmp slt i32 [[TMP3]], 100
+; AVX2-NEXT:    br i1 [[CMP1_3]], label [[IF_THEN_3:%.*]], label [[FOR_INC_3:%.*]]
+; AVX2:       if.then.3:
+; AVX2-NEXT:    [[ARRAYIDX7_3:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV_NEXT_2]]
+; AVX2-NEXT:    store i32 sdiv (i32 1, i32 zext (i1 icmp eq (i32** getelementptr inbounds ([1 x i32*], [1 x i32*]* @a, i64 1, i64 0), i32** @c) to i32)), i32* [[ARRAYIDX7_3]], align 4
+; AVX2-NEXT:    br label [[FOR_INC_3]]
+; AVX2:       for.inc.3:
+; AVX2-NEXT:    [[INDVARS_IV_NEXT_3:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 4
+; AVX2-NEXT:    [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_3]]
+; AVX2-NEXT:    [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX_4]], align 4
+; AVX2-NEXT:    [[CMP1_4:%.*]] = icmp slt i32 [[TMP4]], 100
+; AVX2-NEXT:    br i1 [[CMP1_4]], label [[IF_THEN_4:%.*]], label [[FOR_INC_4]]
+; AVX2:       if.then.4:
+; AVX2-NEXT:    [[ARRAYIDX7_4:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV_NEXT_3]]
+; AVX2-NEXT:    store i32 sdiv (i32 1, i32 zext (i1 icmp eq (i32** getelementptr inbounds ([1 x i32*], [1 x i32*]* @a, i64 1, i64 0), i32** @c) to i32)), i32* [[ARRAYIDX7_4]], align 4
+; AVX2-NEXT:    br label [[FOR_INC_4]]
+; AVX2:       for.inc.4:
+; AVX2-NEXT:    [[INDVARS_IV_NEXT_4]] = add nuw nsw i64 [[INDVARS_IV]], 5
+; AVX2-NEXT:    [[EXITCOND_4:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_4]], 10000
+; AVX2-NEXT:    br i1 [[EXITCOND_4]], label [[FOR_END:%.*]], label [[FOR_BODY]]
+;
+; AVX512-LABEL: @foo5(
+; AVX512-NEXT:  entry:
+; AVX512-NEXT:    br label [[FOR_BODY:%.*]]
+; AVX512:       for.body:
+; AVX512-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT_4:%.*]], [[FOR_INC_4:%.*]] ]
+; AVX512-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], i64 [[INDVARS_IV]]
+; AVX512-NEXT:    [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; AVX512-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP0]], 100
+; AVX512-NEXT:    br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC:%.*]]
+; AVX512:       if.then:
+; AVX512-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDVARS_IV]]
+; AVX512-NEXT:    store i32 sdiv (i32 1, i32 zext (i1 icmp eq (i32** getelementptr inbounds ([1 x i32*], [1 x i32*]* @a, i64 1, i64 0), i32** @c) to i32)), i32* [[ARRAYIDX7]], align 4
+; AVX512-NEXT:    br label [[FOR_INC]]
+; AVX512:       for.inc:
+; AVX512-NEXT:    [[INDVARS_IV_NEXT:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; AVX512-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT]]
+; AVX512-NEXT:    [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4
+; AVX512-NEXT:    [[CMP1_1:%.*]] = icmp slt i32 [[TMP1]], 100
+; AVX512-NEXT:    br i1 [[CMP1_1]], label [[IF_THEN_1:%.*]], label [[FOR_INC_1:%.*]]
+; AVX512:       for.end:
+; AVX512-NEXT:    ret void
+; AVX512:       if.then.1:
+; AVX512-NEXT:    [[ARRAYIDX7_1:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV_NEXT]]
+; AVX512-NEXT:    store i32 sdiv (i32 1, i32 zext (i1 icmp eq (i32** getelementptr inbounds ([1 x i32*], [1 x i32*]* @a, i64 1, i64 0), i32** @c) to i32)), i32* [[ARRAYIDX7_1]], align 4
+; AVX512-NEXT:    br label [[FOR_INC_1]]
+; AVX512:       for.inc.1:
+; AVX512-NEXT:    [[INDVARS_IV_NEXT_1:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 2
+; AVX512-NEXT:    [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_1]]
+; AVX512-NEXT:    [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4
+; AVX512-NEXT:    [[CMP1_2:%.*]] = icmp slt i32 [[TMP2]], 100
+; AVX512-NEXT:    br i1 [[CMP1_2]], label [[IF_THEN_2:%.*]], label [[FOR_INC_2:%.*]]
+; AVX512:       if.then.2:
+; AVX512-NEXT:    [[ARRAYIDX7_2:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV_NEXT_1]]
+; AVX512-NEXT:    store i32 sdiv (i32 1, i32 zext (i1 icmp eq (i32** getelementptr inbounds ([1 x i32*], [1 x i32*]* @a, i64 1, i64 0), i32** @c) to i32)), i32* [[ARRAYIDX7_2]], align 4
+; AVX512-NEXT:    br label [[FOR_INC_2]]
+; AVX512:       for.inc.2:
+; AVX512-NEXT:    [[INDVARS_IV_NEXT_2:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 3
+; AVX512-NEXT:    [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_2]]
+; AVX512-NEXT:    [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4
+; AVX512-NEXT:    [[CMP1_3:%.*]] = icmp slt i32 [[TMP3]], 100
+; AVX512-NEXT:    br i1 [[CMP1_3]], label [[IF_THEN_3:%.*]], label [[FOR_INC_3:%.*]]
+; AVX512:       if.then.3:
+; AVX512-NEXT:    [[ARRAYIDX7_3:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV_NEXT_2]]
+; AVX512-NEXT:    store i32 sdiv (i32 1, i32 zext (i1 icmp eq (i32** getelementptr inbounds ([1 x i32*], [1 x i32*]* @a, i64 1, i64 0), i32** @c) to i32)), i32* [[ARRAYIDX7_3]], align 4
+; AVX512-NEXT:    br label [[FOR_INC_3]]
+; AVX512:       for.inc.3:
+; AVX512-NEXT:    [[INDVARS_IV_NEXT_3:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 4
+; AVX512-NEXT:    [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_3]]
+; AVX512-NEXT:    [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX_4]], align 4
+; AVX512-NEXT:    [[CMP1_4:%.*]] = icmp slt i32 [[TMP4]], 100
+; AVX512-NEXT:    br i1 [[CMP1_4]], label [[IF_THEN_4:%.*]], label [[FOR_INC_4]]
+; AVX512:       if.then.4:
+; AVX512-NEXT:    [[ARRAYIDX7_4:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV_NEXT_3]]
+; AVX512-NEXT:    store i32 sdiv (i32 1, i32 zext (i1 icmp eq (i32** getelementptr inbounds ([1 x i32*], [1 x i32*]* @a, i64 1, i64 0), i32** @c) to i32)), i32* [[ARRAYIDX7_4]], align 4
+; AVX512-NEXT:    br label [[FOR_INC_4]]
+; AVX512:       for.inc.4:
+; AVX512-NEXT:    [[INDVARS_IV_NEXT_4]] = add nuw nsw i64 [[INDVARS_IV]], 5
+; AVX512-NEXT:    [[EXITCOND_4:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_4]], 10000
+; AVX512-NEXT:    br i1 [[EXITCOND_4]], label [[FOR_END:%.*]], label [[FOR_BODY]]
+;
+entry:
+  %A.addr = alloca i32*, align 8
+  %B.addr = alloca i32*, align 8
+  %trigger.addr = alloca i32*, align 8
+  %i = alloca i32, align 4
+  store i32* %A, i32** %A.addr, align 8
+  store i32* %B, i32** %B.addr, align 8
+  store i32* %trigger, i32** %trigger.addr, align 8
+  store i32 0, i32* %i, align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %0 = load i32, i32* %i, align 4
+  %cmp = icmp slt i32 %0, 10000
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %1 = load i32, i32* %i, align 4
+  %idxprom = sext i32 %1 to i64
+  %2 = load i32*, i32** %trigger.addr, align 8
+  %arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom
+  %3 = load i32, i32* %arrayidx, align 4
+  %cmp1 = icmp slt i32 %3, 100
+  br i1 %cmp1, label %if.then, label %if.end
+
+if.then:                                          ; preds = %for.body
+  %4 = load i32, i32* %i, align 4
+  %idxprom2 = sext i32 %4 to i64
+  %5 = load i32*, i32** %B.addr, align 8
+  %arrayidx3 = getelementptr inbounds i32, i32* %5, i64 %idxprom2
+  %6 = load i32, i32* %arrayidx3, align 4
+  %7 = load i32, i32* %i, align 4
+  %idxprom4 = sext i32 %7 to i64
+  %8 = load i32*, i32** %trigger.addr, align 8
+  %arrayidx5 = getelementptr inbounds i32, i32* %8, i64 %idxprom4
+  %9 = load i32, i32* %arrayidx5, align 4
+  %add = add nsw i32 %6, %9
+  %10 = load i32, i32* %i, align 4
+  %idxprom6 = sext i32 %10 to i64
+  %11 = load i32*, i32** %A.addr, align 8
+  %arrayidx7 = getelementptr inbounds i32, i32* %11, i64 %idxprom6
+  store i32 sdiv (i32 1, i32 zext (i1 icmp eq (i32** getelementptr inbounds ([1 x i32*], [1 x i32*]* @a, i64 0, i64 1), i32** @c) to i32)), i32* %arrayidx7, align 4
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %for.body
+  br label %for.inc
+
+for.inc:                                          ; preds = %if.end
+  %12 = load i32, i32* %i, align 4
+  %inc = add nsw i32 %12, 1
+  store i32 %inc, i32* %i, align 4
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}
+
+; Reverse loop
+;void foo6(double *in, double *out, unsigned size, int *trigger) {
+;
+;  for (int i=SIZE-1; i>=0; i--) {
+;    if (trigger[i] > 0) {
+;      out[i] = in[i] + (double) 0.5;
+;    }
+;  }
+;}
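+;
+; The induction variable counts down from 4095, so the vector bodies below
+; load each chunk at a negative offset and reverse it with a shufflevector
+; (<3, 2, 1, 0> mask) before the trigger compare and the masked load/store.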
+
+
+define void @foo6(double* %in, double* %out, i32 %size, i32* %trigger) {
+; AVX1-LABEL: @foo6(
+; AVX1-NEXT:  entry:
+; AVX1-NEXT:    [[SCEVGEP:%.*]] = getelementptr double, double* [[OUT:%.*]], i64 4096
+; AVX1-NEXT:    [[SCEVGEP9:%.*]] = getelementptr i32, i32* [[TRIGGER:%.*]], i64 4096
+; AVX1-NEXT:    [[SCEVGEP12:%.*]] = getelementptr double, double* [[IN:%.*]], i64 4096
+; AVX1-NEXT:    [[TMP0:%.*]] = bitcast i32* [[SCEVGEP9]] to double*
+; AVX1-NEXT:    [[BOUND0:%.*]] = icmp ugt double* [[TMP0]], [[OUT]]
+; AVX1-NEXT:    [[TMP1:%.*]] = bitcast double* [[SCEVGEP]] to i32*
+; AVX1-NEXT:    [[BOUND1:%.*]] = icmp ugt i32* [[TMP1]], [[TRIGGER]]
+; AVX1-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; AVX1-NEXT:    [[BOUND014:%.*]] = icmp ugt double* [[SCEVGEP12]], [[OUT]]
+; AVX1-NEXT:    [[BOUND115:%.*]] = icmp ugt double* [[SCEVGEP]], [[IN]]
+; AVX1-NEXT:    [[FOUND_CONFLICT16:%.*]] = and i1 [[BOUND014]], [[BOUND115]]
+; AVX1-NEXT:    [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT16]]
+; AVX1-NEXT:    br i1 [[CONFLICT_RDX]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY:%.*]]
+; AVX1:       vector.body:
+; AVX1-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
+; AVX1-NEXT:    [[OFFSET_IDX:%.*]] = sub i64 4095, [[INDEX]]
+; AVX1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[OFFSET_IDX]]
+; AVX1-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 -3
+; AVX1-NEXT:    [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>*
+; AVX1-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4, !alias.scope !41
+; AVX1-NEXT:    [[REVERSE:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD]], <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; AVX1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 -4
+; AVX1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i64 -3
+; AVX1-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
+; AVX1-NEXT:    [[WIDE_LOAD20:%.*]] = load <4 x i32>, <4 x i32>* [[TMP7]], align 4, !alias.scope !41
+; AVX1-NEXT:    [[REVERSE21:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD20]], <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; AVX1-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 -8
+; AVX1-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP8]], i64 -3
+; AVX1-NEXT:    [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>*
+; AVX1-NEXT:    [[WIDE_LOAD22:%.*]] = load <4 x i32>, <4 x i32>* [[TMP10]], align 4, !alias.scope !41
+; AVX1-NEXT:    [[REVERSE23:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD22]], <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; AVX1-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 -12
+; AVX1-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP11]], i64 -3
+; AVX1-NEXT:    [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <4 x i32>*
+; AVX1-NEXT:    [[WIDE_LOAD24:%.*]] = load <4 x i32>, <4 x i32>* [[TMP13]], align 4, !alias.scope !41
+; AVX1-NEXT:    [[REVERSE25:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD24]], <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; AVX1-NEXT:    [[TMP14:%.*]] = icmp sgt <4 x i32> [[REVERSE]], zeroinitializer
+; AVX1-NEXT:    [[TMP15:%.*]] = icmp sgt <4 x i32> [[REVERSE21]], zeroinitializer
+; AVX1-NEXT:    [[TMP16:%.*]] = icmp sgt <4 x i32> [[REVERSE23]], zeroinitializer
+; AVX1-NEXT:    [[TMP17:%.*]] = icmp sgt <4 x i32> [[REVERSE25]], zeroinitializer
+; AVX1-NEXT:    [[TMP18:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[OFFSET_IDX]]
+; AVX1-NEXT:    [[TMP19:%.*]] = getelementptr inbounds double, double* [[TMP18]], i64 -3
+; AVX1-NEXT:    [[REVERSE26:%.*]] = shufflevector <4 x i1> [[TMP14]], <4 x i1> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; AVX1-NEXT:    [[TMP20:%.*]] = bitcast double* [[TMP19]] to <4 x double>*
+; AVX1-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* nonnull [[TMP20]], i32 8, <4 x i1> [[REVERSE26]], <4 x double> undef), !alias.scope !44
+; AVX1-NEXT:    [[TMP21:%.*]] = getelementptr inbounds double, double* [[TMP18]], i64 -4
+; AVX1-NEXT:    [[TMP22:%.*]] = getelementptr inbounds double, double* [[TMP21]], i64 -3
+; AVX1-NEXT:    [[REVERSE28:%.*]] = shufflevector <4 x i1> [[TMP15]], <4 x i1> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; AVX1-NEXT:    [[TMP23:%.*]] = bitcast double* [[TMP22]] to <4 x double>*
+; AVX1-NEXT:    [[WIDE_MASKED_LOAD29:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* nonnull [[TMP23]], i32 8, <4 x i1> [[REVERSE28]], <4 x double> undef), !alias.scope !44
+; AVX1-NEXT:    [[TMP24:%.*]] = getelementptr inbounds double, double* [[TMP18]], i64 -8
+; AVX1-NEXT:    [[TMP25:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 -3
+; AVX1-NEXT:    [[REVERSE31:%.*]] = shufflevector <4 x i1> [[TMP16]], <4 x i1> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; AVX1-NEXT:    [[TMP26:%.*]] = bitcast double* [[TMP25]] to <4 x double>*
+; AVX1-NEXT:    [[WIDE_MASKED_LOAD32:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* nonnull [[TMP26]], i32 8, <4 x i1> [[REVERSE31]], <4 x double> undef), !alias.scope !44
+; AVX1-NEXT:    [[TMP27:%.*]] = getelementptr inbounds double, double* [[TMP18]], i64 -12
+; AVX1-NEXT:    [[TMP28:%.*]] = getelementptr inbounds double, double* [[TMP27]], i64 -3
+; AVX1-NEXT:    [[REVERSE34:%.*]] = shufflevector <4 x i1> [[TMP17]], <4 x i1> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; AVX1-NEXT:    [[TMP29:%.*]] = bitcast double* [[TMP28]] to <4 x double>*
+; AVX1-NEXT:    [[WIDE_MASKED_LOAD35:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* nonnull [[TMP29]], i32 8, <4 x i1> [[REVERSE34]], <4 x double> undef), !alias.scope !44
+; AVX1-NEXT:    [[TMP30:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD]], <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>
+; AVX1-NEXT:    [[TMP31:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD29]], <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>
+; AVX1-NEXT:    [[TMP32:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD32]], <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>
+; AVX1-NEXT:    [[TMP33:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD35]], <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>
+; AVX1-NEXT:    [[TMP34:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[OFFSET_IDX]]
+; AVX1-NEXT:    [[TMP35:%.*]] = getelementptr inbounds double, double* [[TMP34]], i64 -3
+; AVX1-NEXT:    [[TMP36:%.*]] = bitcast double* [[TMP35]] to <4 x double>*
+; AVX1-NEXT:    call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP30]], <4 x double>* [[TMP36]], i32 8, <4 x i1> [[REVERSE26]]), !alias.scope !46, !noalias !48
+; AVX1-NEXT:    [[TMP37:%.*]] = getelementptr inbounds double, double* [[TMP34]], i64 -4
+; AVX1-NEXT:    [[TMP38:%.*]] = getelementptr inbounds double, double* [[TMP37]], i64 -3
+; AVX1-NEXT:    [[TMP39:%.*]] = bitcast double* [[TMP38]] to <4 x double>*
+; AVX1-NEXT:    call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP31]], <4 x double>* [[TMP39]], i32 8, <4 x i1> [[REVERSE28]]), !alias.scope !46, !noalias !48
+; AVX1-NEXT:    [[TMP40:%.*]] = getelementptr inbounds double, double* [[TMP34]], i64 -8
+; AVX1-NEXT:    [[TMP41:%.*]] = getelementptr inbounds double, double* [[TMP40]], i64 -3
+; AVX1-NEXT:    [[TMP42:%.*]] = bitcast double* [[TMP41]] to <4 x double>*
+; AVX1-NEXT:    call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP32]], <4 x double>* [[TMP42]], i32 8, <4 x i1> [[REVERSE31]]), !alias.scope !46, !noalias !48
+; AVX1-NEXT:    [[TMP43:%.*]] = getelementptr inbounds double, double* [[TMP34]], i64 -12
+; AVX1-NEXT:    [[TMP44:%.*]] = getelementptr inbounds double, double* [[TMP43]], i64 -3
+; AVX1-NEXT:    [[TMP45:%.*]] = bitcast double* [[TMP44]] to <4 x double>*
+; AVX1-NEXT:    call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP33]], <4 x double>* [[TMP45]], i32 8, <4 x i1> [[REVERSE34]]), !alias.scope !46, !noalias !48
+; AVX1-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 16
+; AVX1-NEXT:    [[TMP46:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
+; AVX1-NEXT:    br i1 [[TMP46]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop !49
+; AVX1:       for.body:
+; AVX1-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_1:%.*]], [[FOR_INC_1:%.*]] ], [ 4095, [[ENTRY]] ]
+; AVX1-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]]
+; AVX1-NEXT:    [[TMP47:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; AVX1-NEXT:    [[CMP1:%.*]] = icmp sgt i32 [[TMP47]], 0
+; AVX1-NEXT:    br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC:%.*]]
+; AVX1:       if.then:
+; AVX1-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[INDVARS_IV]]
+; AVX1-NEXT:    [[TMP48:%.*]] = load double, double* [[ARRAYIDX3]], align 8
+; AVX1-NEXT:    [[ADD:%.*]] = fadd double [[TMP48]], 5.000000e-01
+; AVX1-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV]]
+; AVX1-NEXT:    store double [[ADD]], double* [[ARRAYIDX5]], align 8
+; AVX1-NEXT:    br label [[FOR_INC]]
+; AVX1:       for.inc:
+; AVX1-NEXT:    [[INDVARS_IV_NEXT:%.*]] = add nsw i64 [[INDVARS_IV]], -1
+; AVX1-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT]]
+; AVX1-NEXT:    [[TMP49:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4
+; AVX1-NEXT:    [[CMP1_1:%.*]] = icmp sgt i32 [[TMP49]], 0
+; AVX1-NEXT:    br i1 [[CMP1_1]], label [[IF_THEN_1:%.*]], label [[FOR_INC_1]]
+; AVX1:       for.end:
+; AVX1-NEXT:    ret void
+; AVX1:       if.then.1:
+; AVX1-NEXT:    [[ARRAYIDX3_1:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[INDVARS_IV_NEXT]]
+; AVX1-NEXT:    [[TMP50:%.*]] = load double, double* [[ARRAYIDX3_1]], align 8
+; AVX1-NEXT:    [[ADD_1:%.*]] = fadd double [[TMP50]], 5.000000e-01
+; AVX1-NEXT:    [[ARRAYIDX5_1:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV_NEXT]]
+; AVX1-NEXT:    store double [[ADD_1]], double* [[ARRAYIDX5_1]], align 8
+; AVX1-NEXT:    br label [[FOR_INC_1]]
+; AVX1:       for.inc.1:
+; AVX1-NEXT:    [[INDVARS_IV_NEXT_1]] = add nsw i64 [[INDVARS_IV]], -2
+; AVX1-NEXT:    [[CMP_1:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 0
+; AVX1-NEXT:    br i1 [[CMP_1]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !50
+;
+; AVX2-LABEL: @foo6(
+; AVX2-NEXT:  entry:
+; AVX2-NEXT:    [[SCEVGEP:%.*]] = getelementptr double, double* [[OUT:%.*]], i64 4096
+; AVX2-NEXT:    [[SCEVGEP9:%.*]] = getelementptr i32, i32* [[TRIGGER:%.*]], i64 4096
+; AVX2-NEXT:    [[SCEVGEP12:%.*]] = getelementptr double, double* [[IN:%.*]], i64 4096
+; AVX2-NEXT:    [[TMP0:%.*]] = bitcast i32* [[SCEVGEP9]] to double*
+; AVX2-NEXT:    [[BOUND0:%.*]] = icmp ugt double* [[TMP0]], [[OUT]]
+; AVX2-NEXT:    [[TMP1:%.*]] = bitcast double* [[SCEVGEP]] to i32*
+; AVX2-NEXT:    [[BOUND1:%.*]] = icmp ugt i32* [[TMP1]], [[TRIGGER]]
+; AVX2-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; AVX2-NEXT:    [[BOUND014:%.*]] = icmp ugt double* [[SCEVGEP12]], [[OUT]]
+; AVX2-NEXT:    [[BOUND115:%.*]] = icmp ugt double* [[SCEVGEP]], [[IN]]
+; AVX2-NEXT:    [[FOUND_CONFLICT16:%.*]] = and i1 [[BOUND014]], [[BOUND115]]
+; AVX2-NEXT:    [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT16]]
+; AVX2-NEXT:    br i1 [[CONFLICT_RDX]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY:%.*]]
+; AVX2:       vector.body:
+; AVX2-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
+; AVX2-NEXT:    [[OFFSET_IDX:%.*]] = sub i64 4095, [[INDEX]]
+; AVX2-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[OFFSET_IDX]]
+; AVX2-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 -3
+; AVX2-NEXT:    [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>*
+; AVX2-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4, !alias.scope !41
+; AVX2-NEXT:    [[REVERSE:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD]], <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; AVX2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 -4
+; AVX2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i64 -3
+; AVX2-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
+; AVX2-NEXT:    [[WIDE_LOAD20:%.*]] = load <4 x i32>, <4 x i32>* [[TMP7]], align 4, !alias.scope !41
+; AVX2-NEXT:    [[REVERSE21:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD20]], <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; AVX2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 -8
+; AVX2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP8]], i64 -3
+; AVX2-NEXT:    [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>*
+; AVX2-NEXT:    [[WIDE_LOAD22:%.*]] = load <4 x i32>, <4 x i32>* [[TMP10]], align 4, !alias.scope !41
+; AVX2-NEXT:    [[REVERSE23:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD22]], <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; AVX2-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 -12
+; AVX2-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP11]], i64 -3
+; AVX2-NEXT:    [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <4 x i32>*
+; AVX2-NEXT:    [[WIDE_LOAD24:%.*]] = load <4 x i32>, <4 x i32>* [[TMP13]], align 4, !alias.scope !41
+; AVX2-NEXT:    [[REVERSE25:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD24]], <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; AVX2-NEXT:    [[TMP14:%.*]] = icmp sgt <4 x i32> [[REVERSE]], zeroinitializer
+; AVX2-NEXT:    [[TMP15:%.*]] = icmp sgt <4 x i32> [[REVERSE21]], zeroinitializer
+; AVX2-NEXT:    [[TMP16:%.*]] = icmp sgt <4 x i32> [[REVERSE23]], zeroinitializer
+; AVX2-NEXT:    [[TMP17:%.*]] = icmp sgt <4 x i32> [[REVERSE25]], zeroinitializer
+; AVX2-NEXT:    [[TMP18:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[OFFSET_IDX]]
+; AVX2-NEXT:    [[TMP19:%.*]] = getelementptr inbounds double, double* [[TMP18]], i64 -3
+; AVX2-NEXT:    [[REVERSE26:%.*]] = shufflevector <4 x i1> [[TMP14]], <4 x i1> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; AVX2-NEXT:    [[TMP20:%.*]] = bitcast double* [[TMP19]] to <4 x double>*
+; AVX2-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* nonnull [[TMP20]], i32 8, <4 x i1> [[REVERSE26]], <4 x double> undef), !alias.scope !44
+; AVX2-NEXT:    [[TMP21:%.*]] = getelementptr inbounds double, double* [[TMP18]], i64 -4
+; AVX2-NEXT:    [[TMP22:%.*]] = getelementptr inbounds double, double* [[TMP21]], i64 -3
+; AVX2-NEXT:    [[REVERSE28:%.*]] = shufflevector <4 x i1> [[TMP15]], <4 x i1> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; AVX2-NEXT:    [[TMP23:%.*]] = bitcast double* [[TMP22]] to <4 x double>*
+; AVX2-NEXT:    [[WIDE_MASKED_LOAD29:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* nonnull [[TMP23]], i32 8, <4 x i1> [[REVERSE28]], <4 x double> undef), !alias.scope !44
+; AVX2-NEXT:    [[TMP24:%.*]] = getelementptr inbounds double, double* [[TMP18]], i64 -8
+; AVX2-NEXT:    [[TMP25:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 -3
+; AVX2-NEXT:    [[REVERSE31:%.*]] = shufflevector <4 x i1> [[TMP16]], <4 x i1> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; AVX2-NEXT:    [[TMP26:%.*]] = bitcast double* [[TMP25]] to <4 x double>*
+; AVX2-NEXT:    [[WIDE_MASKED_LOAD32:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* nonnull [[TMP26]], i32 8, <4 x i1> [[REVERSE31]], <4 x double> undef), !alias.scope !44
+; AVX2-NEXT:    [[TMP27:%.*]] = getelementptr inbounds double, double* [[TMP18]], i64 -12
+; AVX2-NEXT:    [[TMP28:%.*]] = getelementptr inbounds double, double* [[TMP27]], i64 -3
+; AVX2-NEXT:    [[REVERSE34:%.*]] = shufflevector <4 x i1> [[TMP17]], <4 x i1> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; AVX2-NEXT:    [[TMP29:%.*]] = bitcast double* [[TMP28]] to <4 x double>*
+; AVX2-NEXT:    [[WIDE_MASKED_LOAD35:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* nonnull [[TMP29]], i32 8, <4 x i1> [[REVERSE34]], <4 x double> undef), !alias.scope !44
+; AVX2-NEXT:    [[TMP30:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD]], <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>
+; AVX2-NEXT:    [[TMP31:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD29]], <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>
+; AVX2-NEXT:    [[TMP32:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD32]], <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>
+; AVX2-NEXT:    [[TMP33:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD35]], <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>
+; AVX2-NEXT:    [[TMP34:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[OFFSET_IDX]]
+; AVX2-NEXT:    [[TMP35:%.*]] = getelementptr inbounds double, double* [[TMP34]], i64 -3
+; AVX2-NEXT:    [[TMP36:%.*]] = bitcast double* [[TMP35]] to <4 x double>*
+; AVX2-NEXT:    call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP30]], <4 x double>* [[TMP36]], i32 8, <4 x i1> [[REVERSE26]]), !alias.scope !46, !noalias !48
+; AVX2-NEXT:    [[TMP37:%.*]] = getelementptr inbounds double, double* [[TMP34]], i64 -4
+; AVX2-NEXT:    [[TMP38:%.*]] = getelementptr inbounds double, double* [[TMP37]], i64 -3
+; AVX2-NEXT:    [[TMP39:%.*]] = bitcast double* [[TMP38]] to <4 x double>*
+; AVX2-NEXT:    call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP31]], <4 x double>* [[TMP39]], i32 8, <4 x i1> [[REVERSE28]]), !alias.scope !46, !noalias !48
+; AVX2-NEXT:    [[TMP40:%.*]] = getelementptr inbounds double, double* [[TMP34]], i64 -8
+; AVX2-NEXT:    [[TMP41:%.*]] = getelementptr inbounds double, double* [[TMP40]], i64 -3
+; AVX2-NEXT:    [[TMP42:%.*]] = bitcast double* [[TMP41]] to <4 x double>*
+; AVX2-NEXT:    call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP32]], <4 x double>* [[TMP42]], i32 8, <4 x i1> [[REVERSE31]]), !alias.scope !46, !noalias !48
+; AVX2-NEXT:    [[TMP43:%.*]] = getelementptr inbounds double, double* [[TMP34]], i64 -12
+; AVX2-NEXT:    [[TMP44:%.*]] = getelementptr inbounds double, double* [[TMP43]], i64 -3
+; AVX2-NEXT:    [[TMP45:%.*]] = bitcast double* [[TMP44]] to <4 x double>*
+; AVX2-NEXT:    call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP33]], <4 x double>* [[TMP45]], i32 8, <4 x i1> [[REVERSE34]]), !alias.scope !46, !noalias !48
+; AVX2-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 16
+; AVX2-NEXT:    [[TMP46:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
+; AVX2-NEXT:    br i1 [[TMP46]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop !49
+; AVX2:       for.body:
+; AVX2-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_3:%.*]], [[FOR_INC_3:%.*]] ], [ 4095, [[ENTRY]] ]
+; AVX2-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]]
+; AVX2-NEXT:    [[TMP47:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; AVX2-NEXT:    [[CMP1:%.*]] = icmp sgt i32 [[TMP47]], 0
+; AVX2-NEXT:    br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC:%.*]]
+; AVX2:       if.then:
+; AVX2-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[INDVARS_IV]]
+; AVX2-NEXT:    [[TMP48:%.*]] = load double, double* [[ARRAYIDX3]], align 8
+; AVX2-NEXT:    [[ADD:%.*]] = fadd double [[TMP48]], 5.000000e-01
+; AVX2-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV]]
+; AVX2-NEXT:    store double [[ADD]], double* [[ARRAYIDX5]], align 8
+; AVX2-NEXT:    br label [[FOR_INC]]
+; AVX2:       for.inc:
+; AVX2-NEXT:    [[INDVARS_IV_NEXT:%.*]] = add nsw i64 [[INDVARS_IV]], -1
+; AVX2-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT]]
+; AVX2-NEXT:    [[TMP49:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4
+; AVX2-NEXT:    [[CMP1_1:%.*]] = icmp sgt i32 [[TMP49]], 0
+; AVX2-NEXT:    br i1 [[CMP1_1]], label [[IF_THEN_1:%.*]], label [[FOR_INC_1:%.*]]
+; AVX2:       for.end:
+; AVX2-NEXT:    ret void
+; AVX2:       if.then.1:
+; AVX2-NEXT:    [[ARRAYIDX3_1:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[INDVARS_IV_NEXT]]
+; AVX2-NEXT:    [[TMP50:%.*]] = load double, double* [[ARRAYIDX3_1]], align 8
+; AVX2-NEXT:    [[ADD_1:%.*]] = fadd double [[TMP50]], 5.000000e-01
+; AVX2-NEXT:    [[ARRAYIDX5_1:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV_NEXT]]
+; AVX2-NEXT:    store double [[ADD_1]], double* [[ARRAYIDX5_1]], align 8
+; AVX2-NEXT:    br label [[FOR_INC_1]]
+; AVX2:       for.inc.1:
+; AVX2-NEXT:    [[INDVARS_IV_NEXT_1:%.*]] = add nsw i64 [[INDVARS_IV]], -2
+; AVX2-NEXT:    [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_1]]
+; AVX2-NEXT:    [[TMP51:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4
+; AVX2-NEXT:    [[CMP1_2:%.*]] = icmp sgt i32 [[TMP51]], 0
+; AVX2-NEXT:    br i1 [[CMP1_2]], label [[IF_THEN_2:%.*]], label [[FOR_INC_2:%.*]]
+; AVX2:       if.then.2:
+; AVX2-NEXT:    [[ARRAYIDX3_2:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[INDVARS_IV_NEXT_1]]
+; AVX2-NEXT:    [[TMP52:%.*]] = load double, double* [[ARRAYIDX3_2]], align 8
+; AVX2-NEXT:    [[ADD_2:%.*]] = fadd double [[TMP52]], 5.000000e-01
+; AVX2-NEXT:    [[ARRAYIDX5_2:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV_NEXT_1]]
+; AVX2-NEXT:    store double [[ADD_2]], double* [[ARRAYIDX5_2]], align 8
+; AVX2-NEXT:    br label [[FOR_INC_2]]
+; AVX2:       for.inc.2:
+; AVX2-NEXT:    [[INDVARS_IV_NEXT_2:%.*]] = add nsw i64 [[INDVARS_IV]], -3
+; AVX2-NEXT:    [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_2]]
+; AVX2-NEXT:    [[TMP53:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4
+; AVX2-NEXT:    [[CMP1_3:%.*]] = icmp sgt i32 [[TMP53]], 0
+; AVX2-NEXT:    br i1 [[CMP1_3]], label [[IF_THEN_3:%.*]], label [[FOR_INC_3]]
+; AVX2:       if.then.3:
+; AVX2-NEXT:    [[ARRAYIDX3_3:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[INDVARS_IV_NEXT_2]]
+; AVX2-NEXT:    [[TMP54:%.*]] = load double, double* [[ARRAYIDX3_3]], align 8
+; AVX2-NEXT:    [[ADD_3:%.*]] = fadd double [[TMP54]], 5.000000e-01
+; AVX2-NEXT:    [[ARRAYIDX5_3:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV_NEXT_2]]
+; AVX2-NEXT:    store double [[ADD_3]], double* [[ARRAYIDX5_3]], align 8
+; AVX2-NEXT:    br label [[FOR_INC_3]]
+; AVX2:       for.inc.3:
+; AVX2-NEXT:    [[INDVARS_IV_NEXT_3]] = add nsw i64 [[INDVARS_IV]], -4
+; AVX2-NEXT:    [[CMP_3:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_2]], 0
+; AVX2-NEXT:    br i1 [[CMP_3]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !50
+;
+; AVX512-LABEL: @foo6(
+; AVX512-NEXT:  entry:
+; AVX512-NEXT:    [[SCEVGEP:%.*]] = getelementptr double, double* [[OUT:%.*]], i64 4096
+; AVX512-NEXT:    [[SCEVGEP9:%.*]] = getelementptr i32, i32* [[TRIGGER:%.*]], i64 4096
+; AVX512-NEXT:    [[SCEVGEP12:%.*]] = getelementptr double, double* [[IN:%.*]], i64 4096
+; AVX512-NEXT:    [[TMP0:%.*]] = bitcast i32* [[SCEVGEP9]] to double*
+; AVX512-NEXT:    [[BOUND0:%.*]] = icmp ugt double* [[TMP0]], [[OUT]]
+; AVX512-NEXT:    [[TMP1:%.*]] = bitcast double* [[SCEVGEP]] to i32*
+; AVX512-NEXT:    [[BOUND1:%.*]] = icmp ugt i32* [[TMP1]], [[TRIGGER]]
+; AVX512-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; AVX512-NEXT:    [[BOUND014:%.*]] = icmp ugt double* [[SCEVGEP12]], [[OUT]]
+; AVX512-NEXT:    [[BOUND115:%.*]] = icmp ugt double* [[SCEVGEP]], [[IN]]
+; AVX512-NEXT:    [[FOUND_CONFLICT16:%.*]] = and i1 [[BOUND014]], [[BOUND115]]
+; AVX512-NEXT:    [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT16]]
+; AVX512-NEXT:    br i1 [[CONFLICT_RDX]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY:%.*]]
+; AVX512:       vector.body:
+; AVX512-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
+; AVX512-NEXT:    [[OFFSET_IDX:%.*]] = sub i64 4095, [[INDEX]]
+; AVX512-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[OFFSET_IDX]]
+; AVX512-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 -7
+; AVX512-NEXT:    [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <8 x i32>*
+; AVX512-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP4]], align 4, !alias.scope !53
+; AVX512-NEXT:    [[REVERSE:%.*]] = shufflevector <8 x i32> [[WIDE_LOAD]], <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX512-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 -8
+; AVX512-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i64 -7
+; AVX512-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <8 x i32>*
+; AVX512-NEXT:    [[WIDE_LOAD20:%.*]] = load <8 x i32>, <8 x i32>* [[TMP7]], align 4, !alias.scope !53
+; AVX512-NEXT:    [[REVERSE21:%.*]] = shufflevector <8 x i32> [[WIDE_LOAD20]], <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX512-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 -16
+; AVX512-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP8]], i64 -7
+; AVX512-NEXT:    [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <8 x i32>*
+; AVX512-NEXT:    [[WIDE_LOAD22:%.*]] = load <8 x i32>, <8 x i32>* [[TMP10]], align 4, !alias.scope !53
+; AVX512-NEXT:    [[REVERSE23:%.*]] = shufflevector <8 x i32> [[WIDE_LOAD22]], <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX512-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 -24
+; AVX512-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP11]], i64 -7
+; AVX512-NEXT:    [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <8 x i32>*
+; AVX512-NEXT:    [[WIDE_LOAD24:%.*]] = load <8 x i32>, <8 x i32>* [[TMP13]], align 4, !alias.scope !53
+; AVX512-NEXT:    [[REVERSE25:%.*]] = shufflevector <8 x i32> [[WIDE_LOAD24]], <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX512-NEXT:    [[TMP14:%.*]] = icmp sgt <8 x i32> [[REVERSE]], zeroinitializer
+; AVX512-NEXT:    [[TMP15:%.*]] = icmp sgt <8 x i32> [[REVERSE21]], zeroinitializer
+; AVX512-NEXT:    [[TMP16:%.*]] = icmp sgt <8 x i32> [[REVERSE23]], zeroinitializer
+; AVX512-NEXT:    [[TMP17:%.*]] = icmp sgt <8 x i32> [[REVERSE25]], zeroinitializer
+; AVX512-NEXT:    [[TMP18:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[OFFSET_IDX]]
+; AVX512-NEXT:    [[TMP19:%.*]] = getelementptr inbounds double, double* [[TMP18]], i64 -7
+; AVX512-NEXT:    [[REVERSE26:%.*]] = shufflevector <8 x i1> [[TMP14]], <8 x i1> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX512-NEXT:    [[TMP20:%.*]] = bitcast double* [[TMP19]] to <8 x double>*
+; AVX512-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* nonnull [[TMP20]], i32 8, <8 x i1> [[REVERSE26]], <8 x double> undef), !alias.scope !56
+; AVX512-NEXT:    [[TMP21:%.*]] = getelementptr inbounds double, double* [[TMP18]], i64 -8
+; AVX512-NEXT:    [[TMP22:%.*]] = getelementptr inbounds double, double* [[TMP21]], i64 -7
+; AVX512-NEXT:    [[REVERSE28:%.*]] = shufflevector <8 x i1> [[TMP15]], <8 x i1> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX512-NEXT:    [[TMP23:%.*]] = bitcast double* [[TMP22]] to <8 x double>*
+; AVX512-NEXT:    [[WIDE_MASKED_LOAD29:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* nonnull [[TMP23]], i32 8, <8 x i1> [[REVERSE28]], <8 x double> undef), !alias.scope !56
+; AVX512-NEXT:    [[TMP24:%.*]] = getelementptr inbounds double, double* [[TMP18]], i64 -16
+; AVX512-NEXT:    [[TMP25:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 -7
+; AVX512-NEXT:    [[REVERSE31:%.*]] = shufflevector <8 x i1> [[TMP16]], <8 x i1> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX512-NEXT:    [[TMP26:%.*]] = bitcast double* [[TMP25]] to <8 x double>*
+; AVX512-NEXT:    [[WIDE_MASKED_LOAD32:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* nonnull [[TMP26]], i32 8, <8 x i1> [[REVERSE31]], <8 x double> undef), !alias.scope !56
+; AVX512-NEXT:    [[TMP27:%.*]] = getelementptr inbounds double, double* [[TMP18]], i64 -24
+; AVX512-NEXT:    [[TMP28:%.*]] = getelementptr inbounds double, double* [[TMP27]], i64 -7
+; AVX512-NEXT:    [[REVERSE34:%.*]] = shufflevector <8 x i1> [[TMP17]], <8 x i1> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX512-NEXT:    [[TMP29:%.*]] = bitcast double* [[TMP28]] to <8 x double>*
+; AVX512-NEXT:    [[WIDE_MASKED_LOAD35:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* nonnull [[TMP29]], i32 8, <8 x i1> [[REVERSE34]], <8 x double> undef), !alias.scope !56
+; AVX512-NEXT:    [[TMP30:%.*]] = fadd <8 x double> [[WIDE_MASKED_LOAD]], <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>
+; AVX512-NEXT:    [[TMP31:%.*]] = fadd <8 x double> [[WIDE_MASKED_LOAD29]], <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>
+; AVX512-NEXT:    [[TMP32:%.*]] = fadd <8 x double> [[WIDE_MASKED_LOAD32]], <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>
+; AVX512-NEXT:    [[TMP33:%.*]] = fadd <8 x double> [[WIDE_MASKED_LOAD35]], <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>
+; AVX512-NEXT:    [[TMP34:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[OFFSET_IDX]]
+; AVX512-NEXT:    [[TMP35:%.*]] = getelementptr inbounds double, double* [[TMP34]], i64 -7
+; AVX512-NEXT:    [[TMP36:%.*]] = bitcast double* [[TMP35]] to <8 x double>*
+; AVX512-NEXT:    call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[TMP30]], <8 x double>* [[TMP36]], i32 8, <8 x i1> [[REVERSE26]]), !alias.scope !58, !noalias !60
+; AVX512-NEXT:    [[TMP37:%.*]] = getelementptr inbounds double, double* [[TMP34]], i64 -8
+; AVX512-NEXT:    [[TMP38:%.*]] = getelementptr inbounds double, double* [[TMP37]], i64 -7
+; AVX512-NEXT:    [[TMP39:%.*]] = bitcast double* [[TMP38]] to <8 x double>*
+; AVX512-NEXT:    call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[TMP31]], <8 x double>* [[TMP39]], i32 8, <8 x i1> [[REVERSE28]]), !alias.scope !58, !noalias !60
+; AVX512-NEXT:    [[TMP40:%.*]] = getelementptr inbounds double, double* [[TMP34]], i64 -16
+; AVX512-NEXT:    [[TMP41:%.*]] = getelementptr inbounds double, double* [[TMP40]], i64 -7
+; AVX512-NEXT:    [[TMP42:%.*]] = bitcast double* [[TMP41]] to <8 x double>*
+; AVX512-NEXT:    call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[TMP32]], <8 x double>* [[TMP42]], i32 8, <8 x i1> [[REVERSE31]]), !alias.scope !58, !noalias !60
+; AVX512-NEXT:    [[TMP43:%.*]] = getelementptr inbounds double, double* [[TMP34]], i64 -24
+; AVX512-NEXT:    [[TMP44:%.*]] = getelementptr inbounds double, double* [[TMP43]], i64 -7
+; AVX512-NEXT:    [[TMP45:%.*]] = bitcast double* [[TMP44]] to <8 x double>*
+; AVX512-NEXT:    call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[TMP33]], <8 x double>* [[TMP45]], i32 8, <8 x i1> [[REVERSE34]]), !alias.scope !58, !noalias !60
+; AVX512-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 32
+; AVX512-NEXT:    [[TMP46:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
+; AVX512-NEXT:    br i1 [[TMP46]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop !61
+; AVX512:       for.body:
+; AVX512-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_3:%.*]], [[FOR_INC_3:%.*]] ], [ 4095, [[ENTRY]] ]
+; AVX512-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]]
+; AVX512-NEXT:    [[TMP47:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; AVX512-NEXT:    [[CMP1:%.*]] = icmp sgt i32 [[TMP47]], 0
+; AVX512-NEXT:    br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC:%.*]]
+; AVX512:       if.then:
+; AVX512-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[INDVARS_IV]]
+; AVX512-NEXT:    [[TMP48:%.*]] = load double, double* [[ARRAYIDX3]], align 8
+; AVX512-NEXT:    [[ADD:%.*]] = fadd double [[TMP48]], 5.000000e-01
+; AVX512-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV]]
+; AVX512-NEXT:    store double [[ADD]], double* [[ARRAYIDX5]], align 8
+; AVX512-NEXT:    br label [[FOR_INC]]
+; AVX512:       for.inc:
+; AVX512-NEXT:    [[INDVARS_IV_NEXT:%.*]] = add nsw i64 [[INDVARS_IV]], -1
+; AVX512-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT]]
+; AVX512-NEXT:    [[TMP49:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4
+; AVX512-NEXT:    [[CMP1_1:%.*]] = icmp sgt i32 [[TMP49]], 0
+; AVX512-NEXT:    br i1 [[CMP1_1]], label [[IF_THEN_1:%.*]], label [[FOR_INC_1:%.*]]
+; AVX512:       for.end:
+; AVX512-NEXT:    ret void
+; AVX512:       if.then.1:
+; AVX512-NEXT:    [[ARRAYIDX3_1:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[INDVARS_IV_NEXT]]
+; AVX512-NEXT:    [[TMP50:%.*]] = load double, double* [[ARRAYIDX3_1]], align 8
+; AVX512-NEXT:    [[ADD_1:%.*]] = fadd double [[TMP50]], 5.000000e-01
+; AVX512-NEXT:    [[ARRAYIDX5_1:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV_NEXT]]
+; AVX512-NEXT:    store double [[ADD_1]], double* [[ARRAYIDX5_1]], align 8
+; AVX512-NEXT:    br label [[FOR_INC_1]]
+; AVX512:       for.inc.1:
+; AVX512-NEXT:    [[INDVARS_IV_NEXT_1:%.*]] = add nsw i64 [[INDVARS_IV]], -2
+; AVX512-NEXT:    [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_1]]
+; AVX512-NEXT:    [[TMP51:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4
+; AVX512-NEXT:    [[CMP1_2:%.*]] = icmp sgt i32 [[TMP51]], 0
+; AVX512-NEXT:    br i1 [[CMP1_2]], label [[IF_THEN_2:%.*]], label [[FOR_INC_2:%.*]]
+; AVX512:       if.then.2:
+; AVX512-NEXT:    [[ARRAYIDX3_2:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[INDVARS_IV_NEXT_1]]
+; AVX512-NEXT:    [[TMP52:%.*]] = load double, double* [[ARRAYIDX3_2]], align 8
+; AVX512-NEXT:    [[ADD_2:%.*]] = fadd double [[TMP52]], 5.000000e-01
+; AVX512-NEXT:    [[ARRAYIDX5_2:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV_NEXT_1]]
+; AVX512-NEXT:    store double [[ADD_2]], double* [[ARRAYIDX5_2]], align 8
+; AVX512-NEXT:    br label [[FOR_INC_2]]
+; AVX512:       for.inc.2:
+; AVX512-NEXT:    [[INDVARS_IV_NEXT_2:%.*]] = add nsw i64 [[INDVARS_IV]], -3
+; AVX512-NEXT:    [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_2]]
+; AVX512-NEXT:    [[TMP53:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4
+; AVX512-NEXT:    [[CMP1_3:%.*]] = icmp sgt i32 [[TMP53]], 0
+; AVX512-NEXT:    br i1 [[CMP1_3]], label [[IF_THEN_3:%.*]], label [[FOR_INC_3]]
+; AVX512:       if.then.3:
+; AVX512-NEXT:    [[ARRAYIDX3_3:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[INDVARS_IV_NEXT_2]]
+; AVX512-NEXT:    [[TMP54:%.*]] = load double, double* [[ARRAYIDX3_3]], align 8
+; AVX512-NEXT:    [[ADD_3:%.*]] = fadd double [[TMP54]], 5.000000e-01
+; AVX512-NEXT:    [[ARRAYIDX5_3:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV_NEXT_2]]
+; AVX512-NEXT:    store double [[ADD_3]], double* [[ARRAYIDX5_3]], align 8
+; AVX512-NEXT:    br label [[FOR_INC_3]]
+; AVX512:       for.inc.3:
+; AVX512-NEXT:    [[INDVARS_IV_NEXT_3]] = add nsw i64 [[INDVARS_IV]], -4
+; AVX512-NEXT:    [[CMP_3:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_2]], 0
+; AVX512-NEXT:    br i1 [[CMP_3]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !62
+;
+entry:
+  %in.addr = alloca double*, align 8
+  %out.addr = alloca double*, align 8
+  %size.addr = alloca i32, align 4
+  %trigger.addr = alloca i32*, align 8
+  %i = alloca i32, align 4
+  store double* %in, double** %in.addr, align 8
+  store double* %out, double** %out.addr, align 8
+  store i32 %size, i32* %size.addr, align 4
+  store i32* %trigger, i32** %trigger.addr, align 8
+  store i32 4095, i32* %i, align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %0 = load i32, i32* %i, align 4
+  %cmp = icmp sge i32 %0, 0
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %1 = load i32, i32* %i, align 4
+  %idxprom = sext i32 %1 to i64
+  %2 = load i32*, i32** %trigger.addr, align 8
+  %arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom
+  %3 = load i32, i32* %arrayidx, align 4
+  %cmp1 = icmp sgt i32 %3, 0
+  br i1 %cmp1, label %if.then, label %if.end
+
+if.then:                                          ; preds = %for.body
+  %4 = load i32, i32* %i, align 4
+  %idxprom2 = sext i32 %4 to i64
+  %5 = load double*, double** %in.addr, align 8
+  %arrayidx3 = getelementptr inbounds double, double* %5, i64 %idxprom2
+  %6 = load double, double* %arrayidx3, align 8
+  %add = fadd double %6, 5.000000e-01
+  %7 = load i32, i32* %i, align 4
+  %idxprom4 = sext i32 %7 to i64
+  %8 = load double*, double** %out.addr, align 8
+  %arrayidx5 = getelementptr inbounds double, double* %8, i64 %idxprom4
+  store double %add, double* %arrayidx5, align 8
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %for.body
+  br label %for.inc
+
+for.inc:                                          ; preds = %if.end
+  %9 = load i32, i32* %i, align 4
+  %dec = add nsw i32 %9, -1
+  store i32 %dec, i32* %i, align 4
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}
+
+; void foo7 (double * __restrict__  out, double ** __restrict__  in,
+;           bool * __restrict__ trigger, unsigned size) {
+;
+;  for (unsigned i=0; i<size; i++)
+;    if (trigger[i] && (in[i] != 0))
+;      out[i] = (double) 0.5;
+; }
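+; Note on the checks that follow (a summary of what the autogenerated
+; assertions verify, not part of the original C source): because in[i] is a
+; double* that is only read when trigger[i] is set, the vector body is
+; expected to load the pointer elements with llvm.masked.load using the mask
+; obtained by truncating the trigger bytes to <N x i1>, AND the non-null
+; compare into that mask, and emit the 0.5 stores as llvm.masked.store of a
+; splat constant.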
+
+define void @foo7(double* noalias %out, double** noalias %in, i8* noalias %trigger, i32 %size) #0 {
+; AVX-LABEL: @foo7(
+; AVX-NEXT:  entry:
+; AVX-NEXT:    [[CMP5:%.*]] = icmp eq i32 [[SIZE:%.*]], 0
+; AVX-NEXT:    br i1 [[CMP5]], label [[FOR_END:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
+; AVX:       for.body.preheader:
+; AVX-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[SIZE]] to i64
+; AVX-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[SIZE]], 16
+; AVX-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY_PREHEADER16:%.*]], label [[VECTOR_PH:%.*]]
+; AVX:       vector.ph:
+; AVX-NEXT:    [[N_VEC:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 4294967280
+; AVX-NEXT:    br label [[VECTOR_BODY:%.*]]
+; AVX:       vector.body:
+; AVX-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; AVX-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER:%.*]], i64 [[INDEX]]
+; AVX-NEXT:    [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <4 x i8>*
+; AVX-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
+; AVX-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[TMP0]], i64 4
+; AVX-NEXT:    [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <4 x i8>*
+; AVX-NEXT:    [[WIDE_LOAD10:%.*]] = load <4 x i8>, <4 x i8>* [[TMP3]], align 1
+; AVX-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[TMP0]], i64 8
+; AVX-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP4]] to <4 x i8>*
+; AVX-NEXT:    [[WIDE_LOAD11:%.*]] = load <4 x i8>, <4 x i8>* [[TMP5]], align 1
+; AVX-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[TMP0]], i64 12
+; AVX-NEXT:    [[TMP7:%.*]] = bitcast i8* [[TMP6]] to <4 x i8>*
+; AVX-NEXT:    [[WIDE_LOAD12:%.*]] = load <4 x i8>, <4 x i8>* [[TMP7]], align 1
+; AVX-NEXT:    [[TMP8:%.*]] = trunc <4 x i8> [[WIDE_LOAD]] to <4 x i1>
+; AVX-NEXT:    [[TMP9:%.*]] = trunc <4 x i8> [[WIDE_LOAD10]] to <4 x i1>
+; AVX-NEXT:    [[TMP10:%.*]] = trunc <4 x i8> [[WIDE_LOAD11]] to <4 x i1>
+; AVX-NEXT:    [[TMP11:%.*]] = trunc <4 x i8> [[WIDE_LOAD12]] to <4 x i1>
+; AVX-NEXT:    [[TMP12:%.*]] = getelementptr inbounds double*, double** [[IN:%.*]], i64 [[INDEX]]
+; AVX-NEXT:    [[TMP13:%.*]] = bitcast double** [[TMP12]] to <4 x double*>*
+; AVX-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP13]], i32 8, <4 x i1> [[TMP8]], <4 x double*> undef)
+; AVX-NEXT:    [[TMP14:%.*]] = getelementptr inbounds double*, double** [[TMP12]], i64 4
+; AVX-NEXT:    [[TMP15:%.*]] = bitcast double** [[TMP14]] to <4 x double*>*
+; AVX-NEXT:    [[WIDE_MASKED_LOAD13:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* nonnull [[TMP15]], i32 8, <4 x i1> [[TMP9]], <4 x double*> undef)
+; AVX-NEXT:    [[TMP16:%.*]] = getelementptr inbounds double*, double** [[TMP12]], i64 8
+; AVX-NEXT:    [[TMP17:%.*]] = bitcast double** [[TMP16]] to <4 x double*>*
+; AVX-NEXT:    [[WIDE_MASKED_LOAD14:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* nonnull [[TMP17]], i32 8, <4 x i1> [[TMP10]], <4 x double*> undef)
+; AVX-NEXT:    [[TMP18:%.*]] = getelementptr inbounds double*, double** [[TMP12]], i64 12
+; AVX-NEXT:    [[TMP19:%.*]] = bitcast double** [[TMP18]] to <4 x double*>*
+; AVX-NEXT:    [[WIDE_MASKED_LOAD15:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* nonnull [[TMP19]], i32 8, <4 x i1> [[TMP11]], <4 x double*> undef)
+; AVX-NEXT:    [[TMP20:%.*]] = icmp ne <4 x double*> [[WIDE_MASKED_LOAD]], zeroinitializer
+; AVX-NEXT:    [[TMP21:%.*]] = icmp ne <4 x double*> [[WIDE_MASKED_LOAD13]], zeroinitializer
+; AVX-NEXT:    [[TMP22:%.*]] = icmp ne <4 x double*> [[WIDE_MASKED_LOAD14]], zeroinitializer
+; AVX-NEXT:    [[TMP23:%.*]] = icmp ne <4 x double*> [[WIDE_MASKED_LOAD15]], zeroinitializer
+; AVX-NEXT:    [[TMP24:%.*]] = getelementptr inbounds double, double* [[OUT:%.*]], i64 [[INDEX]]
+; AVX-NEXT:    [[TMP25:%.*]] = and <4 x i1> [[TMP20]], [[TMP8]]
+; AVX-NEXT:    [[TMP26:%.*]] = and <4 x i1> [[TMP21]], [[TMP9]]
+; AVX-NEXT:    [[TMP27:%.*]] = and <4 x i1> [[TMP22]], [[TMP10]]
+; AVX-NEXT:    [[TMP28:%.*]] = and <4 x i1> [[TMP23]], [[TMP11]]
+; AVX-NEXT:    [[TMP29:%.*]] = bitcast double* [[TMP24]] to <4 x double>*
+; AVX-NEXT:    call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP29]], i32 8, <4 x i1> [[TMP25]])
+; AVX-NEXT:    [[TMP30:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 4
+; AVX-NEXT:    [[TMP31:%.*]] = bitcast double* [[TMP30]] to <4 x double>*
+; AVX-NEXT:    call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP31]], i32 8, <4 x i1> [[TMP26]])
+; AVX-NEXT:    [[TMP32:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 8
+; AVX-NEXT:    [[TMP33:%.*]] = bitcast double* [[TMP32]] to <4 x double>*
+; AVX-NEXT:    call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP33]], i32 8, <4 x i1> [[TMP27]])
+; AVX-NEXT:    [[TMP34:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 12
+; AVX-NEXT:    [[TMP35:%.*]] = bitcast double* [[TMP34]] to <4 x double>*
+; AVX-NEXT:    call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP35]], i32 8, <4 x i1> [[TMP28]])
+; AVX-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 16
+; AVX-NEXT:    [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; AVX-NEXT:    br i1 [[TMP36]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !51
+; AVX:       middle.block:
+; AVX-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[WIDE_TRIP_COUNT]]
+; AVX-NEXT:    br i1 [[CMP_N]], label [[FOR_END]], label [[FOR_BODY_PREHEADER16]]
+; AVX:       for.body.preheader16:
+; AVX-NEXT:    [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ]
+; AVX-NEXT:    br label [[FOR_BODY:%.*]]
+; AVX:       for.body:
+; AVX-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ], [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER16]] ]
+; AVX-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[INDVARS_IV]]
+; AVX-NEXT:    [[TMP37:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
+; AVX-NEXT:    [[TMP38:%.*]] = and i8 [[TMP37]], 1
+; AVX-NEXT:    [[TOBOOL:%.*]] = icmp eq i8 [[TMP38]], 0
+; AVX-NEXT:    br i1 [[TOBOOL]], label [[FOR_INC]], label [[LAND_LHS_TRUE:%.*]]
+; AVX:       land.lhs.true:
+; AVX-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds double*, double** [[IN]], i64 [[INDVARS_IV]]
+; AVX-NEXT:    [[TMP39:%.*]] = load double*, double** [[ARRAYIDX2]], align 8
+; AVX-NEXT:    [[CMP3:%.*]] = icmp eq double* [[TMP39]], null
+; AVX-NEXT:    br i1 [[CMP3]], label [[FOR_INC]], label [[IF_THEN:%.*]]
+; AVX:       if.then:
+; AVX-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV]]
+; AVX-NEXT:    store double 5.000000e-01, double* [[ARRAYIDX5]], align 8
+; AVX-NEXT:    br label [[FOR_INC]]
+; AVX:       for.inc:
+; AVX-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; AVX-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; AVX-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !52
+; AVX:       for.end:
+; AVX-NEXT:    ret void
+;
+; AVX512-LABEL: @foo7(
+; AVX512-NEXT:  entry:
+; AVX512-NEXT:    [[CMP5:%.*]] = icmp eq i32 [[SIZE:%.*]], 0
+; AVX512-NEXT:    br i1 [[CMP5]], label [[FOR_END:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
+; AVX512:       for.body.preheader:
+; AVX512-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[SIZE]] to i64
+; AVX512-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[SIZE]], 32
+; AVX512-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY_PREHEADER16:%.*]], label [[VECTOR_PH:%.*]]
+; AVX512:       vector.ph:
+; AVX512-NEXT:    [[N_VEC:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 4294967264
+; AVX512-NEXT:    br label [[VECTOR_BODY:%.*]]
+; AVX512:       vector.body:
+; AVX512-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; AVX512-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER:%.*]], i64 [[INDEX]]
+; AVX512-NEXT:    [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
+; AVX512-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1
+; AVX512-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[TMP0]], i64 8
+; AVX512-NEXT:    [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <8 x i8>*
+; AVX512-NEXT:    [[WIDE_LOAD10:%.*]] = load <8 x i8>, <8 x i8>* [[TMP3]], align 1
+; AVX512-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[TMP0]], i64 16
+; AVX512-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP4]] to <8 x i8>*
+; AVX512-NEXT:    [[WIDE_LOAD11:%.*]] = load <8 x i8>, <8 x i8>* [[TMP5]], align 1
+; AVX512-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[TMP0]], i64 24
+; AVX512-NEXT:    [[TMP7:%.*]] = bitcast i8* [[TMP6]] to <8 x i8>*
+; AVX512-NEXT:    [[WIDE_LOAD12:%.*]] = load <8 x i8>, <8 x i8>* [[TMP7]], align 1
+; AVX512-NEXT:    [[TMP8:%.*]] = trunc <8 x i8> [[WIDE_LOAD]] to <8 x i1>
+; AVX512-NEXT:    [[TMP9:%.*]] = trunc <8 x i8> [[WIDE_LOAD10]] to <8 x i1>
+; AVX512-NEXT:    [[TMP10:%.*]] = trunc <8 x i8> [[WIDE_LOAD11]] to <8 x i1>
+; AVX512-NEXT:    [[TMP11:%.*]] = trunc <8 x i8> [[WIDE_LOAD12]] to <8 x i1>
+; AVX512-NEXT:    [[TMP12:%.*]] = getelementptr inbounds double*, double** [[IN:%.*]], i64 [[INDEX]]
+; AVX512-NEXT:    [[TMP13:%.*]] = bitcast double** [[TMP12]] to <8 x double*>*
+; AVX512-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <8 x double*> @llvm.masked.load.v8p0f64.p0v8p0f64(<8 x double*>* [[TMP13]], i32 8, <8 x i1> [[TMP8]], <8 x double*> undef)
+; AVX512-NEXT:    [[TMP14:%.*]] = getelementptr inbounds double*, double** [[TMP12]], i64 8
+; AVX512-NEXT:    [[TMP15:%.*]] = bitcast double** [[TMP14]] to <8 x double*>*
+; AVX512-NEXT:    [[WIDE_MASKED_LOAD13:%.*]] = call <8 x double*> @llvm.masked.load.v8p0f64.p0v8p0f64(<8 x double*>* nonnull [[TMP15]], i32 8, <8 x i1> [[TMP9]], <8 x double*> undef)
+; AVX512-NEXT:    [[TMP16:%.*]] = getelementptr inbounds double*, double** [[TMP12]], i64 16
+; AVX512-NEXT:    [[TMP17:%.*]] = bitcast double** [[TMP16]] to <8 x double*>*
+; AVX512-NEXT:    [[WIDE_MASKED_LOAD14:%.*]] = call <8 x double*> @llvm.masked.load.v8p0f64.p0v8p0f64(<8 x double*>* nonnull [[TMP17]], i32 8, <8 x i1> [[TMP10]], <8 x double*> undef)
+; AVX512-NEXT:    [[TMP18:%.*]] = getelementptr inbounds double*, double** [[TMP12]], i64 24
+; AVX512-NEXT:    [[TMP19:%.*]] = bitcast double** [[TMP18]] to <8 x double*>*
+; AVX512-NEXT:    [[WIDE_MASKED_LOAD15:%.*]] = call <8 x double*> @llvm.masked.load.v8p0f64.p0v8p0f64(<8 x double*>* nonnull [[TMP19]], i32 8, <8 x i1> [[TMP11]], <8 x double*> undef)
+; AVX512-NEXT:    [[TMP20:%.*]] = icmp ne <8 x double*> [[WIDE_MASKED_LOAD]], zeroinitializer
+; AVX512-NEXT:    [[TMP21:%.*]] = icmp ne <8 x double*> [[WIDE_MASKED_LOAD13]], zeroinitializer
+; AVX512-NEXT:    [[TMP22:%.*]] = icmp ne <8 x double*> [[WIDE_MASKED_LOAD14]], zeroinitializer
+; AVX512-NEXT:    [[TMP23:%.*]] = icmp ne <8 x double*> [[WIDE_MASKED_LOAD15]], zeroinitializer
+; AVX512-NEXT:    [[TMP24:%.*]] = getelementptr inbounds double, double* [[OUT:%.*]], i64 [[INDEX]]
+; AVX512-NEXT:    [[TMP25:%.*]] = and <8 x i1> [[TMP20]], [[TMP8]]
+; AVX512-NEXT:    [[TMP26:%.*]] = and <8 x i1> [[TMP21]], [[TMP9]]
+; AVX512-NEXT:    [[TMP27:%.*]] = and <8 x i1> [[TMP22]], [[TMP10]]
+; AVX512-NEXT:    [[TMP28:%.*]] = and <8 x i1> [[TMP23]], [[TMP11]]
+; AVX512-NEXT:    [[TMP29:%.*]] = bitcast double* [[TMP24]] to <8 x double>*
+; AVX512-NEXT:    call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <8 x double>* [[TMP29]], i32 8, <8 x i1> [[TMP25]])
+; AVX512-NEXT:    [[TMP30:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 8
+; AVX512-NEXT:    [[TMP31:%.*]] = bitcast double* [[TMP30]] to <8 x double>*
+; AVX512-NEXT:    call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <8 x double>* [[TMP31]], i32 8, <8 x i1> [[TMP26]])
+; AVX512-NEXT:    [[TMP32:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 16
+; AVX512-NEXT:    [[TMP33:%.*]] = bitcast double* [[TMP32]] to <8 x double>*
+; AVX512-NEXT:    call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <8 x double>* [[TMP33]], i32 8, <8 x i1> [[TMP27]])
+; AVX512-NEXT:    [[TMP34:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 24
+; AVX512-NEXT:    [[TMP35:%.*]] = bitcast double* [[TMP34]] to <8 x double>*
+; AVX512-NEXT:    call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <8 x double>* [[TMP35]], i32 8, <8 x i1> [[TMP28]])
+; AVX512-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 32
+; AVX512-NEXT:    [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; AVX512-NEXT:    br i1 [[TMP36]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !63
+; AVX512:       middle.block:
+; AVX512-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[WIDE_TRIP_COUNT]]
+; AVX512-NEXT:    br i1 [[CMP_N]], label [[FOR_END]], label [[FOR_BODY_PREHEADER16]]
+; AVX512:       for.body.preheader16:
+; AVX512-NEXT:    [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ]
+; AVX512-NEXT:    br label [[FOR_BODY:%.*]]
+; AVX512:       for.body:
+; AVX512-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ], [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER16]] ]
+; AVX512-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[INDVARS_IV]]
+; AVX512-NEXT:    [[TMP37:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
+; AVX512-NEXT:    [[TMP38:%.*]] = and i8 [[TMP37]], 1
+; AVX512-NEXT:    [[TOBOOL:%.*]] = icmp eq i8 [[TMP38]], 0
+; AVX512-NEXT:    br i1 [[TOBOOL]], label [[FOR_INC]], label [[LAND_LHS_TRUE:%.*]]
+; AVX512:       land.lhs.true:
+; AVX512-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds double*, double** [[IN]], i64 [[INDVARS_IV]]
+; AVX512-NEXT:    [[TMP39:%.*]] = load double*, double** [[ARRAYIDX2]], align 8
+; AVX512-NEXT:    [[CMP3:%.*]] = icmp eq double* [[TMP39]], null
+; AVX512-NEXT:    br i1 [[CMP3]], label [[FOR_INC]], label [[IF_THEN:%.*]]
+; AVX512:       if.then:
+; AVX512-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV]]
+; AVX512-NEXT:    store double 5.000000e-01, double* [[ARRAYIDX5]], align 8
+; AVX512-NEXT:    br label [[FOR_INC]]
+; AVX512:       for.inc:
+; AVX512-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; AVX512-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; AVX512-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !64
+; AVX512:       for.end:
+; AVX512-NEXT:    ret void
+;
+entry:
+  %out.addr = alloca double*, align 8
+  %in.addr = alloca double**, align 8
+  %trigger.addr = alloca i8*, align 8
+  %size.addr = alloca i32, align 4
+  %i = alloca i32, align 4
+  store double* %out, double** %out.addr, align 8
+  store double** %in, double*** %in.addr, align 8
+  store i8* %trigger, i8** %trigger.addr, align 8
+  store i32 %size, i32* %size.addr, align 4
+  store i32 0, i32* %i, align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %0 = load i32, i32* %i, align 4
+  %1 = load i32, i32* %size.addr, align 4
+  %cmp = icmp ult i32 %0, %1
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %2 = load i32, i32* %i, align 4
+  %idxprom = zext i32 %2 to i64
+  %3 = load i8*, i8** %trigger.addr, align 8
+  %arrayidx = getelementptr inbounds i8, i8* %3, i64 %idxprom
+  %4 = load i8, i8* %arrayidx, align 1
+  %tobool = trunc i8 %4 to i1
+  br i1 %tobool, label %land.lhs.true, label %if.end
+
+land.lhs.true:                                    ; preds = %for.body
+  %5 = load i32, i32* %i, align 4
+  %idxprom1 = zext i32 %5 to i64
+  %6 = load double**, double*** %in.addr, align 8
+  %arrayidx2 = getelementptr inbounds double*, double** %6, i64 %idxprom1
+  %7 = load double*, double** %arrayidx2, align 8
+  %cmp3 = icmp ne double* %7, null
+  br i1 %cmp3, label %if.then, label %if.end
+
+if.then:                                          ; preds = %land.lhs.true
+  %8 = load i32, i32* %i, align 4
+  %idxprom4 = zext i32 %8 to i64
+  %9 = load double*, double** %out.addr, align 8
+  %arrayidx5 = getelementptr inbounds double, double* %9, i64 %idxprom4
+  store double 5.000000e-01, double* %arrayidx5, align 8
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %land.lhs.true, %for.body
+  br label %for.inc
+
+for.inc:                                          ; preds = %if.end
+  %10 = load i32, i32* %i, align 4
+  %inc = add i32 %10, 1
+  store i32 %inc, i32* %i, align 4
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}
+
+;typedef int (*fp)();
+;void foo8 (double* __restrict__  out, fp* __restrict__ in, bool * __restrict__ trigger, unsigned size) {
+;
+;  for (unsigned i=0; i<size; i++)
+;    if (trigger[i] && (in[i] != 0))
+;      out[i] = (double) 0.5;
+;}
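+; Note on the checks that follow (a summary of the autogenerated assertions
+; below): foo8 exercises the same pattern with an array of function pointers,
+; so the in[i] elements are read with llvm.masked.load of <N x i32 ()*> under
+; the trigger-derived mask, compared against null, and the combined mask
+; guards the llvm.masked.store of 0.5 into out[i].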
+
+define void @foo8(double* noalias %out, i32 ()** noalias %in, i8* noalias %trigger, i32 %size) #0 {
+; AVX-LABEL: @foo8(
+; AVX-NEXT:  entry:
+; AVX-NEXT:    [[CMP5:%.*]] = icmp eq i32 [[SIZE:%.*]], 0
+; AVX-NEXT:    br i1 [[CMP5]], label [[FOR_END:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
+; AVX:       for.body.preheader:
+; AVX-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[SIZE]] to i64
+; AVX-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[SIZE]], 16
+; AVX-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY_PREHEADER16:%.*]], label [[VECTOR_PH:%.*]]
+; AVX:       vector.ph:
+; AVX-NEXT:    [[N_VEC:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 4294967280
+; AVX-NEXT:    br label [[VECTOR_BODY:%.*]]
+; AVX:       vector.body:
+; AVX-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; AVX-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER:%.*]], i64 [[INDEX]]
+; AVX-NEXT:    [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <4 x i8>*
+; AVX-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
+; AVX-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[TMP0]], i64 4
+; AVX-NEXT:    [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <4 x i8>*
+; AVX-NEXT:    [[WIDE_LOAD10:%.*]] = load <4 x i8>, <4 x i8>* [[TMP3]], align 1
+; AVX-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[TMP0]], i64 8
+; AVX-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP4]] to <4 x i8>*
+; AVX-NEXT:    [[WIDE_LOAD11:%.*]] = load <4 x i8>, <4 x i8>* [[TMP5]], align 1
+; AVX-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[TMP0]], i64 12
+; AVX-NEXT:    [[TMP7:%.*]] = bitcast i8* [[TMP6]] to <4 x i8>*
+; AVX-NEXT:    [[WIDE_LOAD12:%.*]] = load <4 x i8>, <4 x i8>* [[TMP7]], align 1
+; AVX-NEXT:    [[TMP8:%.*]] = trunc <4 x i8> [[WIDE_LOAD]] to <4 x i1>
+; AVX-NEXT:    [[TMP9:%.*]] = trunc <4 x i8> [[WIDE_LOAD10]] to <4 x i1>
+; AVX-NEXT:    [[TMP10:%.*]] = trunc <4 x i8> [[WIDE_LOAD11]] to <4 x i1>
+; AVX-NEXT:    [[TMP11:%.*]] = trunc <4 x i8> [[WIDE_LOAD12]] to <4 x i1>
+; AVX-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN:%.*]], i64 [[INDEX]]
+; AVX-NEXT:    [[TMP13:%.*]] = bitcast i32 ()** [[TMP12]] to <4 x i32 ()*>*
+; AVX-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP13]], i32 8, <4 x i1> [[TMP8]], <4 x i32 ()*> undef)
+; AVX-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP12]], i64 4
+; AVX-NEXT:    [[TMP15:%.*]] = bitcast i32 ()** [[TMP14]] to <4 x i32 ()*>*
+; AVX-NEXT:    [[WIDE_MASKED_LOAD13:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* nonnull [[TMP15]], i32 8, <4 x i1> [[TMP9]], <4 x i32 ()*> undef)
+; AVX-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP12]], i64 8
+; AVX-NEXT:    [[TMP17:%.*]] = bitcast i32 ()** [[TMP16]] to <4 x i32 ()*>*
+; AVX-NEXT:    [[WIDE_MASKED_LOAD14:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* nonnull [[TMP17]], i32 8, <4 x i1> [[TMP10]], <4 x i32 ()*> undef)
+; AVX-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP12]], i64 12
+; AVX-NEXT:    [[TMP19:%.*]] = bitcast i32 ()** [[TMP18]] to <4 x i32 ()*>*
+; AVX-NEXT:    [[WIDE_MASKED_LOAD15:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* nonnull [[TMP19]], i32 8, <4 x i1> [[TMP11]], <4 x i32 ()*> undef)
+; AVX-NEXT:    [[TMP20:%.*]] = icmp ne <4 x i32 ()*> [[WIDE_MASKED_LOAD]], zeroinitializer
+; AVX-NEXT:    [[TMP21:%.*]] = icmp ne <4 x i32 ()*> [[WIDE_MASKED_LOAD13]], zeroinitializer
+; AVX-NEXT:    [[TMP22:%.*]] = icmp ne <4 x i32 ()*> [[WIDE_MASKED_LOAD14]], zeroinitializer
+; AVX-NEXT:    [[TMP23:%.*]] = icmp ne <4 x i32 ()*> [[WIDE_MASKED_LOAD15]], zeroinitializer
+; AVX-NEXT:    [[TMP24:%.*]] = getelementptr inbounds double, double* [[OUT:%.*]], i64 [[INDEX]]
+; AVX-NEXT:    [[TMP25:%.*]] = and <4 x i1> [[TMP20]], [[TMP8]]
+; AVX-NEXT:    [[TMP26:%.*]] = and <4 x i1> [[TMP21]], [[TMP9]]
+; AVX-NEXT:    [[TMP27:%.*]] = and <4 x i1> [[TMP22]], [[TMP10]]
+; AVX-NEXT:    [[TMP28:%.*]] = and <4 x i1> [[TMP23]], [[TMP11]]
+; AVX-NEXT:    [[TMP29:%.*]] = bitcast double* [[TMP24]] to <4 x double>*
+; AVX-NEXT:    call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP29]], i32 8, <4 x i1> [[TMP25]])
+; AVX-NEXT:    [[TMP30:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 4
+; AVX-NEXT:    [[TMP31:%.*]] = bitcast double* [[TMP30]] to <4 x double>*
+; AVX-NEXT:    call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP31]], i32 8, <4 x i1> [[TMP26]])
+; AVX-NEXT:    [[TMP32:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 8
+; AVX-NEXT:    [[TMP33:%.*]] = bitcast double* [[TMP32]] to <4 x double>*
+; AVX-NEXT:    call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP33]], i32 8, <4 x i1> [[TMP27]])
+; AVX-NEXT:    [[TMP34:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 12
+; AVX-NEXT:    [[TMP35:%.*]] = bitcast double* [[TMP34]] to <4 x double>*
+; AVX-NEXT:    call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP35]], i32 8, <4 x i1> [[TMP28]])
+; AVX-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 16
+; AVX-NEXT:    [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; AVX-NEXT:    br i1 [[TMP36]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !54
+; AVX:       middle.block:
+; AVX-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[WIDE_TRIP_COUNT]]
+; AVX-NEXT:    br i1 [[CMP_N]], label [[FOR_END]], label [[FOR_BODY_PREHEADER16]]
+; AVX:       for.body.preheader16:
+; AVX-NEXT:    [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ]
+; AVX-NEXT:    br label [[FOR_BODY:%.*]]
+; AVX:       for.body:
+; AVX-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ], [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER16]] ]
+; AVX-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[INDVARS_IV]]
+; AVX-NEXT:    [[TMP37:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
+; AVX-NEXT:    [[TMP38:%.*]] = and i8 [[TMP37]], 1
+; AVX-NEXT:    [[TOBOOL:%.*]] = icmp eq i8 [[TMP38]], 0
+; AVX-NEXT:    br i1 [[TOBOOL]], label [[FOR_INC]], label [[LAND_LHS_TRUE:%.*]]
+; AVX:       land.lhs.true:
+; AVX-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN]], i64 [[INDVARS_IV]]
+; AVX-NEXT:    [[TMP39:%.*]] = load i32 ()*, i32 ()** [[ARRAYIDX2]], align 8
+; AVX-NEXT:    [[CMP3:%.*]] = icmp eq i32 ()* [[TMP39]], null
+; AVX-NEXT:    br i1 [[CMP3]], label [[FOR_INC]], label [[IF_THEN:%.*]]
+; AVX:       if.then:
+; AVX-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV]]
+; AVX-NEXT:    store double 5.000000e-01, double* [[ARRAYIDX5]], align 8
+; AVX-NEXT:    br label [[FOR_INC]]
+; AVX:       for.inc:
+; AVX-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; AVX-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; AVX-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !55
+; AVX:       for.end:
+; AVX-NEXT:    ret void
+;
+; AVX512-LABEL: @foo8(
+; AVX512-NEXT:  entry:
+; AVX512-NEXT:    [[CMP5:%.*]] = icmp eq i32 [[SIZE:%.*]], 0
+; AVX512-NEXT:    br i1 [[CMP5]], label [[FOR_END:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
+; AVX512:       for.body.preheader:
+; AVX512-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[SIZE]] to i64
+; AVX512-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[SIZE]], 32
+; AVX512-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY_PREHEADER16:%.*]], label [[VECTOR_PH:%.*]]
+; AVX512:       vector.ph:
+; AVX512-NEXT:    [[N_VEC:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 4294967264
+; AVX512-NEXT:    br label [[VECTOR_BODY:%.*]]
+; AVX512:       vector.body:
+; AVX512-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; AVX512-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER:%.*]], i64 [[INDEX]]
+; AVX512-NEXT:    [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
+; AVX512-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1
+; AVX512-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[TMP0]], i64 8
+; AVX512-NEXT:    [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <8 x i8>*
+; AVX512-NEXT:    [[WIDE_LOAD10:%.*]] = load <8 x i8>, <8 x i8>* [[TMP3]], align 1
+; AVX512-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[TMP0]], i64 16
+; AVX512-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP4]] to <8 x i8>*
+; AVX512-NEXT:    [[WIDE_LOAD11:%.*]] = load <8 x i8>, <8 x i8>* [[TMP5]], align 1
+; AVX512-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[TMP0]], i64 24
+; AVX512-NEXT:    [[TMP7:%.*]] = bitcast i8* [[TMP6]] to <8 x i8>*
+; AVX512-NEXT:    [[WIDE_LOAD12:%.*]] = load <8 x i8>, <8 x i8>* [[TMP7]], align 1
+; AVX512-NEXT:    [[TMP8:%.*]] = trunc <8 x i8> [[WIDE_LOAD]] to <8 x i1>
+; AVX512-NEXT:    [[TMP9:%.*]] = trunc <8 x i8> [[WIDE_LOAD10]] to <8 x i1>
+; AVX512-NEXT:    [[TMP10:%.*]] = trunc <8 x i8> [[WIDE_LOAD11]] to <8 x i1>
+; AVX512-NEXT:    [[TMP11:%.*]] = trunc <8 x i8> [[WIDE_LOAD12]] to <8 x i1>
+; AVX512-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN:%.*]], i64 [[INDEX]]
+; AVX512-NEXT:    [[TMP13:%.*]] = bitcast i32 ()** [[TMP12]] to <8 x i32 ()*>*
+; AVX512-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f(<8 x i32 ()*>* [[TMP13]], i32 8, <8 x i1> [[TMP8]], <8 x i32 ()*> undef)
+; AVX512-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP12]], i64 8
+; AVX512-NEXT:    [[TMP15:%.*]] = bitcast i32 ()** [[TMP14]] to <8 x i32 ()*>*
+; AVX512-NEXT:    [[WIDE_MASKED_LOAD13:%.*]] = call <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f(<8 x i32 ()*>* nonnull [[TMP15]], i32 8, <8 x i1> [[TMP9]], <8 x i32 ()*> undef)
+; AVX512-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP12]], i64 16
+; AVX512-NEXT:    [[TMP17:%.*]] = bitcast i32 ()** [[TMP16]] to <8 x i32 ()*>*
+; AVX512-NEXT:    [[WIDE_MASKED_LOAD14:%.*]] = call <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f(<8 x i32 ()*>* nonnull [[TMP17]], i32 8, <8 x i1> [[TMP10]], <8 x i32 ()*> undef)
+; AVX512-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP12]], i64 24
+; AVX512-NEXT:    [[TMP19:%.*]] = bitcast i32 ()** [[TMP18]] to <8 x i32 ()*>*
+; AVX512-NEXT:    [[WIDE_MASKED_LOAD15:%.*]] = call <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f(<8 x i32 ()*>* nonnull [[TMP19]], i32 8, <8 x i1> [[TMP11]], <8 x i32 ()*> undef)
+; AVX512-NEXT:    [[TMP20:%.*]] = icmp ne <8 x i32 ()*> [[WIDE_MASKED_LOAD]], zeroinitializer
+; AVX512-NEXT:    [[TMP21:%.*]] = icmp ne <8 x i32 ()*> [[WIDE_MASKED_LOAD13]], zeroinitializer
+; AVX512-NEXT:    [[TMP22:%.*]] = icmp ne <8 x i32 ()*> [[WIDE_MASKED_LOAD14]], zeroinitializer
+; AVX512-NEXT:    [[TMP23:%.*]] = icmp ne <8 x i32 ()*> [[WIDE_MASKED_LOAD15]], zeroinitializer
+; AVX512-NEXT:    [[TMP24:%.*]] = getelementptr inbounds double, double* [[OUT:%.*]], i64 [[INDEX]]
+; AVX512-NEXT:    [[TMP25:%.*]] = and <8 x i1> [[TMP20]], [[TMP8]]
+; AVX512-NEXT:    [[TMP26:%.*]] = and <8 x i1> [[TMP21]], [[TMP9]]
+; AVX512-NEXT:    [[TMP27:%.*]] = and <8 x i1> [[TMP22]], [[TMP10]]
+; AVX512-NEXT:    [[TMP28:%.*]] = and <8 x i1> [[TMP23]], [[TMP11]]
+; AVX512-NEXT:    [[TMP29:%.*]] = bitcast double* [[TMP24]] to <8 x double>*
+; AVX512-NEXT:    call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <8 x double>* [[TMP29]], i32 8, <8 x i1> [[TMP25]])
+; AVX512-NEXT:    [[TMP30:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 8
+; AVX512-NEXT:    [[TMP31:%.*]] = bitcast double* [[TMP30]] to <8 x double>*
+; AVX512-NEXT:    call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <8 x double>* [[TMP31]], i32 8, <8 x i1> [[TMP26]])
+; AVX512-NEXT:    [[TMP32:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 16
+; AVX512-NEXT:    [[TMP33:%.*]] = bitcast double* [[TMP32]] to <8 x double>*
+; AVX512-NEXT:    call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <8 x double>* [[TMP33]], i32 8, <8 x i1> [[TMP27]])
+; AVX512-NEXT:    [[TMP34:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 24
+; AVX512-NEXT:    [[TMP35:%.*]] = bitcast double* [[TMP34]] to <8 x double>*
+; AVX512-NEXT:    call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <8 x double>* [[TMP35]], i32 8, <8 x i1> [[TMP28]])
+; AVX512-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 32
+; AVX512-NEXT:    [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; AVX512-NEXT:    br i1 [[TMP36]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !66
+; AVX512:       middle.block:
+; AVX512-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[WIDE_TRIP_COUNT]]
+; AVX512-NEXT:    br i1 [[CMP_N]], label [[FOR_END]], label [[FOR_BODY_PREHEADER16]]
+; AVX512:       for.body.preheader16:
+; AVX512-NEXT:    [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ]
+; AVX512-NEXT:    br label [[FOR_BODY:%.*]]
+; AVX512:       for.body:
+; AVX512-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ], [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER16]] ]
+; AVX512-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[INDVARS_IV]]
+; AVX512-NEXT:    [[TMP37:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
+; AVX512-NEXT:    [[TMP38:%.*]] = and i8 [[TMP37]], 1
+; AVX512-NEXT:    [[TOBOOL:%.*]] = icmp eq i8 [[TMP38]], 0
+; AVX512-NEXT:    br i1 [[TOBOOL]], label [[FOR_INC]], label [[LAND_LHS_TRUE:%.*]]
+; AVX512:       land.lhs.true:
+; AVX512-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN]], i64 [[INDVARS_IV]]
+; AVX512-NEXT:    [[TMP39:%.*]] = load i32 ()*, i32 ()** [[ARRAYIDX2]], align 8
+; AVX512-NEXT:    [[CMP3:%.*]] = icmp eq i32 ()* [[TMP39]], null
+; AVX512-NEXT:    br i1 [[CMP3]], label [[FOR_INC]], label [[IF_THEN:%.*]]
+; AVX512:       if.then:
+; AVX512-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV]]
+; AVX512-NEXT:    store double 5.000000e-01, double* [[ARRAYIDX5]], align 8
+; AVX512-NEXT:    br label [[FOR_INC]]
+; AVX512:       for.inc:
+; AVX512-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; AVX512-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; AVX512-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !67
+; AVX512:       for.end:
+; AVX512-NEXT:    ret void
+;
+entry:
+  %out.addr = alloca double*, align 8
+  %in.addr = alloca i32 ()**, align 8
+  %trigger.addr = alloca i8*, align 8
+  %size.addr = alloca i32, align 4
+  %i = alloca i32, align 4
+  store double* %out, double** %out.addr, align 8
+  store i32 ()** %in, i32 ()*** %in.addr, align 8
+  store i8* %trigger, i8** %trigger.addr, align 8
+  store i32 %size, i32* %size.addr, align 4
+  store i32 0, i32* %i, align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %0 = load i32, i32* %i, align 4
+  %1 = load i32, i32* %size.addr, align 4
+  %cmp = icmp ult i32 %0, %1
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %2 = load i32, i32* %i, align 4
+  %idxprom = zext i32 %2 to i64
+  %3 = load i8*, i8** %trigger.addr, align 8
+  %arrayidx = getelementptr inbounds i8, i8* %3, i64 %idxprom
+  %4 = load i8, i8* %arrayidx, align 1
+  %tobool = trunc i8 %4 to i1
+  br i1 %tobool, label %land.lhs.true, label %if.end
+
+land.lhs.true:                                    ; preds = %for.body
+  %5 = load i32, i32* %i, align 4
+  %idxprom1 = zext i32 %5 to i64
+  %6 = load i32 ()**, i32 ()*** %in.addr, align 8
+  %arrayidx2 = getelementptr inbounds i32 ()*, i32 ()** %6, i64 %idxprom1
+  %7 = load i32 ()*, i32 ()** %arrayidx2, align 8
+  %cmp3 = icmp ne i32 ()* %7, null
+  br i1 %cmp3, label %if.then, label %if.end
+
+if.then:                                          ; preds = %land.lhs.true
+  %8 = load i32, i32* %i, align 4
+  %idxprom4 = zext i32 %8 to i64
+  %9 = load double*, double** %out.addr, align 8
+  %arrayidx5 = getelementptr inbounds double, double* %9, i64 %idxprom4
+  store double 5.000000e-01, double* %arrayidx5, align 8
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %land.lhs.true, %for.body
+  br label %for.inc
+
+for.inc:                                          ; preds = %if.end
+  %10 = load i32, i32* %i, align 4
+  %inc = add i32 %10, 1
+  store i32 %inc, i32* %i, align 4
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}

Added: llvm/trunk/test/Transforms/LoopVectorize/X86/max-mstore.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/X86/max-mstore.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/X86/max-mstore.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/X86/max-mstore.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,46 @@
+; RUN: opt -basicaa -loop-vectorize -force-vector-interleave=1 -S -mcpu=core-avx2 < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+ at b = common global [256 x i32] zeroinitializer, align 16
+ at a = common global [256 x i32] zeroinitializer, align 16
+
+; unsigned int a[256], b[256];
+; void foo() {
+;  for (int i = 0; i < 256; i++) {
+;    if (b[i] > a[i])
+;      a[i] = b[i];
+;  }
+; }
+
+; CHECK-LABEL: foo
+; CHECK: load <8 x i32>
+; CHECK: icmp ugt <8 x i32>
+; CHECK: masked.store
+
+define void @foo() {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.inc, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ]
+  %arrayidx = getelementptr inbounds [256 x i32], [256 x i32]* @b, i64 0, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds [256 x i32], [256 x i32]* @a, i64 0, i64 %indvars.iv
+  %1 = load i32, i32* %arrayidx2, align 4
+  %cmp3 = icmp ugt i32 %0, %1
+  br i1 %cmp3, label %if.then, label %for.inc
+
+if.then:                                          ; preds = %for.body
+  store i32 %0, i32* %arrayidx2, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body, %if.then
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 256
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.inc
+  ret void
+}

Added: llvm/trunk/test/Transforms/LoopVectorize/X86/metadata-enable.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/X86/metadata-enable.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/X86/metadata-enable.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/X86/metadata-enable.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,2473 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -mcpu=corei7 -O1 -S -unroll-allow-partial=0 | FileCheck %s --check-prefix=O1
+; RUN: opt < %s -mcpu=corei7 -O2 -S -unroll-allow-partial=0 | FileCheck %s --check-prefix=O2
+; RUN: opt < %s -mcpu=corei7 -O3 -S -unroll-threshold=150 -unroll-allow-partial=0 | FileCheck %s --check-prefix=O3
+; RUN: opt < %s -mcpu=corei7 -O3 -S -unroll-allow-partial=0 | FileCheck %s --check-prefix=O3DEFAULT
+; RUN: opt < %s -mcpu=corei7 -Os -S -unroll-allow-partial=0 | FileCheck %s --check-prefix=Os
+; RUN: opt < %s -mcpu=corei7 -Oz -S -unroll-allow-partial=0 | FileCheck %s --check-prefix=Oz
+; RUN: opt < %s -mcpu=corei7 -O1 -vectorize-loops -S -unroll-allow-partial=0 | FileCheck %s --check-prefix=O1VEC
+; RUN: opt < %s -mcpu=corei7 -Oz -vectorize-loops -S -unroll-allow-partial=0 | FileCheck %s --check-prefix=OzVEC
+; RUN: opt < %s -mcpu=corei7 -O1 -loop-vectorize -S -unroll-allow-partial=0 | FileCheck %s --check-prefix=O1VEC2
+; RUN: opt < %s -mcpu=corei7 -Oz -loop-vectorize -S -unroll-allow-partial=0 | FileCheck %s --check-prefix=OzVEC2
+; RUN: opt < %s -mcpu=corei7 -O3 -unroll-threshold=150 -disable-loop-vectorization -S -unroll-allow-partial=0 | FileCheck %s --check-prefix=O3DIS
+
+; This file tests that the llvm.loop.vectorize.enable metadata forces
+; vectorization even when the optimization level is too low for the
+; vectorizer to run by default, or when vectorization is disabled.
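+;
+; A minimal sketch of how that forcing metadata is attached to a loop (the
+; metadata node numbers here are illustrative, not the ones used later in
+; this file):
+;   br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !0
+;   !0 = distinct !{!0, !1}
+;   !1 = !{!"llvm.loop.vectorize.enable", i1 true}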
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define i32 @enabled(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32 %N) {
+; O1-LABEL: @enabled(
+; O1-NEXT:  entry:
+; O1-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> undef, i32 [[N:%.*]], i32 0
+; O1-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> undef, <4 x i32> zeroinitializer
+; O1-NEXT:    [[TMP0:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>*
+; O1-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; O1-NEXT:    [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT2]]
+; O1-NEXT:    [[TMP2:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>*
+; O1-NEXT:    store <4 x i32> [[TMP1]], <4 x i32>* [[TMP2]], align 4
+; O1-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 4
+; O1-NEXT:    [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>*
+; O1-NEXT:    [[WIDE_LOAD_1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4
+; O1-NEXT:    [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT2]]
+; O1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 4
+; O1-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
+; O1-NEXT:    store <4 x i32> [[TMP5]], <4 x i32>* [[TMP7]], align 4
+; O1-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 8
+; O1-NEXT:    [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>*
+; O1-NEXT:    [[WIDE_LOAD_2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4
+; O1-NEXT:    [[TMP10:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT2]]
+; O1-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 8
+; O1-NEXT:    [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>*
+; O1-NEXT:    store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4
+; O1-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 12
+; O1-NEXT:    [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <4 x i32>*
+; O1-NEXT:    [[WIDE_LOAD_3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP14]], align 4
+; O1-NEXT:    [[TMP15:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT2]]
+; O1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 12
+; O1-NEXT:    [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <4 x i32>*
+; O1-NEXT:    store <4 x i32> [[TMP15]], <4 x i32>* [[TMP17]], align 4
+; O1-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 16
+; O1-NEXT:    [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>*
+; O1-NEXT:    [[WIDE_LOAD_4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP19]], align 4
+; O1-NEXT:    [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT2]]
+; O1-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 16
+; O1-NEXT:    [[TMP22:%.*]] = bitcast i32* [[TMP21]] to <4 x i32>*
+; O1-NEXT:    store <4 x i32> [[TMP20]], <4 x i32>* [[TMP22]], align 4
+; O1-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 20
+; O1-NEXT:    [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <4 x i32>*
+; O1-NEXT:    [[WIDE_LOAD_5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP24]], align 4
+; O1-NEXT:    [[TMP25:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT2]]
+; O1-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 20
+; O1-NEXT:    [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>*
+; O1-NEXT:    store <4 x i32> [[TMP25]], <4 x i32>* [[TMP27]], align 4
+; O1-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 24
+; O1-NEXT:    [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>*
+; O1-NEXT:    [[WIDE_LOAD_6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP29]], align 4
+; O1-NEXT:    [[TMP30:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT2]]
+; O1-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 24
+; O1-NEXT:    [[TMP32:%.*]] = bitcast i32* [[TMP31]] to <4 x i32>*
+; O1-NEXT:    store <4 x i32> [[TMP30]], <4 x i32>* [[TMP32]], align 4
+; O1-NEXT:    [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 28
+; O1-NEXT:    [[TMP34:%.*]] = bitcast i32* [[TMP33]] to <4 x i32>*
+; O1-NEXT:    [[WIDE_LOAD_7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP34]], align 4
+; O1-NEXT:    [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT2]]
+; O1-NEXT:    [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 28
+; O1-NEXT:    [[TMP37:%.*]] = bitcast i32* [[TMP36]] to <4 x i32>*
+; O1-NEXT:    store <4 x i32> [[TMP35]], <4 x i32>* [[TMP37]], align 4
+; O1-NEXT:    [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 32
+; O1-NEXT:    [[TMP39:%.*]] = bitcast i32* [[TMP38]] to <4 x i32>*
+; O1-NEXT:    [[WIDE_LOAD_8:%.*]] = load <4 x i32>, <4 x i32>* [[TMP39]], align 4
+; O1-NEXT:    [[TMP40:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT2]]
+; O1-NEXT:    [[TMP41:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 32
+; O1-NEXT:    [[TMP42:%.*]] = bitcast i32* [[TMP41]] to <4 x i32>*
+; O1-NEXT:    store <4 x i32> [[TMP40]], <4 x i32>* [[TMP42]], align 4
+; O1-NEXT:    [[TMP43:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 36
+; O1-NEXT:    [[TMP44:%.*]] = bitcast i32* [[TMP43]] to <4 x i32>*
+; O1-NEXT:    [[WIDE_LOAD_9:%.*]] = load <4 x i32>, <4 x i32>* [[TMP44]], align 4
+; O1-NEXT:    [[TMP45:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT2]]
+; O1-NEXT:    [[TMP46:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 36
+; O1-NEXT:    [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <4 x i32>*
+; O1-NEXT:    store <4 x i32> [[TMP45]], <4 x i32>* [[TMP47]], align 4
+; O1-NEXT:    [[TMP48:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 40
+; O1-NEXT:    [[TMP49:%.*]] = bitcast i32* [[TMP48]] to <4 x i32>*
+; O1-NEXT:    [[WIDE_LOAD_10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP49]], align 4
+; O1-NEXT:    [[TMP50:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT2]]
+; O1-NEXT:    [[TMP51:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 40
+; O1-NEXT:    [[TMP52:%.*]] = bitcast i32* [[TMP51]] to <4 x i32>*
+; O1-NEXT:    store <4 x i32> [[TMP50]], <4 x i32>* [[TMP52]], align 4
+; O1-NEXT:    [[TMP53:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 44
+; O1-NEXT:    [[TMP54:%.*]] = bitcast i32* [[TMP53]] to <4 x i32>*
+; O1-NEXT:    [[WIDE_LOAD_11:%.*]] = load <4 x i32>, <4 x i32>* [[TMP54]], align 4
+; O1-NEXT:    [[TMP55:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT2]]
+; O1-NEXT:    [[TMP56:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 44
+; O1-NEXT:    [[TMP57:%.*]] = bitcast i32* [[TMP56]] to <4 x i32>*
+; O1-NEXT:    store <4 x i32> [[TMP55]], <4 x i32>* [[TMP57]], align 4
+; O1-NEXT:    [[TMP58:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 48
+; O1-NEXT:    [[TMP59:%.*]] = bitcast i32* [[TMP58]] to <4 x i32>*
+; O1-NEXT:    [[WIDE_LOAD_12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP59]], align 4
+; O1-NEXT:    [[TMP60:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT2]]
+; O1-NEXT:    [[TMP61:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 48
+; O1-NEXT:    [[TMP62:%.*]] = bitcast i32* [[TMP61]] to <4 x i32>*
+; O1-NEXT:    store <4 x i32> [[TMP60]], <4 x i32>* [[TMP62]], align 4
+; O1-NEXT:    [[TMP63:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 52
+; O1-NEXT:    [[TMP64:%.*]] = bitcast i32* [[TMP63]] to <4 x i32>*
+; O1-NEXT:    [[WIDE_LOAD_13:%.*]] = load <4 x i32>, <4 x i32>* [[TMP64]], align 4
+; O1-NEXT:    [[TMP65:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT2]]
+; O1-NEXT:    [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 52
+; O1-NEXT:    [[TMP67:%.*]] = bitcast i32* [[TMP66]] to <4 x i32>*
+; O1-NEXT:    store <4 x i32> [[TMP65]], <4 x i32>* [[TMP67]], align 4
+; O1-NEXT:    [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 56
+; O1-NEXT:    [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>*
+; O1-NEXT:    [[WIDE_LOAD_14:%.*]] = load <4 x i32>, <4 x i32>* [[TMP69]], align 4
+; O1-NEXT:    [[TMP70:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT2]]
+; O1-NEXT:    [[TMP71:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 56
+; O1-NEXT:    [[TMP72:%.*]] = bitcast i32* [[TMP71]] to <4 x i32>*
+; O1-NEXT:    store <4 x i32> [[TMP70]], <4 x i32>* [[TMP72]], align 4
+; O1-NEXT:    [[TMP73:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 60
+; O1-NEXT:    [[TMP74:%.*]] = bitcast i32* [[TMP73]] to <4 x i32>*
+; O1-NEXT:    [[WIDE_LOAD_15:%.*]] = load <4 x i32>, <4 x i32>* [[TMP74]], align 4
+; O1-NEXT:    [[TMP75:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT2]]
+; O1-NEXT:    [[TMP76:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 60
+; O1-NEXT:    [[TMP77:%.*]] = bitcast i32* [[TMP76]] to <4 x i32>*
+; O1-NEXT:    store <4 x i32> [[TMP75]], <4 x i32>* [[TMP77]], align 4
+; O1-NEXT:    [[TMP78:%.*]] = load i32, i32* [[A]], align 4
+; O1-NEXT:    ret i32 [[TMP78]]
+;
+; O2-LABEL: @enabled(
+; O2-NEXT:  entry:
+; O2-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> undef, i32 [[N:%.*]], i32 0
+; O2-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> undef, <4 x i32> zeroinitializer
+; O2-NEXT:    [[TMP0:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>*
+; O2-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; O2-NEXT:    [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT2]]
+; O2-NEXT:    [[TMP2:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>*
+; O2-NEXT:    store <4 x i32> [[TMP1]], <4 x i32>* [[TMP2]], align 4
+; O2-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 4
+; O2-NEXT:    [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>*
+; O2-NEXT:    [[WIDE_LOAD_1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4
+; O2-NEXT:    [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT2]]
+; O2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 4
+; O2-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
+; O2-NEXT:    store <4 x i32> [[TMP5]], <4 x i32>* [[TMP7]], align 4
+; O2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 8
+; O2-NEXT:    [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>*
+; O2-NEXT:    [[WIDE_LOAD_2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4
+; O2-NEXT:    [[TMP10:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT2]]
+; O2-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 8
+; O2-NEXT:    [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>*
+; O2-NEXT:    store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4
+; O2-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 12
+; O2-NEXT:    [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <4 x i32>*
+; O2-NEXT:    [[WIDE_LOAD_3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP14]], align 4
+; O2-NEXT:    [[TMP15:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT2]]
+; O2-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 12
+; O2-NEXT:    [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <4 x i32>*
+; O2-NEXT:    store <4 x i32> [[TMP15]], <4 x i32>* [[TMP17]], align 4
+; O2-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 16
+; O2-NEXT:    [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>*
+; O2-NEXT:    [[WIDE_LOAD_4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP19]], align 4
+; O2-NEXT:    [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT2]]
+; O2-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 16
+; O2-NEXT:    [[TMP22:%.*]] = bitcast i32* [[TMP21]] to <4 x i32>*
+; O2-NEXT:    store <4 x i32> [[TMP20]], <4 x i32>* [[TMP22]], align 4
+; O2-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 20
+; O2-NEXT:    [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <4 x i32>*
+; O2-NEXT:    [[WIDE_LOAD_5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP24]], align 4
+; O2-NEXT:    [[TMP25:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT2]]
+; O2-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 20
+; O2-NEXT:    [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>*
+; O2-NEXT:    store <4 x i32> [[TMP25]], <4 x i32>* [[TMP27]], align 4
+; O2-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 24
+; O2-NEXT:    [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>*
+; O2-NEXT:    [[WIDE_LOAD_6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP29]], align 4
+; O2-NEXT:    [[TMP30:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT2]]
+; O2-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 24
+; O2-NEXT:    [[TMP32:%.*]] = bitcast i32* [[TMP31]] to <4 x i32>*
+; O2-NEXT:    store <4 x i32> [[TMP30]], <4 x i32>* [[TMP32]], align 4
+; O2-NEXT:    [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 28
+; O2-NEXT:    [[TMP34:%.*]] = bitcast i32* [[TMP33]] to <4 x i32>*
+; O2-NEXT:    [[WIDE_LOAD_7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP34]], align 4
+; O2-NEXT:    [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT2]]
+; O2-NEXT:    [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 28
+; O2-NEXT:    [[TMP37:%.*]] = bitcast i32* [[TMP36]] to <4 x i32>*
+; O2-NEXT:    store <4 x i32> [[TMP35]], <4 x i32>* [[TMP37]], align 4
+; O2-NEXT:    [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 32
+; O2-NEXT:    [[TMP39:%.*]] = bitcast i32* [[TMP38]] to <4 x i32>*
+; O2-NEXT:    [[WIDE_LOAD_8:%.*]] = load <4 x i32>, <4 x i32>* [[TMP39]], align 4
+; O2-NEXT:    [[TMP40:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT2]]
+; O2-NEXT:    [[TMP41:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 32
+; O2-NEXT:    [[TMP42:%.*]] = bitcast i32* [[TMP41]] to <4 x i32>*
+; O2-NEXT:    store <4 x i32> [[TMP40]], <4 x i32>* [[TMP42]], align 4
+; O2-NEXT:    [[TMP43:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 36
+; O2-NEXT:    [[TMP44:%.*]] = bitcast i32* [[TMP43]] to <4 x i32>*
+; O2-NEXT:    [[WIDE_LOAD_9:%.*]] = load <4 x i32>, <4 x i32>* [[TMP44]], align 4
+; O2-NEXT:    [[TMP45:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT2]]
+; O2-NEXT:    [[TMP46:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 36
+; O2-NEXT:    [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <4 x i32>*
+; O2-NEXT:    store <4 x i32> [[TMP45]], <4 x i32>* [[TMP47]], align 4
+; O2-NEXT:    [[TMP48:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 40
+; O2-NEXT:    [[TMP49:%.*]] = bitcast i32* [[TMP48]] to <4 x i32>*
+; O2-NEXT:    [[WIDE_LOAD_10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP49]], align 4
+; O2-NEXT:    [[TMP50:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT2]]
+; O2-NEXT:    [[TMP51:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 40
+; O2-NEXT:    [[TMP52:%.*]] = bitcast i32* [[TMP51]] to <4 x i32>*
+; O2-NEXT:    store <4 x i32> [[TMP50]], <4 x i32>* [[TMP52]], align 4
+; O2-NEXT:    [[TMP53:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 44
+; O2-NEXT:    [[TMP54:%.*]] = bitcast i32* [[TMP53]] to <4 x i32>*
+; O2-NEXT:    [[WIDE_LOAD_11:%.*]] = load <4 x i32>, <4 x i32>* [[TMP54]], align 4
+; O2-NEXT:    [[TMP55:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT2]]
+; O2-NEXT:    [[TMP56:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 44
+; O2-NEXT:    [[TMP57:%.*]] = bitcast i32* [[TMP56]] to <4 x i32>*
+; O2-NEXT:    store <4 x i32> [[TMP55]], <4 x i32>* [[TMP57]], align 4
+; O2-NEXT:    [[TMP58:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 48
+; O2-NEXT:    [[TMP59:%.*]] = bitcast i32* [[TMP58]] to <4 x i32>*
+; O2-NEXT:    [[WIDE_LOAD_12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP59]], align 4
+; O2-NEXT:    [[TMP60:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT2]]
+; O2-NEXT:    [[TMP61:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 48
+; O2-NEXT:    [[TMP62:%.*]] = bitcast i32* [[TMP61]] to <4 x i32>*
+; O2-NEXT:    store <4 x i32> [[TMP60]], <4 x i32>* [[TMP62]], align 4
+; O2-NEXT:    [[TMP63:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 52
+; O2-NEXT:    [[TMP64:%.*]] = bitcast i32* [[TMP63]] to <4 x i32>*
+; O2-NEXT:    [[WIDE_LOAD_13:%.*]] = load <4 x i32>, <4 x i32>* [[TMP64]], align 4
+; O2-NEXT:    [[TMP65:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT2]]
+; O2-NEXT:    [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 52
+; O2-NEXT:    [[TMP67:%.*]] = bitcast i32* [[TMP66]] to <4 x i32>*
+; O2-NEXT:    store <4 x i32> [[TMP65]], <4 x i32>* [[TMP67]], align 4
+; O2-NEXT:    [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 56
+; O2-NEXT:    [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>*
+; O2-NEXT:    [[WIDE_LOAD_14:%.*]] = load <4 x i32>, <4 x i32>* [[TMP69]], align 4
+; O2-NEXT:    [[TMP70:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT2]]
+; O2-NEXT:    [[TMP71:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 56
+; O2-NEXT:    [[TMP72:%.*]] = bitcast i32* [[TMP71]] to <4 x i32>*
+; O2-NEXT:    store <4 x i32> [[TMP70]], <4 x i32>* [[TMP72]], align 4
+; O2-NEXT:    [[TMP73:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 60
+; O2-NEXT:    [[TMP74:%.*]] = bitcast i32* [[TMP73]] to <4 x i32>*
+; O2-NEXT:    [[WIDE_LOAD_15:%.*]] = load <4 x i32>, <4 x i32>* [[TMP74]], align 4
+; O2-NEXT:    [[TMP75:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT2]]
+; O2-NEXT:    [[TMP76:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 60
+; O2-NEXT:    [[TMP77:%.*]] = bitcast i32* [[TMP76]] to <4 x i32>*
+; O2-NEXT:    store <4 x i32> [[TMP75]], <4 x i32>* [[TMP77]], align 4
+; O2-NEXT:    [[TMP78:%.*]] = load i32, i32* [[A]], align 4
+; O2-NEXT:    ret i32 [[TMP78]]
+;
+; O3-LABEL: @enabled(
+; O3-NEXT:  entry:
+; O3-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> undef, i32 [[N:%.*]], i32 0
+; O3-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> undef, <4 x i32> zeroinitializer
+; O3-NEXT:    [[TMP0:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>*
+; O3-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; O3-NEXT:    [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT2]]
+; O3-NEXT:    [[TMP2:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>*
+; O3-NEXT:    store <4 x i32> [[TMP1]], <4 x i32>* [[TMP2]], align 4
+; O3-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 4
+; O3-NEXT:    [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>*
+; O3-NEXT:    [[WIDE_LOAD_1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4
+; O3-NEXT:    [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT2]]
+; O3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 4
+; O3-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
+; O3-NEXT:    store <4 x i32> [[TMP5]], <4 x i32>* [[TMP7]], align 4
+; O3-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 8
+; O3-NEXT:    [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>*
+; O3-NEXT:    [[WIDE_LOAD_2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4
+; O3-NEXT:    [[TMP10:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT2]]
+; O3-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 8
+; O3-NEXT:    [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>*
+; O3-NEXT:    store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4
+; O3-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 12
+; O3-NEXT:    [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <4 x i32>*
+; O3-NEXT:    [[WIDE_LOAD_3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP14]], align 4
+; O3-NEXT:    [[TMP15:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT2]]
+; O3-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 12
+; O3-NEXT:    [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <4 x i32>*
+; O3-NEXT:    store <4 x i32> [[TMP15]], <4 x i32>* [[TMP17]], align 4
+; O3-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 16
+; O3-NEXT:    [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>*
+; O3-NEXT:    [[WIDE_LOAD_4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP19]], align 4
+; O3-NEXT:    [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT2]]
+; O3-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 16
+; O3-NEXT:    [[TMP22:%.*]] = bitcast i32* [[TMP21]] to <4 x i32>*
+; O3-NEXT:    store <4 x i32> [[TMP20]], <4 x i32>* [[TMP22]], align 4
+; O3-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 20
+; O3-NEXT:    [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <4 x i32>*
+; O3-NEXT:    [[WIDE_LOAD_5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP24]], align 4
+; O3-NEXT:    [[TMP25:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT2]]
+; O3-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 20
+; O3-NEXT:    [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>*
+; O3-NEXT:    store <4 x i32> [[TMP25]], <4 x i32>* [[TMP27]], align 4
+; O3-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 24
+; O3-NEXT:    [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>*
+; O3-NEXT:    [[WIDE_LOAD_6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP29]], align 4
+; O3-NEXT:    [[TMP30:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT2]]
+; O3-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 24
+; O3-NEXT:    [[TMP32:%.*]] = bitcast i32* [[TMP31]] to <4 x i32>*
+; O3-NEXT:    store <4 x i32> [[TMP30]], <4 x i32>* [[TMP32]], align 4
+; O3-NEXT:    [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 28
+; O3-NEXT:    [[TMP34:%.*]] = bitcast i32* [[TMP33]] to <4 x i32>*
+; O3-NEXT:    [[WIDE_LOAD_7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP34]], align 4
+; O3-NEXT:    [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT2]]
+; O3-NEXT:    [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 28
+; O3-NEXT:    [[TMP37:%.*]] = bitcast i32* [[TMP36]] to <4 x i32>*
+; O3-NEXT:    store <4 x i32> [[TMP35]], <4 x i32>* [[TMP37]], align 4
+; O3-NEXT:    [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 32
+; O3-NEXT:    [[TMP39:%.*]] = bitcast i32* [[TMP38]] to <4 x i32>*
+; O3-NEXT:    [[WIDE_LOAD_8:%.*]] = load <4 x i32>, <4 x i32>* [[TMP39]], align 4
+; O3-NEXT:    [[TMP40:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT2]]
+; O3-NEXT:    [[TMP41:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 32
+; O3-NEXT:    [[TMP42:%.*]] = bitcast i32* [[TMP41]] to <4 x i32>*
+; O3-NEXT:    store <4 x i32> [[TMP40]], <4 x i32>* [[TMP42]], align 4
+; O3-NEXT:    [[TMP43:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 36
+; O3-NEXT:    [[TMP44:%.*]] = bitcast i32* [[TMP43]] to <4 x i32>*
+; O3-NEXT:    [[WIDE_LOAD_9:%.*]] = load <4 x i32>, <4 x i32>* [[TMP44]], align 4
+; O3-NEXT:    [[TMP45:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT2]]
+; O3-NEXT:    [[TMP46:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 36
+; O3-NEXT:    [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <4 x i32>*
+; O3-NEXT:    store <4 x i32> [[TMP45]], <4 x i32>* [[TMP47]], align 4
+; O3-NEXT:    [[TMP48:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 40
+; O3-NEXT:    [[TMP49:%.*]] = bitcast i32* [[TMP48]] to <4 x i32>*
+; O3-NEXT:    [[WIDE_LOAD_10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP49]], align 4
+; O3-NEXT:    [[TMP50:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT2]]
+; O3-NEXT:    [[TMP51:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 40
+; O3-NEXT:    [[TMP52:%.*]] = bitcast i32* [[TMP51]] to <4 x i32>*
+; O3-NEXT:    store <4 x i32> [[TMP50]], <4 x i32>* [[TMP52]], align 4
+; O3-NEXT:    [[TMP53:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 44
+; O3-NEXT:    [[TMP54:%.*]] = bitcast i32* [[TMP53]] to <4 x i32>*
+; O3-NEXT:    [[WIDE_LOAD_11:%.*]] = load <4 x i32>, <4 x i32>* [[TMP54]], align 4
+; O3-NEXT:    [[TMP55:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT2]]
+; O3-NEXT:    [[TMP56:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 44
+; O3-NEXT:    [[TMP57:%.*]] = bitcast i32* [[TMP56]] to <4 x i32>*
+; O3-NEXT:    store <4 x i32> [[TMP55]], <4 x i32>* [[TMP57]], align 4
+; O3-NEXT:    [[TMP58:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 48
+; O3-NEXT:    [[TMP59:%.*]] = bitcast i32* [[TMP58]] to <4 x i32>*
+; O3-NEXT:    [[WIDE_LOAD_12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP59]], align 4
+; O3-NEXT:    [[TMP60:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT2]]
+; O3-NEXT:    [[TMP61:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 48
+; O3-NEXT:    [[TMP62:%.*]] = bitcast i32* [[TMP61]] to <4 x i32>*
+; O3-NEXT:    store <4 x i32> [[TMP60]], <4 x i32>* [[TMP62]], align 4
+; O3-NEXT:    [[TMP63:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 52
+; O3-NEXT:    [[TMP64:%.*]] = bitcast i32* [[TMP63]] to <4 x i32>*
+; O3-NEXT:    [[WIDE_LOAD_13:%.*]] = load <4 x i32>, <4 x i32>* [[TMP64]], align 4
+; O3-NEXT:    [[TMP65:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT2]]
+; O3-NEXT:    [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 52
+; O3-NEXT:    [[TMP67:%.*]] = bitcast i32* [[TMP66]] to <4 x i32>*
+; O3-NEXT:    store <4 x i32> [[TMP65]], <4 x i32>* [[TMP67]], align 4
+; O3-NEXT:    [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 56
+; O3-NEXT:    [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>*
+; O3-NEXT:    [[WIDE_LOAD_14:%.*]] = load <4 x i32>, <4 x i32>* [[TMP69]], align 4
+; O3-NEXT:    [[TMP70:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT2]]
+; O3-NEXT:    [[TMP71:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 56
+; O3-NEXT:    [[TMP72:%.*]] = bitcast i32* [[TMP71]] to <4 x i32>*
+; O3-NEXT:    store <4 x i32> [[TMP70]], <4 x i32>* [[TMP72]], align 4
+; O3-NEXT:    [[TMP73:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 60
+; O3-NEXT:    [[TMP74:%.*]] = bitcast i32* [[TMP73]] to <4 x i32>*
+; O3-NEXT:    [[WIDE_LOAD_15:%.*]] = load <4 x i32>, <4 x i32>* [[TMP74]], align 4
+; O3-NEXT:    [[TMP75:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT2]]
+; O3-NEXT:    [[TMP76:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 60
+; O3-NEXT:    [[TMP77:%.*]] = bitcast i32* [[TMP76]] to <4 x i32>*
+; O3-NEXT:    store <4 x i32> [[TMP75]], <4 x i32>* [[TMP77]], align 4
+; O3-NEXT:    [[TMP78:%.*]] = load i32, i32* [[A]], align 4
+; O3-NEXT:    ret i32 [[TMP78]]
+;
+; O3DEFAULT-LABEL: @enabled(
+; O3DEFAULT-NEXT:  entry:
+; O3DEFAULT-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> undef, i32 [[N:%.*]], i32 0
+; O3DEFAULT-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> undef, <4 x i32> zeroinitializer
+; O3DEFAULT-NEXT:    [[TMP0:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>*
+; O3DEFAULT-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; O3DEFAULT-NEXT:    [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT2]]
+; O3DEFAULT-NEXT:    [[TMP2:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>*
+; O3DEFAULT-NEXT:    store <4 x i32> [[TMP1]], <4 x i32>* [[TMP2]], align 4
+; O3DEFAULT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 4
+; O3DEFAULT-NEXT:    [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>*
+; O3DEFAULT-NEXT:    [[WIDE_LOAD_1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4
+; O3DEFAULT-NEXT:    [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT2]]
+; O3DEFAULT-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 4
+; O3DEFAULT-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
+; O3DEFAULT-NEXT:    store <4 x i32> [[TMP5]], <4 x i32>* [[TMP7]], align 4
+; O3DEFAULT-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 8
+; O3DEFAULT-NEXT:    [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>*
+; O3DEFAULT-NEXT:    [[WIDE_LOAD_2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4
+; O3DEFAULT-NEXT:    [[TMP10:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT2]]
+; O3DEFAULT-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 8
+; O3DEFAULT-NEXT:    [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>*
+; O3DEFAULT-NEXT:    store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4
+; O3DEFAULT-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 12
+; O3DEFAULT-NEXT:    [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <4 x i32>*
+; O3DEFAULT-NEXT:    [[WIDE_LOAD_3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP14]], align 4
+; O3DEFAULT-NEXT:    [[TMP15:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT2]]
+; O3DEFAULT-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 12
+; O3DEFAULT-NEXT:    [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <4 x i32>*
+; O3DEFAULT-NEXT:    store <4 x i32> [[TMP15]], <4 x i32>* [[TMP17]], align 4
+; O3DEFAULT-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 16
+; O3DEFAULT-NEXT:    [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>*
+; O3DEFAULT-NEXT:    [[WIDE_LOAD_4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP19]], align 4
+; O3DEFAULT-NEXT:    [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT2]]
+; O3DEFAULT-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 16
+; O3DEFAULT-NEXT:    [[TMP22:%.*]] = bitcast i32* [[TMP21]] to <4 x i32>*
+; O3DEFAULT-NEXT:    store <4 x i32> [[TMP20]], <4 x i32>* [[TMP22]], align 4
+; O3DEFAULT-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 20
+; O3DEFAULT-NEXT:    [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <4 x i32>*
+; O3DEFAULT-NEXT:    [[WIDE_LOAD_5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP24]], align 4
+; O3DEFAULT-NEXT:    [[TMP25:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT2]]
+; O3DEFAULT-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 20
+; O3DEFAULT-NEXT:    [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>*
+; O3DEFAULT-NEXT:    store <4 x i32> [[TMP25]], <4 x i32>* [[TMP27]], align 4
+; O3DEFAULT-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 24
+; O3DEFAULT-NEXT:    [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>*
+; O3DEFAULT-NEXT:    [[WIDE_LOAD_6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP29]], align 4
+; O3DEFAULT-NEXT:    [[TMP30:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT2]]
+; O3DEFAULT-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 24
+; O3DEFAULT-NEXT:    [[TMP32:%.*]] = bitcast i32* [[TMP31]] to <4 x i32>*
+; O3DEFAULT-NEXT:    store <4 x i32> [[TMP30]], <4 x i32>* [[TMP32]], align 4
+; O3DEFAULT-NEXT:    [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 28
+; O3DEFAULT-NEXT:    [[TMP34:%.*]] = bitcast i32* [[TMP33]] to <4 x i32>*
+; O3DEFAULT-NEXT:    [[WIDE_LOAD_7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP34]], align 4
+; O3DEFAULT-NEXT:    [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT2]]
+; O3DEFAULT-NEXT:    [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 28
+; O3DEFAULT-NEXT:    [[TMP37:%.*]] = bitcast i32* [[TMP36]] to <4 x i32>*
+; O3DEFAULT-NEXT:    store <4 x i32> [[TMP35]], <4 x i32>* [[TMP37]], align 4
+; O3DEFAULT-NEXT:    [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 32
+; O3DEFAULT-NEXT:    [[TMP39:%.*]] = bitcast i32* [[TMP38]] to <4 x i32>*
+; O3DEFAULT-NEXT:    [[WIDE_LOAD_8:%.*]] = load <4 x i32>, <4 x i32>* [[TMP39]], align 4
+; O3DEFAULT-NEXT:    [[TMP40:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT2]]
+; O3DEFAULT-NEXT:    [[TMP41:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 32
+; O3DEFAULT-NEXT:    [[TMP42:%.*]] = bitcast i32* [[TMP41]] to <4 x i32>*
+; O3DEFAULT-NEXT:    store <4 x i32> [[TMP40]], <4 x i32>* [[TMP42]], align 4
+; O3DEFAULT-NEXT:    [[TMP43:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 36
+; O3DEFAULT-NEXT:    [[TMP44:%.*]] = bitcast i32* [[TMP43]] to <4 x i32>*
+; O3DEFAULT-NEXT:    [[WIDE_LOAD_9:%.*]] = load <4 x i32>, <4 x i32>* [[TMP44]], align 4
+; O3DEFAULT-NEXT:    [[TMP45:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT2]]
+; O3DEFAULT-NEXT:    [[TMP46:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 36
+; O3DEFAULT-NEXT:    [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <4 x i32>*
+; O3DEFAULT-NEXT:    store <4 x i32> [[TMP45]], <4 x i32>* [[TMP47]], align 4
+; O3DEFAULT-NEXT:    [[TMP48:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 40
+; O3DEFAULT-NEXT:    [[TMP49:%.*]] = bitcast i32* [[TMP48]] to <4 x i32>*
+; O3DEFAULT-NEXT:    [[WIDE_LOAD_10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP49]], align 4
+; O3DEFAULT-NEXT:    [[TMP50:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT2]]
+; O3DEFAULT-NEXT:    [[TMP51:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 40
+; O3DEFAULT-NEXT:    [[TMP52:%.*]] = bitcast i32* [[TMP51]] to <4 x i32>*
+; O3DEFAULT-NEXT:    store <4 x i32> [[TMP50]], <4 x i32>* [[TMP52]], align 4
+; O3DEFAULT-NEXT:    [[TMP53:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 44
+; O3DEFAULT-NEXT:    [[TMP54:%.*]] = bitcast i32* [[TMP53]] to <4 x i32>*
+; O3DEFAULT-NEXT:    [[WIDE_LOAD_11:%.*]] = load <4 x i32>, <4 x i32>* [[TMP54]], align 4
+; O3DEFAULT-NEXT:    [[TMP55:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT2]]
+; O3DEFAULT-NEXT:    [[TMP56:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 44
+; O3DEFAULT-NEXT:    [[TMP57:%.*]] = bitcast i32* [[TMP56]] to <4 x i32>*
+; O3DEFAULT-NEXT:    store <4 x i32> [[TMP55]], <4 x i32>* [[TMP57]], align 4
+; O3DEFAULT-NEXT:    [[TMP58:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 48
+; O3DEFAULT-NEXT:    [[TMP59:%.*]] = bitcast i32* [[TMP58]] to <4 x i32>*
+; O3DEFAULT-NEXT:    [[WIDE_LOAD_12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP59]], align 4
+; O3DEFAULT-NEXT:    [[TMP60:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT2]]
+; O3DEFAULT-NEXT:    [[TMP61:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 48
+; O3DEFAULT-NEXT:    [[TMP62:%.*]] = bitcast i32* [[TMP61]] to <4 x i32>*
+; O3DEFAULT-NEXT:    store <4 x i32> [[TMP60]], <4 x i32>* [[TMP62]], align 4
+; O3DEFAULT-NEXT:    [[TMP63:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 52
+; O3DEFAULT-NEXT:    [[TMP64:%.*]] = bitcast i32* [[TMP63]] to <4 x i32>*
+; O3DEFAULT-NEXT:    [[WIDE_LOAD_13:%.*]] = load <4 x i32>, <4 x i32>* [[TMP64]], align 4
+; O3DEFAULT-NEXT:    [[TMP65:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT2]]
+; O3DEFAULT-NEXT:    [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 52
+; O3DEFAULT-NEXT:    [[TMP67:%.*]] = bitcast i32* [[TMP66]] to <4 x i32>*
+; O3DEFAULT-NEXT:    store <4 x i32> [[TMP65]], <4 x i32>* [[TMP67]], align 4
+; O3DEFAULT-NEXT:    [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 56
+; O3DEFAULT-NEXT:    [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>*
+; O3DEFAULT-NEXT:    [[WIDE_LOAD_14:%.*]] = load <4 x i32>, <4 x i32>* [[TMP69]], align 4
+; O3DEFAULT-NEXT:    [[TMP70:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT2]]
+; O3DEFAULT-NEXT:    [[TMP71:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 56
+; O3DEFAULT-NEXT:    [[TMP72:%.*]] = bitcast i32* [[TMP71]] to <4 x i32>*
+; O3DEFAULT-NEXT:    store <4 x i32> [[TMP70]], <4 x i32>* [[TMP72]], align 4
+; O3DEFAULT-NEXT:    [[TMP73:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 60
+; O3DEFAULT-NEXT:    [[TMP74:%.*]] = bitcast i32* [[TMP73]] to <4 x i32>*
+; O3DEFAULT-NEXT:    [[WIDE_LOAD_15:%.*]] = load <4 x i32>, <4 x i32>* [[TMP74]], align 4
+; O3DEFAULT-NEXT:    [[TMP75:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT2]]
+; O3DEFAULT-NEXT:    [[TMP76:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 60
+; O3DEFAULT-NEXT:    [[TMP77:%.*]] = bitcast i32* [[TMP76]] to <4 x i32>*
+; O3DEFAULT-NEXT:    store <4 x i32> [[TMP75]], <4 x i32>* [[TMP77]], align 4
+; O3DEFAULT-NEXT:    [[TMP78:%.*]] = load i32, i32* [[A]], align 4
+; O3DEFAULT-NEXT:    ret i32 [[TMP78]]
+;
+; Os-LABEL: @enabled(
+; Os-NEXT:  entry:
+; Os-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> undef, i32 [[N:%.*]], i32 0
+; Os-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> undef, <4 x i32> zeroinitializer
+; Os-NEXT:    [[TMP0:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>*
+; Os-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; Os-NEXT:    [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT2]]
+; Os-NEXT:    [[TMP2:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>*
+; Os-NEXT:    store <4 x i32> [[TMP1]], <4 x i32>* [[TMP2]], align 4
+; Os-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 4
+; Os-NEXT:    [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>*
+; Os-NEXT:    [[WIDE_LOAD_1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4
+; Os-NEXT:    [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT2]]
+; Os-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 4
+; Os-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
+; Os-NEXT:    store <4 x i32> [[TMP5]], <4 x i32>* [[TMP7]], align 4
+; Os-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 8
+; Os-NEXT:    [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>*
+; Os-NEXT:    [[WIDE_LOAD_2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4
+; Os-NEXT:    [[TMP10:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT2]]
+; Os-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 8
+; Os-NEXT:    [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>*
+; Os-NEXT:    store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4
+; Os-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 12
+; Os-NEXT:    [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <4 x i32>*
+; Os-NEXT:    [[WIDE_LOAD_3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP14]], align 4
+; Os-NEXT:    [[TMP15:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT2]]
+; Os-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 12
+; Os-NEXT:    [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <4 x i32>*
+; Os-NEXT:    store <4 x i32> [[TMP15]], <4 x i32>* [[TMP17]], align 4
+; Os-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 16
+; Os-NEXT:    [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>*
+; Os-NEXT:    [[WIDE_LOAD_4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP19]], align 4
+; Os-NEXT:    [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT2]]
+; Os-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 16
+; Os-NEXT:    [[TMP22:%.*]] = bitcast i32* [[TMP21]] to <4 x i32>*
+; Os-NEXT:    store <4 x i32> [[TMP20]], <4 x i32>* [[TMP22]], align 4
+; Os-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 20
+; Os-NEXT:    [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <4 x i32>*
+; Os-NEXT:    [[WIDE_LOAD_5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP24]], align 4
+; Os-NEXT:    [[TMP25:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT2]]
+; Os-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 20
+; Os-NEXT:    [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>*
+; Os-NEXT:    store <4 x i32> [[TMP25]], <4 x i32>* [[TMP27]], align 4
+; Os-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 24
+; Os-NEXT:    [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>*
+; Os-NEXT:    [[WIDE_LOAD_6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP29]], align 4
+; Os-NEXT:    [[TMP30:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT2]]
+; Os-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 24
+; Os-NEXT:    [[TMP32:%.*]] = bitcast i32* [[TMP31]] to <4 x i32>*
+; Os-NEXT:    store <4 x i32> [[TMP30]], <4 x i32>* [[TMP32]], align 4
+; Os-NEXT:    [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 28
+; Os-NEXT:    [[TMP34:%.*]] = bitcast i32* [[TMP33]] to <4 x i32>*
+; Os-NEXT:    [[WIDE_LOAD_7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP34]], align 4
+; Os-NEXT:    [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT2]]
+; Os-NEXT:    [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 28
+; Os-NEXT:    [[TMP37:%.*]] = bitcast i32* [[TMP36]] to <4 x i32>*
+; Os-NEXT:    store <4 x i32> [[TMP35]], <4 x i32>* [[TMP37]], align 4
+; Os-NEXT:    [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 32
+; Os-NEXT:    [[TMP39:%.*]] = bitcast i32* [[TMP38]] to <4 x i32>*
+; Os-NEXT:    [[WIDE_LOAD_8:%.*]] = load <4 x i32>, <4 x i32>* [[TMP39]], align 4
+; Os-NEXT:    [[TMP40:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT2]]
+; Os-NEXT:    [[TMP41:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 32
+; Os-NEXT:    [[TMP42:%.*]] = bitcast i32* [[TMP41]] to <4 x i32>*
+; Os-NEXT:    store <4 x i32> [[TMP40]], <4 x i32>* [[TMP42]], align 4
+; Os-NEXT:    [[TMP43:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 36
+; Os-NEXT:    [[TMP44:%.*]] = bitcast i32* [[TMP43]] to <4 x i32>*
+; Os-NEXT:    [[WIDE_LOAD_9:%.*]] = load <4 x i32>, <4 x i32>* [[TMP44]], align 4
+; Os-NEXT:    [[TMP45:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT2]]
+; Os-NEXT:    [[TMP46:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 36
+; Os-NEXT:    [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <4 x i32>*
+; Os-NEXT:    store <4 x i32> [[TMP45]], <4 x i32>* [[TMP47]], align 4
+; Os-NEXT:    [[TMP48:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 40
+; Os-NEXT:    [[TMP49:%.*]] = bitcast i32* [[TMP48]] to <4 x i32>*
+; Os-NEXT:    [[WIDE_LOAD_10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP49]], align 4
+; Os-NEXT:    [[TMP50:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT2]]
+; Os-NEXT:    [[TMP51:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 40
+; Os-NEXT:    [[TMP52:%.*]] = bitcast i32* [[TMP51]] to <4 x i32>*
+; Os-NEXT:    store <4 x i32> [[TMP50]], <4 x i32>* [[TMP52]], align 4
+; Os-NEXT:    [[TMP53:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 44
+; Os-NEXT:    [[TMP54:%.*]] = bitcast i32* [[TMP53]] to <4 x i32>*
+; Os-NEXT:    [[WIDE_LOAD_11:%.*]] = load <4 x i32>, <4 x i32>* [[TMP54]], align 4
+; Os-NEXT:    [[TMP55:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT2]]
+; Os-NEXT:    [[TMP56:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 44
+; Os-NEXT:    [[TMP57:%.*]] = bitcast i32* [[TMP56]] to <4 x i32>*
+; Os-NEXT:    store <4 x i32> [[TMP55]], <4 x i32>* [[TMP57]], align 4
+; Os-NEXT:    [[TMP58:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 48
+; Os-NEXT:    [[TMP59:%.*]] = bitcast i32* [[TMP58]] to <4 x i32>*
+; Os-NEXT:    [[WIDE_LOAD_12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP59]], align 4
+; Os-NEXT:    [[TMP60:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT2]]
+; Os-NEXT:    [[TMP61:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 48
+; Os-NEXT:    [[TMP62:%.*]] = bitcast i32* [[TMP61]] to <4 x i32>*
+; Os-NEXT:    store <4 x i32> [[TMP60]], <4 x i32>* [[TMP62]], align 4
+; Os-NEXT:    [[TMP63:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 52
+; Os-NEXT:    [[TMP64:%.*]] = bitcast i32* [[TMP63]] to <4 x i32>*
+; Os-NEXT:    [[WIDE_LOAD_13:%.*]] = load <4 x i32>, <4 x i32>* [[TMP64]], align 4
+; Os-NEXT:    [[TMP65:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT2]]
+; Os-NEXT:    [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 52
+; Os-NEXT:    [[TMP67:%.*]] = bitcast i32* [[TMP66]] to <4 x i32>*
+; Os-NEXT:    store <4 x i32> [[TMP65]], <4 x i32>* [[TMP67]], align 4
+; Os-NEXT:    [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 56
+; Os-NEXT:    [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>*
+; Os-NEXT:    [[WIDE_LOAD_14:%.*]] = load <4 x i32>, <4 x i32>* [[TMP69]], align 4
+; Os-NEXT:    [[TMP70:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT2]]
+; Os-NEXT:    [[TMP71:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 56
+; Os-NEXT:    [[TMP72:%.*]] = bitcast i32* [[TMP71]] to <4 x i32>*
+; Os-NEXT:    store <4 x i32> [[TMP70]], <4 x i32>* [[TMP72]], align 4
+; Os-NEXT:    [[TMP73:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 60
+; Os-NEXT:    [[TMP74:%.*]] = bitcast i32* [[TMP73]] to <4 x i32>*
+; Os-NEXT:    [[WIDE_LOAD_15:%.*]] = load <4 x i32>, <4 x i32>* [[TMP74]], align 4
+; Os-NEXT:    [[TMP75:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT2]]
+; Os-NEXT:    [[TMP76:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 60
+; Os-NEXT:    [[TMP77:%.*]] = bitcast i32* [[TMP76]] to <4 x i32>*
+; Os-NEXT:    store <4 x i32> [[TMP75]], <4 x i32>* [[TMP77]], align 4
+; Os-NEXT:    [[TMP78:%.*]] = load i32, i32* [[A]], align 4
+; Os-NEXT:    ret i32 [[TMP78]]
+;
+; Oz-LABEL: @enabled(
+; Oz-NEXT:  entry:
+; Oz-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> undef, i32 [[N:%.*]], i32 0
+; Oz-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> undef, <4 x i32> zeroinitializer
+; Oz-NEXT:    [[TMP0:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>*
+; Oz-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; Oz-NEXT:    [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT2]]
+; Oz-NEXT:    [[TMP2:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>*
+; Oz-NEXT:    store <4 x i32> [[TMP1]], <4 x i32>* [[TMP2]], align 4
+; Oz-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 4
+; Oz-NEXT:    [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>*
+; Oz-NEXT:    [[WIDE_LOAD_1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4
+; Oz-NEXT:    [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT2]]
+; Oz-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 4
+; Oz-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
+; Oz-NEXT:    store <4 x i32> [[TMP5]], <4 x i32>* [[TMP7]], align 4
+; Oz-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 8
+; Oz-NEXT:    [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>*
+; Oz-NEXT:    [[WIDE_LOAD_2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4
+; Oz-NEXT:    [[TMP10:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT2]]
+; Oz-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 8
+; Oz-NEXT:    [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>*
+; Oz-NEXT:    store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4
+; Oz-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 12
+; Oz-NEXT:    [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <4 x i32>*
+; Oz-NEXT:    [[WIDE_LOAD_3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP14]], align 4
+; Oz-NEXT:    [[TMP15:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT2]]
+; Oz-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 12
+; Oz-NEXT:    [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <4 x i32>*
+; Oz-NEXT:    store <4 x i32> [[TMP15]], <4 x i32>* [[TMP17]], align 4
+; Oz-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 16
+; Oz-NEXT:    [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>*
+; Oz-NEXT:    [[WIDE_LOAD_4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP19]], align 4
+; Oz-NEXT:    [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT2]]
+; Oz-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 16
+; Oz-NEXT:    [[TMP22:%.*]] = bitcast i32* [[TMP21]] to <4 x i32>*
+; Oz-NEXT:    store <4 x i32> [[TMP20]], <4 x i32>* [[TMP22]], align 4
+; Oz-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 20
+; Oz-NEXT:    [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <4 x i32>*
+; Oz-NEXT:    [[WIDE_LOAD_5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP24]], align 4
+; Oz-NEXT:    [[TMP25:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT2]]
+; Oz-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 20
+; Oz-NEXT:    [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>*
+; Oz-NEXT:    store <4 x i32> [[TMP25]], <4 x i32>* [[TMP27]], align 4
+; Oz-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 24
+; Oz-NEXT:    [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>*
+; Oz-NEXT:    [[WIDE_LOAD_6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP29]], align 4
+; Oz-NEXT:    [[TMP30:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT2]]
+; Oz-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 24
+; Oz-NEXT:    [[TMP32:%.*]] = bitcast i32* [[TMP31]] to <4 x i32>*
+; Oz-NEXT:    store <4 x i32> [[TMP30]], <4 x i32>* [[TMP32]], align 4
+; Oz-NEXT:    [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 28
+; Oz-NEXT:    [[TMP34:%.*]] = bitcast i32* [[TMP33]] to <4 x i32>*
+; Oz-NEXT:    [[WIDE_LOAD_7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP34]], align 4
+; Oz-NEXT:    [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT2]]
+; Oz-NEXT:    [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 28
+; Oz-NEXT:    [[TMP37:%.*]] = bitcast i32* [[TMP36]] to <4 x i32>*
+; Oz-NEXT:    store <4 x i32> [[TMP35]], <4 x i32>* [[TMP37]], align 4
+; Oz-NEXT:    [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 32
+; Oz-NEXT:    [[TMP39:%.*]] = bitcast i32* [[TMP38]] to <4 x i32>*
+; Oz-NEXT:    [[WIDE_LOAD_8:%.*]] = load <4 x i32>, <4 x i32>* [[TMP39]], align 4
+; Oz-NEXT:    [[TMP40:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT2]]
+; Oz-NEXT:    [[TMP41:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 32
+; Oz-NEXT:    [[TMP42:%.*]] = bitcast i32* [[TMP41]] to <4 x i32>*
+; Oz-NEXT:    store <4 x i32> [[TMP40]], <4 x i32>* [[TMP42]], align 4
+; Oz-NEXT:    [[TMP43:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 36
+; Oz-NEXT:    [[TMP44:%.*]] = bitcast i32* [[TMP43]] to <4 x i32>*
+; Oz-NEXT:    [[WIDE_LOAD_9:%.*]] = load <4 x i32>, <4 x i32>* [[TMP44]], align 4
+; Oz-NEXT:    [[TMP45:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT2]]
+; Oz-NEXT:    [[TMP46:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 36
+; Oz-NEXT:    [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <4 x i32>*
+; Oz-NEXT:    store <4 x i32> [[TMP45]], <4 x i32>* [[TMP47]], align 4
+; Oz-NEXT:    [[TMP48:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 40
+; Oz-NEXT:    [[TMP49:%.*]] = bitcast i32* [[TMP48]] to <4 x i32>*
+; Oz-NEXT:    [[WIDE_LOAD_10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP49]], align 4
+; Oz-NEXT:    [[TMP50:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT2]]
+; Oz-NEXT:    [[TMP51:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 40
+; Oz-NEXT:    [[TMP52:%.*]] = bitcast i32* [[TMP51]] to <4 x i32>*
+; Oz-NEXT:    store <4 x i32> [[TMP50]], <4 x i32>* [[TMP52]], align 4
+; Oz-NEXT:    [[TMP53:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 44
+; Oz-NEXT:    [[TMP54:%.*]] = bitcast i32* [[TMP53]] to <4 x i32>*
+; Oz-NEXT:    [[WIDE_LOAD_11:%.*]] = load <4 x i32>, <4 x i32>* [[TMP54]], align 4
+; Oz-NEXT:    [[TMP55:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT2]]
+; Oz-NEXT:    [[TMP56:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 44
+; Oz-NEXT:    [[TMP57:%.*]] = bitcast i32* [[TMP56]] to <4 x i32>*
+; Oz-NEXT:    store <4 x i32> [[TMP55]], <4 x i32>* [[TMP57]], align 4
+; Oz-NEXT:    [[TMP58:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 48
+; Oz-NEXT:    [[TMP59:%.*]] = bitcast i32* [[TMP58]] to <4 x i32>*
+; Oz-NEXT:    [[WIDE_LOAD_12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP59]], align 4
+; Oz-NEXT:    [[TMP60:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT2]]
+; Oz-NEXT:    [[TMP61:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 48
+; Oz-NEXT:    [[TMP62:%.*]] = bitcast i32* [[TMP61]] to <4 x i32>*
+; Oz-NEXT:    store <4 x i32> [[TMP60]], <4 x i32>* [[TMP62]], align 4
+; Oz-NEXT:    [[TMP63:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 52
+; Oz-NEXT:    [[TMP64:%.*]] = bitcast i32* [[TMP63]] to <4 x i32>*
+; Oz-NEXT:    [[WIDE_LOAD_13:%.*]] = load <4 x i32>, <4 x i32>* [[TMP64]], align 4
+; Oz-NEXT:    [[TMP65:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT2]]
+; Oz-NEXT:    [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 52
+; Oz-NEXT:    [[TMP67:%.*]] = bitcast i32* [[TMP66]] to <4 x i32>*
+; Oz-NEXT:    store <4 x i32> [[TMP65]], <4 x i32>* [[TMP67]], align 4
+; Oz-NEXT:    [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 56
+; Oz-NEXT:    [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>*
+; Oz-NEXT:    [[WIDE_LOAD_14:%.*]] = load <4 x i32>, <4 x i32>* [[TMP69]], align 4
+; Oz-NEXT:    [[TMP70:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT2]]
+; Oz-NEXT:    [[TMP71:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 56
+; Oz-NEXT:    [[TMP72:%.*]] = bitcast i32* [[TMP71]] to <4 x i32>*
+; Oz-NEXT:    store <4 x i32> [[TMP70]], <4 x i32>* [[TMP72]], align 4
+; Oz-NEXT:    [[TMP73:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 60
+; Oz-NEXT:    [[TMP74:%.*]] = bitcast i32* [[TMP73]] to <4 x i32>*
+; Oz-NEXT:    [[WIDE_LOAD_15:%.*]] = load <4 x i32>, <4 x i32>* [[TMP74]], align 4
+; Oz-NEXT:    [[TMP75:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT2]]
+; Oz-NEXT:    [[TMP76:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 60
+; Oz-NEXT:    [[TMP77:%.*]] = bitcast i32* [[TMP76]] to <4 x i32>*
+; Oz-NEXT:    store <4 x i32> [[TMP75]], <4 x i32>* [[TMP77]], align 4
+; Oz-NEXT:    [[TMP78:%.*]] = load i32, i32* [[A]], align 4
+; Oz-NEXT:    ret i32 [[TMP78]]
+;
+; O1VEC-LABEL: @enabled(
+; O1VEC-NEXT:  entry:
+; O1VEC-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> undef, i32 [[N:%.*]], i32 0
+; O1VEC-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> undef, <4 x i32> zeroinitializer
+; O1VEC-NEXT:    [[TMP0:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>*
+; O1VEC-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; O1VEC-NEXT:    [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT2]]
+; O1VEC-NEXT:    [[TMP2:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>*
+; O1VEC-NEXT:    store <4 x i32> [[TMP1]], <4 x i32>* [[TMP2]], align 4
+; O1VEC-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 4
+; O1VEC-NEXT:    [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>*
+; O1VEC-NEXT:    [[WIDE_LOAD_1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4
+; O1VEC-NEXT:    [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT2]]
+; O1VEC-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 4
+; O1VEC-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
+; O1VEC-NEXT:    store <4 x i32> [[TMP5]], <4 x i32>* [[TMP7]], align 4
+; O1VEC-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 8
+; O1VEC-NEXT:    [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>*
+; O1VEC-NEXT:    [[WIDE_LOAD_2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4
+; O1VEC-NEXT:    [[TMP10:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT2]]
+; O1VEC-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 8
+; O1VEC-NEXT:    [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>*
+; O1VEC-NEXT:    store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4
+; O1VEC-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 12
+; O1VEC-NEXT:    [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <4 x i32>*
+; O1VEC-NEXT:    [[WIDE_LOAD_3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP14]], align 4
+; O1VEC-NEXT:    [[TMP15:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT2]]
+; O1VEC-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 12
+; O1VEC-NEXT:    [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <4 x i32>*
+; O1VEC-NEXT:    store <4 x i32> [[TMP15]], <4 x i32>* [[TMP17]], align 4
+; O1VEC-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 16
+; O1VEC-NEXT:    [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>*
+; O1VEC-NEXT:    [[WIDE_LOAD_4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP19]], align 4
+; O1VEC-NEXT:    [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT2]]
+; O1VEC-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 16
+; O1VEC-NEXT:    [[TMP22:%.*]] = bitcast i32* [[TMP21]] to <4 x i32>*
+; O1VEC-NEXT:    store <4 x i32> [[TMP20]], <4 x i32>* [[TMP22]], align 4
+; O1VEC-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 20
+; O1VEC-NEXT:    [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <4 x i32>*
+; O1VEC-NEXT:    [[WIDE_LOAD_5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP24]], align 4
+; O1VEC-NEXT:    [[TMP25:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT2]]
+; O1VEC-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 20
+; O1VEC-NEXT:    [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>*
+; O1VEC-NEXT:    store <4 x i32> [[TMP25]], <4 x i32>* [[TMP27]], align 4
+; O1VEC-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 24
+; O1VEC-NEXT:    [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>*
+; O1VEC-NEXT:    [[WIDE_LOAD_6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP29]], align 4
+; O1VEC-NEXT:    [[TMP30:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT2]]
+; O1VEC-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 24
+; O1VEC-NEXT:    [[TMP32:%.*]] = bitcast i32* [[TMP31]] to <4 x i32>*
+; O1VEC-NEXT:    store <4 x i32> [[TMP30]], <4 x i32>* [[TMP32]], align 4
+; O1VEC-NEXT:    [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 28
+; O1VEC-NEXT:    [[TMP34:%.*]] = bitcast i32* [[TMP33]] to <4 x i32>*
+; O1VEC-NEXT:    [[WIDE_LOAD_7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP34]], align 4
+; O1VEC-NEXT:    [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT2]]
+; O1VEC-NEXT:    [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 28
+; O1VEC-NEXT:    [[TMP37:%.*]] = bitcast i32* [[TMP36]] to <4 x i32>*
+; O1VEC-NEXT:    store <4 x i32> [[TMP35]], <4 x i32>* [[TMP37]], align 4
+; O1VEC-NEXT:    [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 32
+; O1VEC-NEXT:    [[TMP39:%.*]] = bitcast i32* [[TMP38]] to <4 x i32>*
+; O1VEC-NEXT:    [[WIDE_LOAD_8:%.*]] = load <4 x i32>, <4 x i32>* [[TMP39]], align 4
+; O1VEC-NEXT:    [[TMP40:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT2]]
+; O1VEC-NEXT:    [[TMP41:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 32
+; O1VEC-NEXT:    [[TMP42:%.*]] = bitcast i32* [[TMP41]] to <4 x i32>*
+; O1VEC-NEXT:    store <4 x i32> [[TMP40]], <4 x i32>* [[TMP42]], align 4
+; O1VEC-NEXT:    [[TMP43:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 36
+; O1VEC-NEXT:    [[TMP44:%.*]] = bitcast i32* [[TMP43]] to <4 x i32>*
+; O1VEC-NEXT:    [[WIDE_LOAD_9:%.*]] = load <4 x i32>, <4 x i32>* [[TMP44]], align 4
+; O1VEC-NEXT:    [[TMP45:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT2]]
+; O1VEC-NEXT:    [[TMP46:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 36
+; O1VEC-NEXT:    [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <4 x i32>*
+; O1VEC-NEXT:    store <4 x i32> [[TMP45]], <4 x i32>* [[TMP47]], align 4
+; O1VEC-NEXT:    [[TMP48:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 40
+; O1VEC-NEXT:    [[TMP49:%.*]] = bitcast i32* [[TMP48]] to <4 x i32>*
+; O1VEC-NEXT:    [[WIDE_LOAD_10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP49]], align 4
+; O1VEC-NEXT:    [[TMP50:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT2]]
+; O1VEC-NEXT:    [[TMP51:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 40
+; O1VEC-NEXT:    [[TMP52:%.*]] = bitcast i32* [[TMP51]] to <4 x i32>*
+; O1VEC-NEXT:    store <4 x i32> [[TMP50]], <4 x i32>* [[TMP52]], align 4
+; O1VEC-NEXT:    [[TMP53:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 44
+; O1VEC-NEXT:    [[TMP54:%.*]] = bitcast i32* [[TMP53]] to <4 x i32>*
+; O1VEC-NEXT:    [[WIDE_LOAD_11:%.*]] = load <4 x i32>, <4 x i32>* [[TMP54]], align 4
+; O1VEC-NEXT:    [[TMP55:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT2]]
+; O1VEC-NEXT:    [[TMP56:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 44
+; O1VEC-NEXT:    [[TMP57:%.*]] = bitcast i32* [[TMP56]] to <4 x i32>*
+; O1VEC-NEXT:    store <4 x i32> [[TMP55]], <4 x i32>* [[TMP57]], align 4
+; O1VEC-NEXT:    [[TMP58:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 48
+; O1VEC-NEXT:    [[TMP59:%.*]] = bitcast i32* [[TMP58]] to <4 x i32>*
+; O1VEC-NEXT:    [[WIDE_LOAD_12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP59]], align 4
+; O1VEC-NEXT:    [[TMP60:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT2]]
+; O1VEC-NEXT:    [[TMP61:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 48
+; O1VEC-NEXT:    [[TMP62:%.*]] = bitcast i32* [[TMP61]] to <4 x i32>*
+; O1VEC-NEXT:    store <4 x i32> [[TMP60]], <4 x i32>* [[TMP62]], align 4
+; O1VEC-NEXT:    [[TMP63:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 52
+; O1VEC-NEXT:    [[TMP64:%.*]] = bitcast i32* [[TMP63]] to <4 x i32>*
+; O1VEC-NEXT:    [[WIDE_LOAD_13:%.*]] = load <4 x i32>, <4 x i32>* [[TMP64]], align 4
+; O1VEC-NEXT:    [[TMP65:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT2]]
+; O1VEC-NEXT:    [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 52
+; O1VEC-NEXT:    [[TMP67:%.*]] = bitcast i32* [[TMP66]] to <4 x i32>*
+; O1VEC-NEXT:    store <4 x i32> [[TMP65]], <4 x i32>* [[TMP67]], align 4
+; O1VEC-NEXT:    [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 56
+; O1VEC-NEXT:    [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>*
+; O1VEC-NEXT:    [[WIDE_LOAD_14:%.*]] = load <4 x i32>, <4 x i32>* [[TMP69]], align 4
+; O1VEC-NEXT:    [[TMP70:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT2]]
+; O1VEC-NEXT:    [[TMP71:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 56
+; O1VEC-NEXT:    [[TMP72:%.*]] = bitcast i32* [[TMP71]] to <4 x i32>*
+; O1VEC-NEXT:    store <4 x i32> [[TMP70]], <4 x i32>* [[TMP72]], align 4
+; O1VEC-NEXT:    [[TMP73:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 60
+; O1VEC-NEXT:    [[TMP74:%.*]] = bitcast i32* [[TMP73]] to <4 x i32>*
+; O1VEC-NEXT:    [[WIDE_LOAD_15:%.*]] = load <4 x i32>, <4 x i32>* [[TMP74]], align 4
+; O1VEC-NEXT:    [[TMP75:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT2]]
+; O1VEC-NEXT:    [[TMP76:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 60
+; O1VEC-NEXT:    [[TMP77:%.*]] = bitcast i32* [[TMP76]] to <4 x i32>*
+; O1VEC-NEXT:    store <4 x i32> [[TMP75]], <4 x i32>* [[TMP77]], align 4
+; O1VEC-NEXT:    [[TMP78:%.*]] = load i32, i32* [[A]], align 4
+; O1VEC-NEXT:    ret i32 [[TMP78]]
+;
+; OzVEC-LABEL: @enabled(
+; OzVEC-NEXT:  entry:
+; OzVEC-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> undef, i32 [[N:%.*]], i32 0
+; OzVEC-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> undef, <4 x i32> zeroinitializer
+; OzVEC-NEXT:    [[TMP0:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>*
+; OzVEC-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; OzVEC-NEXT:    [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT2]]
+; OzVEC-NEXT:    [[TMP2:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>*
+; OzVEC-NEXT:    store <4 x i32> [[TMP1]], <4 x i32>* [[TMP2]], align 4
+; OzVEC-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 4
+; OzVEC-NEXT:    [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>*
+; OzVEC-NEXT:    [[WIDE_LOAD_1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4
+; OzVEC-NEXT:    [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT2]]
+; OzVEC-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 4
+; OzVEC-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
+; OzVEC-NEXT:    store <4 x i32> [[TMP5]], <4 x i32>* [[TMP7]], align 4
+; OzVEC-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 8
+; OzVEC-NEXT:    [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>*
+; OzVEC-NEXT:    [[WIDE_LOAD_2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4
+; OzVEC-NEXT:    [[TMP10:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT2]]
+; OzVEC-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 8
+; OzVEC-NEXT:    [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>*
+; OzVEC-NEXT:    store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4
+; OzVEC-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 12
+; OzVEC-NEXT:    [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <4 x i32>*
+; OzVEC-NEXT:    [[WIDE_LOAD_3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP14]], align 4
+; OzVEC-NEXT:    [[TMP15:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT2]]
+; OzVEC-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 12
+; OzVEC-NEXT:    [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <4 x i32>*
+; OzVEC-NEXT:    store <4 x i32> [[TMP15]], <4 x i32>* [[TMP17]], align 4
+; OzVEC-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 16
+; OzVEC-NEXT:    [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>*
+; OzVEC-NEXT:    [[WIDE_LOAD_4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP19]], align 4
+; OzVEC-NEXT:    [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT2]]
+; OzVEC-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 16
+; OzVEC-NEXT:    [[TMP22:%.*]] = bitcast i32* [[TMP21]] to <4 x i32>*
+; OzVEC-NEXT:    store <4 x i32> [[TMP20]], <4 x i32>* [[TMP22]], align 4
+; OzVEC-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 20
+; OzVEC-NEXT:    [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <4 x i32>*
+; OzVEC-NEXT:    [[WIDE_LOAD_5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP24]], align 4
+; OzVEC-NEXT:    [[TMP25:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT2]]
+; OzVEC-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 20
+; OzVEC-NEXT:    [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>*
+; OzVEC-NEXT:    store <4 x i32> [[TMP25]], <4 x i32>* [[TMP27]], align 4
+; OzVEC-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 24
+; OzVEC-NEXT:    [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>*
+; OzVEC-NEXT:    [[WIDE_LOAD_6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP29]], align 4
+; OzVEC-NEXT:    [[TMP30:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT2]]
+; OzVEC-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 24
+; OzVEC-NEXT:    [[TMP32:%.*]] = bitcast i32* [[TMP31]] to <4 x i32>*
+; OzVEC-NEXT:    store <4 x i32> [[TMP30]], <4 x i32>* [[TMP32]], align 4
+; OzVEC-NEXT:    [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 28
+; OzVEC-NEXT:    [[TMP34:%.*]] = bitcast i32* [[TMP33]] to <4 x i32>*
+; OzVEC-NEXT:    [[WIDE_LOAD_7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP34]], align 4
+; OzVEC-NEXT:    [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT2]]
+; OzVEC-NEXT:    [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 28
+; OzVEC-NEXT:    [[TMP37:%.*]] = bitcast i32* [[TMP36]] to <4 x i32>*
+; OzVEC-NEXT:    store <4 x i32> [[TMP35]], <4 x i32>* [[TMP37]], align 4
+; OzVEC-NEXT:    [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 32
+; OzVEC-NEXT:    [[TMP39:%.*]] = bitcast i32* [[TMP38]] to <4 x i32>*
+; OzVEC-NEXT:    [[WIDE_LOAD_8:%.*]] = load <4 x i32>, <4 x i32>* [[TMP39]], align 4
+; OzVEC-NEXT:    [[TMP40:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT2]]
+; OzVEC-NEXT:    [[TMP41:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 32
+; OzVEC-NEXT:    [[TMP42:%.*]] = bitcast i32* [[TMP41]] to <4 x i32>*
+; OzVEC-NEXT:    store <4 x i32> [[TMP40]], <4 x i32>* [[TMP42]], align 4
+; OzVEC-NEXT:    [[TMP43:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 36
+; OzVEC-NEXT:    [[TMP44:%.*]] = bitcast i32* [[TMP43]] to <4 x i32>*
+; OzVEC-NEXT:    [[WIDE_LOAD_9:%.*]] = load <4 x i32>, <4 x i32>* [[TMP44]], align 4
+; OzVEC-NEXT:    [[TMP45:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT2]]
+; OzVEC-NEXT:    [[TMP46:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 36
+; OzVEC-NEXT:    [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <4 x i32>*
+; OzVEC-NEXT:    store <4 x i32> [[TMP45]], <4 x i32>* [[TMP47]], align 4
+; OzVEC-NEXT:    [[TMP48:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 40
+; OzVEC-NEXT:    [[TMP49:%.*]] = bitcast i32* [[TMP48]] to <4 x i32>*
+; OzVEC-NEXT:    [[WIDE_LOAD_10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP49]], align 4
+; OzVEC-NEXT:    [[TMP50:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT2]]
+; OzVEC-NEXT:    [[TMP51:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 40
+; OzVEC-NEXT:    [[TMP52:%.*]] = bitcast i32* [[TMP51]] to <4 x i32>*
+; OzVEC-NEXT:    store <4 x i32> [[TMP50]], <4 x i32>* [[TMP52]], align 4
+; OzVEC-NEXT:    [[TMP53:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 44
+; OzVEC-NEXT:    [[TMP54:%.*]] = bitcast i32* [[TMP53]] to <4 x i32>*
+; OzVEC-NEXT:    [[WIDE_LOAD_11:%.*]] = load <4 x i32>, <4 x i32>* [[TMP54]], align 4
+; OzVEC-NEXT:    [[TMP55:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT2]]
+; OzVEC-NEXT:    [[TMP56:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 44
+; OzVEC-NEXT:    [[TMP57:%.*]] = bitcast i32* [[TMP56]] to <4 x i32>*
+; OzVEC-NEXT:    store <4 x i32> [[TMP55]], <4 x i32>* [[TMP57]], align 4
+; OzVEC-NEXT:    [[TMP58:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 48
+; OzVEC-NEXT:    [[TMP59:%.*]] = bitcast i32* [[TMP58]] to <4 x i32>*
+; OzVEC-NEXT:    [[WIDE_LOAD_12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP59]], align 4
+; OzVEC-NEXT:    [[TMP60:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT2]]
+; OzVEC-NEXT:    [[TMP61:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 48
+; OzVEC-NEXT:    [[TMP62:%.*]] = bitcast i32* [[TMP61]] to <4 x i32>*
+; OzVEC-NEXT:    store <4 x i32> [[TMP60]], <4 x i32>* [[TMP62]], align 4
+; OzVEC-NEXT:    [[TMP63:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 52
+; OzVEC-NEXT:    [[TMP64:%.*]] = bitcast i32* [[TMP63]] to <4 x i32>*
+; OzVEC-NEXT:    [[WIDE_LOAD_13:%.*]] = load <4 x i32>, <4 x i32>* [[TMP64]], align 4
+; OzVEC-NEXT:    [[TMP65:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT2]]
+; OzVEC-NEXT:    [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 52
+; OzVEC-NEXT:    [[TMP67:%.*]] = bitcast i32* [[TMP66]] to <4 x i32>*
+; OzVEC-NEXT:    store <4 x i32> [[TMP65]], <4 x i32>* [[TMP67]], align 4
+; OzVEC-NEXT:    [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 56
+; OzVEC-NEXT:    [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>*
+; OzVEC-NEXT:    [[WIDE_LOAD_14:%.*]] = load <4 x i32>, <4 x i32>* [[TMP69]], align 4
+; OzVEC-NEXT:    [[TMP70:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT2]]
+; OzVEC-NEXT:    [[TMP71:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 56
+; OzVEC-NEXT:    [[TMP72:%.*]] = bitcast i32* [[TMP71]] to <4 x i32>*
+; OzVEC-NEXT:    store <4 x i32> [[TMP70]], <4 x i32>* [[TMP72]], align 4
+; OzVEC-NEXT:    [[TMP73:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 60
+; OzVEC-NEXT:    [[TMP74:%.*]] = bitcast i32* [[TMP73]] to <4 x i32>*
+; OzVEC-NEXT:    [[WIDE_LOAD_15:%.*]] = load <4 x i32>, <4 x i32>* [[TMP74]], align 4
+; OzVEC-NEXT:    [[TMP75:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT2]]
+; OzVEC-NEXT:    [[TMP76:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 60
+; OzVEC-NEXT:    [[TMP77:%.*]] = bitcast i32* [[TMP76]] to <4 x i32>*
+; OzVEC-NEXT:    store <4 x i32> [[TMP75]], <4 x i32>* [[TMP77]], align 4
+; OzVEC-NEXT:    [[TMP78:%.*]] = load i32, i32* [[A]], align 4
+; OzVEC-NEXT:    ret i32 [[TMP78]]
+;
+; O1VEC2-LABEL: @enabled(
+; O1VEC2-NEXT:  entry:
+; O1VEC2-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> undef, i32 [[N:%.*]], i32 0
+; O1VEC2-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> undef, <4 x i32> zeroinitializer
+; O1VEC2-NEXT:    [[TMP0:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>*
+; O1VEC2-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; O1VEC2-NEXT:    [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT2]]
+; O1VEC2-NEXT:    [[TMP2:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>*
+; O1VEC2-NEXT:    store <4 x i32> [[TMP1]], <4 x i32>* [[TMP2]], align 4
+; O1VEC2-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 4
+; O1VEC2-NEXT:    [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>*
+; O1VEC2-NEXT:    [[WIDE_LOAD_1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4
+; O1VEC2-NEXT:    [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT2]]
+; O1VEC2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 4
+; O1VEC2-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
+; O1VEC2-NEXT:    store <4 x i32> [[TMP5]], <4 x i32>* [[TMP7]], align 4
+; O1VEC2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 8
+; O1VEC2-NEXT:    [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>*
+; O1VEC2-NEXT:    [[WIDE_LOAD_2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4
+; O1VEC2-NEXT:    [[TMP10:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT2]]
+; O1VEC2-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 8
+; O1VEC2-NEXT:    [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>*
+; O1VEC2-NEXT:    store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4
+; O1VEC2-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 12
+; O1VEC2-NEXT:    [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <4 x i32>*
+; O1VEC2-NEXT:    [[WIDE_LOAD_3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP14]], align 4
+; O1VEC2-NEXT:    [[TMP15:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT2]]
+; O1VEC2-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 12
+; O1VEC2-NEXT:    [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <4 x i32>*
+; O1VEC2-NEXT:    store <4 x i32> [[TMP15]], <4 x i32>* [[TMP17]], align 4
+; O1VEC2-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 16
+; O1VEC2-NEXT:    [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>*
+; O1VEC2-NEXT:    [[WIDE_LOAD_4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP19]], align 4
+; O1VEC2-NEXT:    [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT2]]
+; O1VEC2-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 16
+; O1VEC2-NEXT:    [[TMP22:%.*]] = bitcast i32* [[TMP21]] to <4 x i32>*
+; O1VEC2-NEXT:    store <4 x i32> [[TMP20]], <4 x i32>* [[TMP22]], align 4
+; O1VEC2-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 20
+; O1VEC2-NEXT:    [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <4 x i32>*
+; O1VEC2-NEXT:    [[WIDE_LOAD_5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP24]], align 4
+; O1VEC2-NEXT:    [[TMP25:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT2]]
+; O1VEC2-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 20
+; O1VEC2-NEXT:    [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>*
+; O1VEC2-NEXT:    store <4 x i32> [[TMP25]], <4 x i32>* [[TMP27]], align 4
+; O1VEC2-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 24
+; O1VEC2-NEXT:    [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>*
+; O1VEC2-NEXT:    [[WIDE_LOAD_6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP29]], align 4
+; O1VEC2-NEXT:    [[TMP30:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT2]]
+; O1VEC2-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 24
+; O1VEC2-NEXT:    [[TMP32:%.*]] = bitcast i32* [[TMP31]] to <4 x i32>*
+; O1VEC2-NEXT:    store <4 x i32> [[TMP30]], <4 x i32>* [[TMP32]], align 4
+; O1VEC2-NEXT:    [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 28
+; O1VEC2-NEXT:    [[TMP34:%.*]] = bitcast i32* [[TMP33]] to <4 x i32>*
+; O1VEC2-NEXT:    [[WIDE_LOAD_7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP34]], align 4
+; O1VEC2-NEXT:    [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT2]]
+; O1VEC2-NEXT:    [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 28
+; O1VEC2-NEXT:    [[TMP37:%.*]] = bitcast i32* [[TMP36]] to <4 x i32>*
+; O1VEC2-NEXT:    store <4 x i32> [[TMP35]], <4 x i32>* [[TMP37]], align 4
+; O1VEC2-NEXT:    [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 32
+; O1VEC2-NEXT:    [[TMP39:%.*]] = bitcast i32* [[TMP38]] to <4 x i32>*
+; O1VEC2-NEXT:    [[WIDE_LOAD_8:%.*]] = load <4 x i32>, <4 x i32>* [[TMP39]], align 4
+; O1VEC2-NEXT:    [[TMP40:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT2]]
+; O1VEC2-NEXT:    [[TMP41:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 32
+; O1VEC2-NEXT:    [[TMP42:%.*]] = bitcast i32* [[TMP41]] to <4 x i32>*
+; O1VEC2-NEXT:    store <4 x i32> [[TMP40]], <4 x i32>* [[TMP42]], align 4
+; O1VEC2-NEXT:    [[TMP43:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 36
+; O1VEC2-NEXT:    [[TMP44:%.*]] = bitcast i32* [[TMP43]] to <4 x i32>*
+; O1VEC2-NEXT:    [[WIDE_LOAD_9:%.*]] = load <4 x i32>, <4 x i32>* [[TMP44]], align 4
+; O1VEC2-NEXT:    [[TMP45:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT2]]
+; O1VEC2-NEXT:    [[TMP46:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 36
+; O1VEC2-NEXT:    [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <4 x i32>*
+; O1VEC2-NEXT:    store <4 x i32> [[TMP45]], <4 x i32>* [[TMP47]], align 4
+; O1VEC2-NEXT:    [[TMP48:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 40
+; O1VEC2-NEXT:    [[TMP49:%.*]] = bitcast i32* [[TMP48]] to <4 x i32>*
+; O1VEC2-NEXT:    [[WIDE_LOAD_10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP49]], align 4
+; O1VEC2-NEXT:    [[TMP50:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT2]]
+; O1VEC2-NEXT:    [[TMP51:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 40
+; O1VEC2-NEXT:    [[TMP52:%.*]] = bitcast i32* [[TMP51]] to <4 x i32>*
+; O1VEC2-NEXT:    store <4 x i32> [[TMP50]], <4 x i32>* [[TMP52]], align 4
+; O1VEC2-NEXT:    [[TMP53:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 44
+; O1VEC2-NEXT:    [[TMP54:%.*]] = bitcast i32* [[TMP53]] to <4 x i32>*
+; O1VEC2-NEXT:    [[WIDE_LOAD_11:%.*]] = load <4 x i32>, <4 x i32>* [[TMP54]], align 4
+; O1VEC2-NEXT:    [[TMP55:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT2]]
+; O1VEC2-NEXT:    [[TMP56:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 44
+; O1VEC2-NEXT:    [[TMP57:%.*]] = bitcast i32* [[TMP56]] to <4 x i32>*
+; O1VEC2-NEXT:    store <4 x i32> [[TMP55]], <4 x i32>* [[TMP57]], align 4
+; O1VEC2-NEXT:    [[TMP58:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 48
+; O1VEC2-NEXT:    [[TMP59:%.*]] = bitcast i32* [[TMP58]] to <4 x i32>*
+; O1VEC2-NEXT:    [[WIDE_LOAD_12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP59]], align 4
+; O1VEC2-NEXT:    [[TMP60:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT2]]
+; O1VEC2-NEXT:    [[TMP61:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 48
+; O1VEC2-NEXT:    [[TMP62:%.*]] = bitcast i32* [[TMP61]] to <4 x i32>*
+; O1VEC2-NEXT:    store <4 x i32> [[TMP60]], <4 x i32>* [[TMP62]], align 4
+; O1VEC2-NEXT:    [[TMP63:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 52
+; O1VEC2-NEXT:    [[TMP64:%.*]] = bitcast i32* [[TMP63]] to <4 x i32>*
+; O1VEC2-NEXT:    [[WIDE_LOAD_13:%.*]] = load <4 x i32>, <4 x i32>* [[TMP64]], align 4
+; O1VEC2-NEXT:    [[TMP65:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT2]]
+; O1VEC2-NEXT:    [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 52
+; O1VEC2-NEXT:    [[TMP67:%.*]] = bitcast i32* [[TMP66]] to <4 x i32>*
+; O1VEC2-NEXT:    store <4 x i32> [[TMP65]], <4 x i32>* [[TMP67]], align 4
+; O1VEC2-NEXT:    [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 56
+; O1VEC2-NEXT:    [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>*
+; O1VEC2-NEXT:    [[WIDE_LOAD_14:%.*]] = load <4 x i32>, <4 x i32>* [[TMP69]], align 4
+; O1VEC2-NEXT:    [[TMP70:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT2]]
+; O1VEC2-NEXT:    [[TMP71:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 56
+; O1VEC2-NEXT:    [[TMP72:%.*]] = bitcast i32* [[TMP71]] to <4 x i32>*
+; O1VEC2-NEXT:    store <4 x i32> [[TMP70]], <4 x i32>* [[TMP72]], align 4
+; O1VEC2-NEXT:    [[TMP73:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 60
+; O1VEC2-NEXT:    [[TMP74:%.*]] = bitcast i32* [[TMP73]] to <4 x i32>*
+; O1VEC2-NEXT:    [[WIDE_LOAD_15:%.*]] = load <4 x i32>, <4 x i32>* [[TMP74]], align 4
+; O1VEC2-NEXT:    [[TMP75:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT2]]
+; O1VEC2-NEXT:    [[TMP76:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 60
+; O1VEC2-NEXT:    [[TMP77:%.*]] = bitcast i32* [[TMP76]] to <4 x i32>*
+; O1VEC2-NEXT:    store <4 x i32> [[TMP75]], <4 x i32>* [[TMP77]], align 4
+; O1VEC2-NEXT:    [[TMP78:%.*]] = load i32, i32* [[A]], align 4
+; O1VEC2-NEXT:    ret i32 [[TMP78]]
+;
+; OzVEC2-LABEL: @enabled(
+; OzVEC2-NEXT:  entry:
+; OzVEC2-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> undef, i32 [[N:%.*]], i32 0
+; OzVEC2-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> undef, <4 x i32> zeroinitializer
+; OzVEC2-NEXT:    [[TMP0:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>*
+; OzVEC2-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; OzVEC2-NEXT:    [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT2]]
+; OzVEC2-NEXT:    [[TMP2:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>*
+; OzVEC2-NEXT:    store <4 x i32> [[TMP1]], <4 x i32>* [[TMP2]], align 4
+; OzVEC2-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 4
+; OzVEC2-NEXT:    [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>*
+; OzVEC2-NEXT:    [[WIDE_LOAD_1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4
+; OzVEC2-NEXT:    [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT2]]
+; OzVEC2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 4
+; OzVEC2-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
+; OzVEC2-NEXT:    store <4 x i32> [[TMP5]], <4 x i32>* [[TMP7]], align 4
+; OzVEC2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 8
+; OzVEC2-NEXT:    [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>*
+; OzVEC2-NEXT:    [[WIDE_LOAD_2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4
+; OzVEC2-NEXT:    [[TMP10:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT2]]
+; OzVEC2-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 8
+; OzVEC2-NEXT:    [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>*
+; OzVEC2-NEXT:    store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4
+; OzVEC2-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 12
+; OzVEC2-NEXT:    [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <4 x i32>*
+; OzVEC2-NEXT:    [[WIDE_LOAD_3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP14]], align 4
+; OzVEC2-NEXT:    [[TMP15:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT2]]
+; OzVEC2-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 12
+; OzVEC2-NEXT:    [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <4 x i32>*
+; OzVEC2-NEXT:    store <4 x i32> [[TMP15]], <4 x i32>* [[TMP17]], align 4
+; OzVEC2-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 16
+; OzVEC2-NEXT:    [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>*
+; OzVEC2-NEXT:    [[WIDE_LOAD_4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP19]], align 4
+; OzVEC2-NEXT:    [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT2]]
+; OzVEC2-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 16
+; OzVEC2-NEXT:    [[TMP22:%.*]] = bitcast i32* [[TMP21]] to <4 x i32>*
+; OzVEC2-NEXT:    store <4 x i32> [[TMP20]], <4 x i32>* [[TMP22]], align 4
+; OzVEC2-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 20
+; OzVEC2-NEXT:    [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <4 x i32>*
+; OzVEC2-NEXT:    [[WIDE_LOAD_5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP24]], align 4
+; OzVEC2-NEXT:    [[TMP25:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT2]]
+; OzVEC2-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 20
+; OzVEC2-NEXT:    [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>*
+; OzVEC2-NEXT:    store <4 x i32> [[TMP25]], <4 x i32>* [[TMP27]], align 4
+; OzVEC2-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 24
+; OzVEC2-NEXT:    [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>*
+; OzVEC2-NEXT:    [[WIDE_LOAD_6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP29]], align 4
+; OzVEC2-NEXT:    [[TMP30:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT2]]
+; OzVEC2-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 24
+; OzVEC2-NEXT:    [[TMP32:%.*]] = bitcast i32* [[TMP31]] to <4 x i32>*
+; OzVEC2-NEXT:    store <4 x i32> [[TMP30]], <4 x i32>* [[TMP32]], align 4
+; OzVEC2-NEXT:    [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 28
+; OzVEC2-NEXT:    [[TMP34:%.*]] = bitcast i32* [[TMP33]] to <4 x i32>*
+; OzVEC2-NEXT:    [[WIDE_LOAD_7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP34]], align 4
+; OzVEC2-NEXT:    [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT2]]
+; OzVEC2-NEXT:    [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 28
+; OzVEC2-NEXT:    [[TMP37:%.*]] = bitcast i32* [[TMP36]] to <4 x i32>*
+; OzVEC2-NEXT:    store <4 x i32> [[TMP35]], <4 x i32>* [[TMP37]], align 4
+; OzVEC2-NEXT:    [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 32
+; OzVEC2-NEXT:    [[TMP39:%.*]] = bitcast i32* [[TMP38]] to <4 x i32>*
+; OzVEC2-NEXT:    [[WIDE_LOAD_8:%.*]] = load <4 x i32>, <4 x i32>* [[TMP39]], align 4
+; OzVEC2-NEXT:    [[TMP40:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT2]]
+; OzVEC2-NEXT:    [[TMP41:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 32
+; OzVEC2-NEXT:    [[TMP42:%.*]] = bitcast i32* [[TMP41]] to <4 x i32>*
+; OzVEC2-NEXT:    store <4 x i32> [[TMP40]], <4 x i32>* [[TMP42]], align 4
+; OzVEC2-NEXT:    [[TMP43:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 36
+; OzVEC2-NEXT:    [[TMP44:%.*]] = bitcast i32* [[TMP43]] to <4 x i32>*
+; OzVEC2-NEXT:    [[WIDE_LOAD_9:%.*]] = load <4 x i32>, <4 x i32>* [[TMP44]], align 4
+; OzVEC2-NEXT:    [[TMP45:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT2]]
+; OzVEC2-NEXT:    [[TMP46:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 36
+; OzVEC2-NEXT:    [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <4 x i32>*
+; OzVEC2-NEXT:    store <4 x i32> [[TMP45]], <4 x i32>* [[TMP47]], align 4
+; OzVEC2-NEXT:    [[TMP48:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 40
+; OzVEC2-NEXT:    [[TMP49:%.*]] = bitcast i32* [[TMP48]] to <4 x i32>*
+; OzVEC2-NEXT:    [[WIDE_LOAD_10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP49]], align 4
+; OzVEC2-NEXT:    [[TMP50:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT2]]
+; OzVEC2-NEXT:    [[TMP51:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 40
+; OzVEC2-NEXT:    [[TMP52:%.*]] = bitcast i32* [[TMP51]] to <4 x i32>*
+; OzVEC2-NEXT:    store <4 x i32> [[TMP50]], <4 x i32>* [[TMP52]], align 4
+; OzVEC2-NEXT:    [[TMP53:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 44
+; OzVEC2-NEXT:    [[TMP54:%.*]] = bitcast i32* [[TMP53]] to <4 x i32>*
+; OzVEC2-NEXT:    [[WIDE_LOAD_11:%.*]] = load <4 x i32>, <4 x i32>* [[TMP54]], align 4
+; OzVEC2-NEXT:    [[TMP55:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT2]]
+; OzVEC2-NEXT:    [[TMP56:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 44
+; OzVEC2-NEXT:    [[TMP57:%.*]] = bitcast i32* [[TMP56]] to <4 x i32>*
+; OzVEC2-NEXT:    store <4 x i32> [[TMP55]], <4 x i32>* [[TMP57]], align 4
+; OzVEC2-NEXT:    [[TMP58:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 48
+; OzVEC2-NEXT:    [[TMP59:%.*]] = bitcast i32* [[TMP58]] to <4 x i32>*
+; OzVEC2-NEXT:    [[WIDE_LOAD_12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP59]], align 4
+; OzVEC2-NEXT:    [[TMP60:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT2]]
+; OzVEC2-NEXT:    [[TMP61:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 48
+; OzVEC2-NEXT:    [[TMP62:%.*]] = bitcast i32* [[TMP61]] to <4 x i32>*
+; OzVEC2-NEXT:    store <4 x i32> [[TMP60]], <4 x i32>* [[TMP62]], align 4
+; OzVEC2-NEXT:    [[TMP63:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 52
+; OzVEC2-NEXT:    [[TMP64:%.*]] = bitcast i32* [[TMP63]] to <4 x i32>*
+; OzVEC2-NEXT:    [[WIDE_LOAD_13:%.*]] = load <4 x i32>, <4 x i32>* [[TMP64]], align 4
+; OzVEC2-NEXT:    [[TMP65:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT2]]
+; OzVEC2-NEXT:    [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 52
+; OzVEC2-NEXT:    [[TMP67:%.*]] = bitcast i32* [[TMP66]] to <4 x i32>*
+; OzVEC2-NEXT:    store <4 x i32> [[TMP65]], <4 x i32>* [[TMP67]], align 4
+; OzVEC2-NEXT:    [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 56
+; OzVEC2-NEXT:    [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>*
+; OzVEC2-NEXT:    [[WIDE_LOAD_14:%.*]] = load <4 x i32>, <4 x i32>* [[TMP69]], align 4
+; OzVEC2-NEXT:    [[TMP70:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT2]]
+; OzVEC2-NEXT:    [[TMP71:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 56
+; OzVEC2-NEXT:    [[TMP72:%.*]] = bitcast i32* [[TMP71]] to <4 x i32>*
+; OzVEC2-NEXT:    store <4 x i32> [[TMP70]], <4 x i32>* [[TMP72]], align 4
+; OzVEC2-NEXT:    [[TMP73:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 60
+; OzVEC2-NEXT:    [[TMP74:%.*]] = bitcast i32* [[TMP73]] to <4 x i32>*
+; OzVEC2-NEXT:    [[WIDE_LOAD_15:%.*]] = load <4 x i32>, <4 x i32>* [[TMP74]], align 4
+; OzVEC2-NEXT:    [[TMP75:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT2]]
+; OzVEC2-NEXT:    [[TMP76:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 60
+; OzVEC2-NEXT:    [[TMP77:%.*]] = bitcast i32* [[TMP76]] to <4 x i32>*
+; OzVEC2-NEXT:    store <4 x i32> [[TMP75]], <4 x i32>* [[TMP77]], align 4
+; OzVEC2-NEXT:    [[TMP78:%.*]] = load i32, i32* [[A]], align 4
+; OzVEC2-NEXT:    ret i32 [[TMP78]]
+;
+; O3DIS-LABEL: @enabled(
+; O3DIS-NEXT:  entry:
+; O3DIS-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> undef, i32 [[N:%.*]], i32 0
+; O3DIS-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> undef, <4 x i32> zeroinitializer
+; O3DIS-NEXT:    [[TMP0:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>*
+; O3DIS-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; O3DIS-NEXT:    [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT2]]
+; O3DIS-NEXT:    [[TMP2:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>*
+; O3DIS-NEXT:    store <4 x i32> [[TMP1]], <4 x i32>* [[TMP2]], align 4
+; O3DIS-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 4
+; O3DIS-NEXT:    [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>*
+; O3DIS-NEXT:    [[WIDE_LOAD_1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4
+; O3DIS-NEXT:    [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT2]]
+; O3DIS-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 4
+; O3DIS-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
+; O3DIS-NEXT:    store <4 x i32> [[TMP5]], <4 x i32>* [[TMP7]], align 4
+; O3DIS-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 8
+; O3DIS-NEXT:    [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>*
+; O3DIS-NEXT:    [[WIDE_LOAD_2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4
+; O3DIS-NEXT:    [[TMP10:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT2]]
+; O3DIS-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 8
+; O3DIS-NEXT:    [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>*
+; O3DIS-NEXT:    store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4
+; O3DIS-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 12
+; O3DIS-NEXT:    [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <4 x i32>*
+; O3DIS-NEXT:    [[WIDE_LOAD_3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP14]], align 4
+; O3DIS-NEXT:    [[TMP15:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT2]]
+; O3DIS-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 12
+; O3DIS-NEXT:    [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <4 x i32>*
+; O3DIS-NEXT:    store <4 x i32> [[TMP15]], <4 x i32>* [[TMP17]], align 4
+; O3DIS-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 16
+; O3DIS-NEXT:    [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>*
+; O3DIS-NEXT:    [[WIDE_LOAD_4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP19]], align 4
+; O3DIS-NEXT:    [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT2]]
+; O3DIS-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 16
+; O3DIS-NEXT:    [[TMP22:%.*]] = bitcast i32* [[TMP21]] to <4 x i32>*
+; O3DIS-NEXT:    store <4 x i32> [[TMP20]], <4 x i32>* [[TMP22]], align 4
+; O3DIS-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 20
+; O3DIS-NEXT:    [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <4 x i32>*
+; O3DIS-NEXT:    [[WIDE_LOAD_5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP24]], align 4
+; O3DIS-NEXT:    [[TMP25:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT2]]
+; O3DIS-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 20
+; O3DIS-NEXT:    [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>*
+; O3DIS-NEXT:    store <4 x i32> [[TMP25]], <4 x i32>* [[TMP27]], align 4
+; O3DIS-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 24
+; O3DIS-NEXT:    [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>*
+; O3DIS-NEXT:    [[WIDE_LOAD_6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP29]], align 4
+; O3DIS-NEXT:    [[TMP30:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT2]]
+; O3DIS-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 24
+; O3DIS-NEXT:    [[TMP32:%.*]] = bitcast i32* [[TMP31]] to <4 x i32>*
+; O3DIS-NEXT:    store <4 x i32> [[TMP30]], <4 x i32>* [[TMP32]], align 4
+; O3DIS-NEXT:    [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 28
+; O3DIS-NEXT:    [[TMP34:%.*]] = bitcast i32* [[TMP33]] to <4 x i32>*
+; O3DIS-NEXT:    [[WIDE_LOAD_7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP34]], align 4
+; O3DIS-NEXT:    [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT2]]
+; O3DIS-NEXT:    [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 28
+; O3DIS-NEXT:    [[TMP37:%.*]] = bitcast i32* [[TMP36]] to <4 x i32>*
+; O3DIS-NEXT:    store <4 x i32> [[TMP35]], <4 x i32>* [[TMP37]], align 4
+; O3DIS-NEXT:    [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 32
+; O3DIS-NEXT:    [[TMP39:%.*]] = bitcast i32* [[TMP38]] to <4 x i32>*
+; O3DIS-NEXT:    [[WIDE_LOAD_8:%.*]] = load <4 x i32>, <4 x i32>* [[TMP39]], align 4
+; O3DIS-NEXT:    [[TMP40:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT2]]
+; O3DIS-NEXT:    [[TMP41:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 32
+; O3DIS-NEXT:    [[TMP42:%.*]] = bitcast i32* [[TMP41]] to <4 x i32>*
+; O3DIS-NEXT:    store <4 x i32> [[TMP40]], <4 x i32>* [[TMP42]], align 4
+; O3DIS-NEXT:    [[TMP43:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 36
+; O3DIS-NEXT:    [[TMP44:%.*]] = bitcast i32* [[TMP43]] to <4 x i32>*
+; O3DIS-NEXT:    [[WIDE_LOAD_9:%.*]] = load <4 x i32>, <4 x i32>* [[TMP44]], align 4
+; O3DIS-NEXT:    [[TMP45:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT2]]
+; O3DIS-NEXT:    [[TMP46:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 36
+; O3DIS-NEXT:    [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <4 x i32>*
+; O3DIS-NEXT:    store <4 x i32> [[TMP45]], <4 x i32>* [[TMP47]], align 4
+; O3DIS-NEXT:    [[TMP48:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 40
+; O3DIS-NEXT:    [[TMP49:%.*]] = bitcast i32* [[TMP48]] to <4 x i32>*
+; O3DIS-NEXT:    [[WIDE_LOAD_10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP49]], align 4
+; O3DIS-NEXT:    [[TMP50:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT2]]
+; O3DIS-NEXT:    [[TMP51:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 40
+; O3DIS-NEXT:    [[TMP52:%.*]] = bitcast i32* [[TMP51]] to <4 x i32>*
+; O3DIS-NEXT:    store <4 x i32> [[TMP50]], <4 x i32>* [[TMP52]], align 4
+; O3DIS-NEXT:    [[TMP53:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 44
+; O3DIS-NEXT:    [[TMP54:%.*]] = bitcast i32* [[TMP53]] to <4 x i32>*
+; O3DIS-NEXT:    [[WIDE_LOAD_11:%.*]] = load <4 x i32>, <4 x i32>* [[TMP54]], align 4
+; O3DIS-NEXT:    [[TMP55:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT2]]
+; O3DIS-NEXT:    [[TMP56:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 44
+; O3DIS-NEXT:    [[TMP57:%.*]] = bitcast i32* [[TMP56]] to <4 x i32>*
+; O3DIS-NEXT:    store <4 x i32> [[TMP55]], <4 x i32>* [[TMP57]], align 4
+; O3DIS-NEXT:    [[TMP58:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 48
+; O3DIS-NEXT:    [[TMP59:%.*]] = bitcast i32* [[TMP58]] to <4 x i32>*
+; O3DIS-NEXT:    [[WIDE_LOAD_12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP59]], align 4
+; O3DIS-NEXT:    [[TMP60:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT2]]
+; O3DIS-NEXT:    [[TMP61:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 48
+; O3DIS-NEXT:    [[TMP62:%.*]] = bitcast i32* [[TMP61]] to <4 x i32>*
+; O3DIS-NEXT:    store <4 x i32> [[TMP60]], <4 x i32>* [[TMP62]], align 4
+; O3DIS-NEXT:    [[TMP63:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 52
+; O3DIS-NEXT:    [[TMP64:%.*]] = bitcast i32* [[TMP63]] to <4 x i32>*
+; O3DIS-NEXT:    [[WIDE_LOAD_13:%.*]] = load <4 x i32>, <4 x i32>* [[TMP64]], align 4
+; O3DIS-NEXT:    [[TMP65:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT2]]
+; O3DIS-NEXT:    [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 52
+; O3DIS-NEXT:    [[TMP67:%.*]] = bitcast i32* [[TMP66]] to <4 x i32>*
+; O3DIS-NEXT:    store <4 x i32> [[TMP65]], <4 x i32>* [[TMP67]], align 4
+; O3DIS-NEXT:    [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 56
+; O3DIS-NEXT:    [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>*
+; O3DIS-NEXT:    [[WIDE_LOAD_14:%.*]] = load <4 x i32>, <4 x i32>* [[TMP69]], align 4
+; O3DIS-NEXT:    [[TMP70:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT2]]
+; O3DIS-NEXT:    [[TMP71:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 56
+; O3DIS-NEXT:    [[TMP72:%.*]] = bitcast i32* [[TMP71]] to <4 x i32>*
+; O3DIS-NEXT:    store <4 x i32> [[TMP70]], <4 x i32>* [[TMP72]], align 4
+; O3DIS-NEXT:    [[TMP73:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 60
+; O3DIS-NEXT:    [[TMP74:%.*]] = bitcast i32* [[TMP73]] to <4 x i32>*
+; O3DIS-NEXT:    [[WIDE_LOAD_15:%.*]] = load <4 x i32>, <4 x i32>* [[TMP74]], align 4
+; O3DIS-NEXT:    [[TMP75:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT2]]
+; O3DIS-NEXT:    [[TMP76:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 60
+; O3DIS-NEXT:    [[TMP77:%.*]] = bitcast i32* [[TMP76]] to <4 x i32>*
+; O3DIS-NEXT:    store <4 x i32> [[TMP75]], <4 x i32>* [[TMP77]], align 4
+; O3DIS-NEXT:    [[TMP78:%.*]] = load i32, i32* [[A]], align 4
+; O3DIS-NEXT:    ret i32 [[TMP78]]
+;
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i32, i32* %b, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %add = add nsw i32 %0, %N
+  %arrayidx2 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+  store i32 %add, i32* %arrayidx2, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 64
+  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:                                          ; preds = %for.body
+  %1 = load i32, i32* %a, align 4
+  ret i32 %1
+}
+
+define i32 @nopragma(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32 %N) {
+; O1-LABEL: @nopragma(
+; O1-NEXT:  entry:
+; O1-NEXT:    br label [[FOR_BODY:%.*]]
+; O1:       for.body:
+; O1-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; O1-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDVARS_IV]]
+; O1-NEXT:    [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; O1-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP0]], [[N:%.*]]
+; O1-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDVARS_IV]]
+; O1-NEXT:    store i32 [[ADD]], i32* [[ARRAYIDX2]], align 4
+; O1-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; O1-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 64
+; O1-NEXT:    br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]]
+; O1:       for.end:
+; O1-NEXT:    [[TMP1:%.*]] = load i32, i32* [[A]], align 4
+; O1-NEXT:    ret i32 [[TMP1]]
+;
+; O2-LABEL: @nopragma(
+; O2-NEXT:  entry:
+; O2-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> undef, i32 [[N:%.*]], i32 0
+; O2-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> undef, <4 x i32> zeroinitializer
+; O2-NEXT:    [[TMP0:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>*
+; O2-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; O2-NEXT:    [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT2]]
+; O2-NEXT:    [[TMP2:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>*
+; O2-NEXT:    store <4 x i32> [[TMP1]], <4 x i32>* [[TMP2]], align 4
+; O2-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 4
+; O2-NEXT:    [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>*
+; O2-NEXT:    [[WIDE_LOAD_1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4
+; O2-NEXT:    [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT2]]
+; O2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 4
+; O2-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
+; O2-NEXT:    store <4 x i32> [[TMP5]], <4 x i32>* [[TMP7]], align 4
+; O2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 8
+; O2-NEXT:    [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>*
+; O2-NEXT:    [[WIDE_LOAD_2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4
+; O2-NEXT:    [[TMP10:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT2]]
+; O2-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 8
+; O2-NEXT:    [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>*
+; O2-NEXT:    store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4
+; O2-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 12
+; O2-NEXT:    [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <4 x i32>*
+; O2-NEXT:    [[WIDE_LOAD_3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP14]], align 4
+; O2-NEXT:    [[TMP15:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT2]]
+; O2-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 12
+; O2-NEXT:    [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <4 x i32>*
+; O2-NEXT:    store <4 x i32> [[TMP15]], <4 x i32>* [[TMP17]], align 4
+; O2-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 16
+; O2-NEXT:    [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>*
+; O2-NEXT:    [[WIDE_LOAD_4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP19]], align 4
+; O2-NEXT:    [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT2]]
+; O2-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 16
+; O2-NEXT:    [[TMP22:%.*]] = bitcast i32* [[TMP21]] to <4 x i32>*
+; O2-NEXT:    store <4 x i32> [[TMP20]], <4 x i32>* [[TMP22]], align 4
+; O2-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 20
+; O2-NEXT:    [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <4 x i32>*
+; O2-NEXT:    [[WIDE_LOAD_5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP24]], align 4
+; O2-NEXT:    [[TMP25:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT2]]
+; O2-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 20
+; O2-NEXT:    [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>*
+; O2-NEXT:    store <4 x i32> [[TMP25]], <4 x i32>* [[TMP27]], align 4
+; O2-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 24
+; O2-NEXT:    [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>*
+; O2-NEXT:    [[WIDE_LOAD_6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP29]], align 4
+; O2-NEXT:    [[TMP30:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT2]]
+; O2-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 24
+; O2-NEXT:    [[TMP32:%.*]] = bitcast i32* [[TMP31]] to <4 x i32>*
+; O2-NEXT:    store <4 x i32> [[TMP30]], <4 x i32>* [[TMP32]], align 4
+; O2-NEXT:    [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 28
+; O2-NEXT:    [[TMP34:%.*]] = bitcast i32* [[TMP33]] to <4 x i32>*
+; O2-NEXT:    [[WIDE_LOAD_7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP34]], align 4
+; O2-NEXT:    [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT2]]
+; O2-NEXT:    [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 28
+; O2-NEXT:    [[TMP37:%.*]] = bitcast i32* [[TMP36]] to <4 x i32>*
+; O2-NEXT:    store <4 x i32> [[TMP35]], <4 x i32>* [[TMP37]], align 4
+; O2-NEXT:    [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 32
+; O2-NEXT:    [[TMP39:%.*]] = bitcast i32* [[TMP38]] to <4 x i32>*
+; O2-NEXT:    [[WIDE_LOAD_8:%.*]] = load <4 x i32>, <4 x i32>* [[TMP39]], align 4
+; O2-NEXT:    [[TMP40:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT2]]
+; O2-NEXT:    [[TMP41:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 32
+; O2-NEXT:    [[TMP42:%.*]] = bitcast i32* [[TMP41]] to <4 x i32>*
+; O2-NEXT:    store <4 x i32> [[TMP40]], <4 x i32>* [[TMP42]], align 4
+; O2-NEXT:    [[TMP43:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 36
+; O2-NEXT:    [[TMP44:%.*]] = bitcast i32* [[TMP43]] to <4 x i32>*
+; O2-NEXT:    [[WIDE_LOAD_9:%.*]] = load <4 x i32>, <4 x i32>* [[TMP44]], align 4
+; O2-NEXT:    [[TMP45:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT2]]
+; O2-NEXT:    [[TMP46:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 36
+; O2-NEXT:    [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <4 x i32>*
+; O2-NEXT:    store <4 x i32> [[TMP45]], <4 x i32>* [[TMP47]], align 4
+; O2-NEXT:    [[TMP48:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 40
+; O2-NEXT:    [[TMP49:%.*]] = bitcast i32* [[TMP48]] to <4 x i32>*
+; O2-NEXT:    [[WIDE_LOAD_10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP49]], align 4
+; O2-NEXT:    [[TMP50:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT2]]
+; O2-NEXT:    [[TMP51:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 40
+; O2-NEXT:    [[TMP52:%.*]] = bitcast i32* [[TMP51]] to <4 x i32>*
+; O2-NEXT:    store <4 x i32> [[TMP50]], <4 x i32>* [[TMP52]], align 4
+; O2-NEXT:    [[TMP53:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 44
+; O2-NEXT:    [[TMP54:%.*]] = bitcast i32* [[TMP53]] to <4 x i32>*
+; O2-NEXT:    [[WIDE_LOAD_11:%.*]] = load <4 x i32>, <4 x i32>* [[TMP54]], align 4
+; O2-NEXT:    [[TMP55:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT2]]
+; O2-NEXT:    [[TMP56:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 44
+; O2-NEXT:    [[TMP57:%.*]] = bitcast i32* [[TMP56]] to <4 x i32>*
+; O2-NEXT:    store <4 x i32> [[TMP55]], <4 x i32>* [[TMP57]], align 4
+; O2-NEXT:    [[TMP58:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 48
+; O2-NEXT:    [[TMP59:%.*]] = bitcast i32* [[TMP58]] to <4 x i32>*
+; O2-NEXT:    [[WIDE_LOAD_12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP59]], align 4
+; O2-NEXT:    [[TMP60:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT2]]
+; O2-NEXT:    [[TMP61:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 48
+; O2-NEXT:    [[TMP62:%.*]] = bitcast i32* [[TMP61]] to <4 x i32>*
+; O2-NEXT:    store <4 x i32> [[TMP60]], <4 x i32>* [[TMP62]], align 4
+; O2-NEXT:    [[TMP63:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 52
+; O2-NEXT:    [[TMP64:%.*]] = bitcast i32* [[TMP63]] to <4 x i32>*
+; O2-NEXT:    [[WIDE_LOAD_13:%.*]] = load <4 x i32>, <4 x i32>* [[TMP64]], align 4
+; O2-NEXT:    [[TMP65:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT2]]
+; O2-NEXT:    [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 52
+; O2-NEXT:    [[TMP67:%.*]] = bitcast i32* [[TMP66]] to <4 x i32>*
+; O2-NEXT:    store <4 x i32> [[TMP65]], <4 x i32>* [[TMP67]], align 4
+; O2-NEXT:    [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 56
+; O2-NEXT:    [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>*
+; O2-NEXT:    [[WIDE_LOAD_14:%.*]] = load <4 x i32>, <4 x i32>* [[TMP69]], align 4
+; O2-NEXT:    [[TMP70:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT2]]
+; O2-NEXT:    [[TMP71:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 56
+; O2-NEXT:    [[TMP72:%.*]] = bitcast i32* [[TMP71]] to <4 x i32>*
+; O2-NEXT:    store <4 x i32> [[TMP70]], <4 x i32>* [[TMP72]], align 4
+; O2-NEXT:    [[TMP73:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 60
+; O2-NEXT:    [[TMP74:%.*]] = bitcast i32* [[TMP73]] to <4 x i32>*
+; O2-NEXT:    [[WIDE_LOAD_15:%.*]] = load <4 x i32>, <4 x i32>* [[TMP74]], align 4
+; O2-NEXT:    [[TMP75:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT2]]
+; O2-NEXT:    [[TMP76:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 60
+; O2-NEXT:    [[TMP77:%.*]] = bitcast i32* [[TMP76]] to <4 x i32>*
+; O2-NEXT:    store <4 x i32> [[TMP75]], <4 x i32>* [[TMP77]], align 4
+; O2-NEXT:    [[TMP78:%.*]] = load i32, i32* [[A]], align 4
+; O2-NEXT:    ret i32 [[TMP78]]
+;
+; O3-LABEL: @nopragma(
+; O3-NEXT:  entry:
+; O3-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> undef, i32 [[N:%.*]], i32 0
+; O3-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> undef, <4 x i32> zeroinitializer
+; O3-NEXT:    [[TMP0:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>*
+; O3-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; O3-NEXT:    [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT2]]
+; O3-NEXT:    [[TMP2:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>*
+; O3-NEXT:    store <4 x i32> [[TMP1]], <4 x i32>* [[TMP2]], align 4
+; O3-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 4
+; O3-NEXT:    [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>*
+; O3-NEXT:    [[WIDE_LOAD_1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4
+; O3-NEXT:    [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT2]]
+; O3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 4
+; O3-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
+; O3-NEXT:    store <4 x i32> [[TMP5]], <4 x i32>* [[TMP7]], align 4
+; O3-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 8
+; O3-NEXT:    [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>*
+; O3-NEXT:    [[WIDE_LOAD_2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4
+; O3-NEXT:    [[TMP10:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT2]]
+; O3-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 8
+; O3-NEXT:    [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>*
+; O3-NEXT:    store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4
+; O3-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 12
+; O3-NEXT:    [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <4 x i32>*
+; O3-NEXT:    [[WIDE_LOAD_3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP14]], align 4
+; O3-NEXT:    [[TMP15:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT2]]
+; O3-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 12
+; O3-NEXT:    [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <4 x i32>*
+; O3-NEXT:    store <4 x i32> [[TMP15]], <4 x i32>* [[TMP17]], align 4
+; O3-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 16
+; O3-NEXT:    [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>*
+; O3-NEXT:    [[WIDE_LOAD_4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP19]], align 4
+; O3-NEXT:    [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT2]]
+; O3-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 16
+; O3-NEXT:    [[TMP22:%.*]] = bitcast i32* [[TMP21]] to <4 x i32>*
+; O3-NEXT:    store <4 x i32> [[TMP20]], <4 x i32>* [[TMP22]], align 4
+; O3-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 20
+; O3-NEXT:    [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <4 x i32>*
+; O3-NEXT:    [[WIDE_LOAD_5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP24]], align 4
+; O3-NEXT:    [[TMP25:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT2]]
+; O3-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 20
+; O3-NEXT:    [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>*
+; O3-NEXT:    store <4 x i32> [[TMP25]], <4 x i32>* [[TMP27]], align 4
+; O3-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 24
+; O3-NEXT:    [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>*
+; O3-NEXT:    [[WIDE_LOAD_6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP29]], align 4
+; O3-NEXT:    [[TMP30:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT2]]
+; O3-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 24
+; O3-NEXT:    [[TMP32:%.*]] = bitcast i32* [[TMP31]] to <4 x i32>*
+; O3-NEXT:    store <4 x i32> [[TMP30]], <4 x i32>* [[TMP32]], align 4
+; O3-NEXT:    [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 28
+; O3-NEXT:    [[TMP34:%.*]] = bitcast i32* [[TMP33]] to <4 x i32>*
+; O3-NEXT:    [[WIDE_LOAD_7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP34]], align 4
+; O3-NEXT:    [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT2]]
+; O3-NEXT:    [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 28
+; O3-NEXT:    [[TMP37:%.*]] = bitcast i32* [[TMP36]] to <4 x i32>*
+; O3-NEXT:    store <4 x i32> [[TMP35]], <4 x i32>* [[TMP37]], align 4
+; O3-NEXT:    [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 32
+; O3-NEXT:    [[TMP39:%.*]] = bitcast i32* [[TMP38]] to <4 x i32>*
+; O3-NEXT:    [[WIDE_LOAD_8:%.*]] = load <4 x i32>, <4 x i32>* [[TMP39]], align 4
+; O3-NEXT:    [[TMP40:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT2]]
+; O3-NEXT:    [[TMP41:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 32
+; O3-NEXT:    [[TMP42:%.*]] = bitcast i32* [[TMP41]] to <4 x i32>*
+; O3-NEXT:    store <4 x i32> [[TMP40]], <4 x i32>* [[TMP42]], align 4
+; O3-NEXT:    [[TMP43:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 36
+; O3-NEXT:    [[TMP44:%.*]] = bitcast i32* [[TMP43]] to <4 x i32>*
+; O3-NEXT:    [[WIDE_LOAD_9:%.*]] = load <4 x i32>, <4 x i32>* [[TMP44]], align 4
+; O3-NEXT:    [[TMP45:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT2]]
+; O3-NEXT:    [[TMP46:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 36
+; O3-NEXT:    [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <4 x i32>*
+; O3-NEXT:    store <4 x i32> [[TMP45]], <4 x i32>* [[TMP47]], align 4
+; O3-NEXT:    [[TMP48:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 40
+; O3-NEXT:    [[TMP49:%.*]] = bitcast i32* [[TMP48]] to <4 x i32>*
+; O3-NEXT:    [[WIDE_LOAD_10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP49]], align 4
+; O3-NEXT:    [[TMP50:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT2]]
+; O3-NEXT:    [[TMP51:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 40
+; O3-NEXT:    [[TMP52:%.*]] = bitcast i32* [[TMP51]] to <4 x i32>*
+; O3-NEXT:    store <4 x i32> [[TMP50]], <4 x i32>* [[TMP52]], align 4
+; O3-NEXT:    [[TMP53:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 44
+; O3-NEXT:    [[TMP54:%.*]] = bitcast i32* [[TMP53]] to <4 x i32>*
+; O3-NEXT:    [[WIDE_LOAD_11:%.*]] = load <4 x i32>, <4 x i32>* [[TMP54]], align 4
+; O3-NEXT:    [[TMP55:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT2]]
+; O3-NEXT:    [[TMP56:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 44
+; O3-NEXT:    [[TMP57:%.*]] = bitcast i32* [[TMP56]] to <4 x i32>*
+; O3-NEXT:    store <4 x i32> [[TMP55]], <4 x i32>* [[TMP57]], align 4
+; O3-NEXT:    [[TMP58:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 48
+; O3-NEXT:    [[TMP59:%.*]] = bitcast i32* [[TMP58]] to <4 x i32>*
+; O3-NEXT:    [[WIDE_LOAD_12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP59]], align 4
+; O3-NEXT:    [[TMP60:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT2]]
+; O3-NEXT:    [[TMP61:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 48
+; O3-NEXT:    [[TMP62:%.*]] = bitcast i32* [[TMP61]] to <4 x i32>*
+; O3-NEXT:    store <4 x i32> [[TMP60]], <4 x i32>* [[TMP62]], align 4
+; O3-NEXT:    [[TMP63:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 52
+; O3-NEXT:    [[TMP64:%.*]] = bitcast i32* [[TMP63]] to <4 x i32>*
+; O3-NEXT:    [[WIDE_LOAD_13:%.*]] = load <4 x i32>, <4 x i32>* [[TMP64]], align 4
+; O3-NEXT:    [[TMP65:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT2]]
+; O3-NEXT:    [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 52
+; O3-NEXT:    [[TMP67:%.*]] = bitcast i32* [[TMP66]] to <4 x i32>*
+; O3-NEXT:    store <4 x i32> [[TMP65]], <4 x i32>* [[TMP67]], align 4
+; O3-NEXT:    [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 56
+; O3-NEXT:    [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>*
+; O3-NEXT:    [[WIDE_LOAD_14:%.*]] = load <4 x i32>, <4 x i32>* [[TMP69]], align 4
+; O3-NEXT:    [[TMP70:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT2]]
+; O3-NEXT:    [[TMP71:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 56
+; O3-NEXT:    [[TMP72:%.*]] = bitcast i32* [[TMP71]] to <4 x i32>*
+; O3-NEXT:    store <4 x i32> [[TMP70]], <4 x i32>* [[TMP72]], align 4
+; O3-NEXT:    [[TMP73:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 60
+; O3-NEXT:    [[TMP74:%.*]] = bitcast i32* [[TMP73]] to <4 x i32>*
+; O3-NEXT:    [[WIDE_LOAD_15:%.*]] = load <4 x i32>, <4 x i32>* [[TMP74]], align 4
+; O3-NEXT:    [[TMP75:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT2]]
+; O3-NEXT:    [[TMP76:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 60
+; O3-NEXT:    [[TMP77:%.*]] = bitcast i32* [[TMP76]] to <4 x i32>*
+; O3-NEXT:    store <4 x i32> [[TMP75]], <4 x i32>* [[TMP77]], align 4
+; O3-NEXT:    [[TMP78:%.*]] = load i32, i32* [[A]], align 4
+; O3-NEXT:    ret i32 [[TMP78]]
+;
+; O3DEFAULT-LABEL: @nopragma(
+; O3DEFAULT-NEXT:  entry:
+; O3DEFAULT-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> undef, i32 [[N:%.*]], i32 0
+; O3DEFAULT-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> undef, <4 x i32> zeroinitializer
+; O3DEFAULT-NEXT:    [[TMP0:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>*
+; O3DEFAULT-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; O3DEFAULT-NEXT:    [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT2]]
+; O3DEFAULT-NEXT:    [[TMP2:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>*
+; O3DEFAULT-NEXT:    store <4 x i32> [[TMP1]], <4 x i32>* [[TMP2]], align 4
+; O3DEFAULT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 4
+; O3DEFAULT-NEXT:    [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>*
+; O3DEFAULT-NEXT:    [[WIDE_LOAD_1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4
+; O3DEFAULT-NEXT:    [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT2]]
+; O3DEFAULT-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 4
+; O3DEFAULT-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
+; O3DEFAULT-NEXT:    store <4 x i32> [[TMP5]], <4 x i32>* [[TMP7]], align 4
+; O3DEFAULT-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 8
+; O3DEFAULT-NEXT:    [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>*
+; O3DEFAULT-NEXT:    [[WIDE_LOAD_2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4
+; O3DEFAULT-NEXT:    [[TMP10:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT2]]
+; O3DEFAULT-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 8
+; O3DEFAULT-NEXT:    [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>*
+; O3DEFAULT-NEXT:    store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4
+; O3DEFAULT-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 12
+; O3DEFAULT-NEXT:    [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <4 x i32>*
+; O3DEFAULT-NEXT:    [[WIDE_LOAD_3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP14]], align 4
+; O3DEFAULT-NEXT:    [[TMP15:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT2]]
+; O3DEFAULT-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 12
+; O3DEFAULT-NEXT:    [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <4 x i32>*
+; O3DEFAULT-NEXT:    store <4 x i32> [[TMP15]], <4 x i32>* [[TMP17]], align 4
+; O3DEFAULT-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 16
+; O3DEFAULT-NEXT:    [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>*
+; O3DEFAULT-NEXT:    [[WIDE_LOAD_4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP19]], align 4
+; O3DEFAULT-NEXT:    [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT2]]
+; O3DEFAULT-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 16
+; O3DEFAULT-NEXT:    [[TMP22:%.*]] = bitcast i32* [[TMP21]] to <4 x i32>*
+; O3DEFAULT-NEXT:    store <4 x i32> [[TMP20]], <4 x i32>* [[TMP22]], align 4
+; O3DEFAULT-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 20
+; O3DEFAULT-NEXT:    [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <4 x i32>*
+; O3DEFAULT-NEXT:    [[WIDE_LOAD_5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP24]], align 4
+; O3DEFAULT-NEXT:    [[TMP25:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT2]]
+; O3DEFAULT-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 20
+; O3DEFAULT-NEXT:    [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>*
+; O3DEFAULT-NEXT:    store <4 x i32> [[TMP25]], <4 x i32>* [[TMP27]], align 4
+; O3DEFAULT-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 24
+; O3DEFAULT-NEXT:    [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>*
+; O3DEFAULT-NEXT:    [[WIDE_LOAD_6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP29]], align 4
+; O3DEFAULT-NEXT:    [[TMP30:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT2]]
+; O3DEFAULT-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 24
+; O3DEFAULT-NEXT:    [[TMP32:%.*]] = bitcast i32* [[TMP31]] to <4 x i32>*
+; O3DEFAULT-NEXT:    store <4 x i32> [[TMP30]], <4 x i32>* [[TMP32]], align 4
+; O3DEFAULT-NEXT:    [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 28
+; O3DEFAULT-NEXT:    [[TMP34:%.*]] = bitcast i32* [[TMP33]] to <4 x i32>*
+; O3DEFAULT-NEXT:    [[WIDE_LOAD_7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP34]], align 4
+; O3DEFAULT-NEXT:    [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT2]]
+; O3DEFAULT-NEXT:    [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 28
+; O3DEFAULT-NEXT:    [[TMP37:%.*]] = bitcast i32* [[TMP36]] to <4 x i32>*
+; O3DEFAULT-NEXT:    store <4 x i32> [[TMP35]], <4 x i32>* [[TMP37]], align 4
+; O3DEFAULT-NEXT:    [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 32
+; O3DEFAULT-NEXT:    [[TMP39:%.*]] = bitcast i32* [[TMP38]] to <4 x i32>*
+; O3DEFAULT-NEXT:    [[WIDE_LOAD_8:%.*]] = load <4 x i32>, <4 x i32>* [[TMP39]], align 4
+; O3DEFAULT-NEXT:    [[TMP40:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT2]]
+; O3DEFAULT-NEXT:    [[TMP41:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 32
+; O3DEFAULT-NEXT:    [[TMP42:%.*]] = bitcast i32* [[TMP41]] to <4 x i32>*
+; O3DEFAULT-NEXT:    store <4 x i32> [[TMP40]], <4 x i32>* [[TMP42]], align 4
+; O3DEFAULT-NEXT:    [[TMP43:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 36
+; O3DEFAULT-NEXT:    [[TMP44:%.*]] = bitcast i32* [[TMP43]] to <4 x i32>*
+; O3DEFAULT-NEXT:    [[WIDE_LOAD_9:%.*]] = load <4 x i32>, <4 x i32>* [[TMP44]], align 4
+; O3DEFAULT-NEXT:    [[TMP45:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT2]]
+; O3DEFAULT-NEXT:    [[TMP46:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 36
+; O3DEFAULT-NEXT:    [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <4 x i32>*
+; O3DEFAULT-NEXT:    store <4 x i32> [[TMP45]], <4 x i32>* [[TMP47]], align 4
+; O3DEFAULT-NEXT:    [[TMP48:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 40
+; O3DEFAULT-NEXT:    [[TMP49:%.*]] = bitcast i32* [[TMP48]] to <4 x i32>*
+; O3DEFAULT-NEXT:    [[WIDE_LOAD_10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP49]], align 4
+; O3DEFAULT-NEXT:    [[TMP50:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT2]]
+; O3DEFAULT-NEXT:    [[TMP51:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 40
+; O3DEFAULT-NEXT:    [[TMP52:%.*]] = bitcast i32* [[TMP51]] to <4 x i32>*
+; O3DEFAULT-NEXT:    store <4 x i32> [[TMP50]], <4 x i32>* [[TMP52]], align 4
+; O3DEFAULT-NEXT:    [[TMP53:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 44
+; O3DEFAULT-NEXT:    [[TMP54:%.*]] = bitcast i32* [[TMP53]] to <4 x i32>*
+; O3DEFAULT-NEXT:    [[WIDE_LOAD_11:%.*]] = load <4 x i32>, <4 x i32>* [[TMP54]], align 4
+; O3DEFAULT-NEXT:    [[TMP55:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT2]]
+; O3DEFAULT-NEXT:    [[TMP56:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 44
+; O3DEFAULT-NEXT:    [[TMP57:%.*]] = bitcast i32* [[TMP56]] to <4 x i32>*
+; O3DEFAULT-NEXT:    store <4 x i32> [[TMP55]], <4 x i32>* [[TMP57]], align 4
+; O3DEFAULT-NEXT:    [[TMP58:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 48
+; O3DEFAULT-NEXT:    [[TMP59:%.*]] = bitcast i32* [[TMP58]] to <4 x i32>*
+; O3DEFAULT-NEXT:    [[WIDE_LOAD_12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP59]], align 4
+; O3DEFAULT-NEXT:    [[TMP60:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT2]]
+; O3DEFAULT-NEXT:    [[TMP61:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 48
+; O3DEFAULT-NEXT:    [[TMP62:%.*]] = bitcast i32* [[TMP61]] to <4 x i32>*
+; O3DEFAULT-NEXT:    store <4 x i32> [[TMP60]], <4 x i32>* [[TMP62]], align 4
+; O3DEFAULT-NEXT:    [[TMP63:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 52
+; O3DEFAULT-NEXT:    [[TMP64:%.*]] = bitcast i32* [[TMP63]] to <4 x i32>*
+; O3DEFAULT-NEXT:    [[WIDE_LOAD_13:%.*]] = load <4 x i32>, <4 x i32>* [[TMP64]], align 4
+; O3DEFAULT-NEXT:    [[TMP65:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT2]]
+; O3DEFAULT-NEXT:    [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 52
+; O3DEFAULT-NEXT:    [[TMP67:%.*]] = bitcast i32* [[TMP66]] to <4 x i32>*
+; O3DEFAULT-NEXT:    store <4 x i32> [[TMP65]], <4 x i32>* [[TMP67]], align 4
+; O3DEFAULT-NEXT:    [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 56
+; O3DEFAULT-NEXT:    [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>*
+; O3DEFAULT-NEXT:    [[WIDE_LOAD_14:%.*]] = load <4 x i32>, <4 x i32>* [[TMP69]], align 4
+; O3DEFAULT-NEXT:    [[TMP70:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT2]]
+; O3DEFAULT-NEXT:    [[TMP71:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 56
+; O3DEFAULT-NEXT:    [[TMP72:%.*]] = bitcast i32* [[TMP71]] to <4 x i32>*
+; O3DEFAULT-NEXT:    store <4 x i32> [[TMP70]], <4 x i32>* [[TMP72]], align 4
+; O3DEFAULT-NEXT:    [[TMP73:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 60
+; O3DEFAULT-NEXT:    [[TMP74:%.*]] = bitcast i32* [[TMP73]] to <4 x i32>*
+; O3DEFAULT-NEXT:    [[WIDE_LOAD_15:%.*]] = load <4 x i32>, <4 x i32>* [[TMP74]], align 4
+; O3DEFAULT-NEXT:    [[TMP75:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT2]]
+; O3DEFAULT-NEXT:    [[TMP76:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 60
+; O3DEFAULT-NEXT:    [[TMP77:%.*]] = bitcast i32* [[TMP76]] to <4 x i32>*
+; O3DEFAULT-NEXT:    store <4 x i32> [[TMP75]], <4 x i32>* [[TMP77]], align 4
+; O3DEFAULT-NEXT:    [[TMP78:%.*]] = load i32, i32* [[A]], align 4
+; O3DEFAULT-NEXT:    ret i32 [[TMP78]]
+;
+; Os-LABEL: @nopragma(
+; Os-NEXT:  entry:
+; Os-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> undef, i32 [[N:%.*]], i32 0
+; Os-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> undef, <4 x i32> zeroinitializer
+; Os-NEXT:    [[TMP0:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>*
+; Os-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; Os-NEXT:    [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT2]]
+; Os-NEXT:    [[TMP2:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>*
+; Os-NEXT:    store <4 x i32> [[TMP1]], <4 x i32>* [[TMP2]], align 4
+; Os-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 4
+; Os-NEXT:    [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>*
+; Os-NEXT:    [[WIDE_LOAD_1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4
+; Os-NEXT:    [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT2]]
+; Os-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 4
+; Os-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
+; Os-NEXT:    store <4 x i32> [[TMP5]], <4 x i32>* [[TMP7]], align 4
+; Os-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 8
+; Os-NEXT:    [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>*
+; Os-NEXT:    [[WIDE_LOAD_2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4
+; Os-NEXT:    [[TMP10:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT2]]
+; Os-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 8
+; Os-NEXT:    [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>*
+; Os-NEXT:    store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4
+; Os-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 12
+; Os-NEXT:    [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <4 x i32>*
+; Os-NEXT:    [[WIDE_LOAD_3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP14]], align 4
+; Os-NEXT:    [[TMP15:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT2]]
+; Os-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 12
+; Os-NEXT:    [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <4 x i32>*
+; Os-NEXT:    store <4 x i32> [[TMP15]], <4 x i32>* [[TMP17]], align 4
+; Os-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 16
+; Os-NEXT:    [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>*
+; Os-NEXT:    [[WIDE_LOAD_4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP19]], align 4
+; Os-NEXT:    [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT2]]
+; Os-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 16
+; Os-NEXT:    [[TMP22:%.*]] = bitcast i32* [[TMP21]] to <4 x i32>*
+; Os-NEXT:    store <4 x i32> [[TMP20]], <4 x i32>* [[TMP22]], align 4
+; Os-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 20
+; Os-NEXT:    [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <4 x i32>*
+; Os-NEXT:    [[WIDE_LOAD_5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP24]], align 4
+; Os-NEXT:    [[TMP25:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT2]]
+; Os-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 20
+; Os-NEXT:    [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>*
+; Os-NEXT:    store <4 x i32> [[TMP25]], <4 x i32>* [[TMP27]], align 4
+; Os-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 24
+; Os-NEXT:    [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>*
+; Os-NEXT:    [[WIDE_LOAD_6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP29]], align 4
+; Os-NEXT:    [[TMP30:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT2]]
+; Os-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 24
+; Os-NEXT:    [[TMP32:%.*]] = bitcast i32* [[TMP31]] to <4 x i32>*
+; Os-NEXT:    store <4 x i32> [[TMP30]], <4 x i32>* [[TMP32]], align 4
+; Os-NEXT:    [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 28
+; Os-NEXT:    [[TMP34:%.*]] = bitcast i32* [[TMP33]] to <4 x i32>*
+; Os-NEXT:    [[WIDE_LOAD_7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP34]], align 4
+; Os-NEXT:    [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT2]]
+; Os-NEXT:    [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 28
+; Os-NEXT:    [[TMP37:%.*]] = bitcast i32* [[TMP36]] to <4 x i32>*
+; Os-NEXT:    store <4 x i32> [[TMP35]], <4 x i32>* [[TMP37]], align 4
+; Os-NEXT:    [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 32
+; Os-NEXT:    [[TMP39:%.*]] = bitcast i32* [[TMP38]] to <4 x i32>*
+; Os-NEXT:    [[WIDE_LOAD_8:%.*]] = load <4 x i32>, <4 x i32>* [[TMP39]], align 4
+; Os-NEXT:    [[TMP40:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT2]]
+; Os-NEXT:    [[TMP41:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 32
+; Os-NEXT:    [[TMP42:%.*]] = bitcast i32* [[TMP41]] to <4 x i32>*
+; Os-NEXT:    store <4 x i32> [[TMP40]], <4 x i32>* [[TMP42]], align 4
+; Os-NEXT:    [[TMP43:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 36
+; Os-NEXT:    [[TMP44:%.*]] = bitcast i32* [[TMP43]] to <4 x i32>*
+; Os-NEXT:    [[WIDE_LOAD_9:%.*]] = load <4 x i32>, <4 x i32>* [[TMP44]], align 4
+; Os-NEXT:    [[TMP45:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT2]]
+; Os-NEXT:    [[TMP46:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 36
+; Os-NEXT:    [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <4 x i32>*
+; Os-NEXT:    store <4 x i32> [[TMP45]], <4 x i32>* [[TMP47]], align 4
+; Os-NEXT:    [[TMP48:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 40
+; Os-NEXT:    [[TMP49:%.*]] = bitcast i32* [[TMP48]] to <4 x i32>*
+; Os-NEXT:    [[WIDE_LOAD_10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP49]], align 4
+; Os-NEXT:    [[TMP50:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT2]]
+; Os-NEXT:    [[TMP51:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 40
+; Os-NEXT:    [[TMP52:%.*]] = bitcast i32* [[TMP51]] to <4 x i32>*
+; Os-NEXT:    store <4 x i32> [[TMP50]], <4 x i32>* [[TMP52]], align 4
+; Os-NEXT:    [[TMP53:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 44
+; Os-NEXT:    [[TMP54:%.*]] = bitcast i32* [[TMP53]] to <4 x i32>*
+; Os-NEXT:    [[WIDE_LOAD_11:%.*]] = load <4 x i32>, <4 x i32>* [[TMP54]], align 4
+; Os-NEXT:    [[TMP55:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT2]]
+; Os-NEXT:    [[TMP56:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 44
+; Os-NEXT:    [[TMP57:%.*]] = bitcast i32* [[TMP56]] to <4 x i32>*
+; Os-NEXT:    store <4 x i32> [[TMP55]], <4 x i32>* [[TMP57]], align 4
+; Os-NEXT:    [[TMP58:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 48
+; Os-NEXT:    [[TMP59:%.*]] = bitcast i32* [[TMP58]] to <4 x i32>*
+; Os-NEXT:    [[WIDE_LOAD_12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP59]], align 4
+; Os-NEXT:    [[TMP60:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT2]]
+; Os-NEXT:    [[TMP61:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 48
+; Os-NEXT:    [[TMP62:%.*]] = bitcast i32* [[TMP61]] to <4 x i32>*
+; Os-NEXT:    store <4 x i32> [[TMP60]], <4 x i32>* [[TMP62]], align 4
+; Os-NEXT:    [[TMP63:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 52
+; Os-NEXT:    [[TMP64:%.*]] = bitcast i32* [[TMP63]] to <4 x i32>*
+; Os-NEXT:    [[WIDE_LOAD_13:%.*]] = load <4 x i32>, <4 x i32>* [[TMP64]], align 4
+; Os-NEXT:    [[TMP65:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT2]]
+; Os-NEXT:    [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 52
+; Os-NEXT:    [[TMP67:%.*]] = bitcast i32* [[TMP66]] to <4 x i32>*
+; Os-NEXT:    store <4 x i32> [[TMP65]], <4 x i32>* [[TMP67]], align 4
+; Os-NEXT:    [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 56
+; Os-NEXT:    [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>*
+; Os-NEXT:    [[WIDE_LOAD_14:%.*]] = load <4 x i32>, <4 x i32>* [[TMP69]], align 4
+; Os-NEXT:    [[TMP70:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT2]]
+; Os-NEXT:    [[TMP71:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 56
+; Os-NEXT:    [[TMP72:%.*]] = bitcast i32* [[TMP71]] to <4 x i32>*
+; Os-NEXT:    store <4 x i32> [[TMP70]], <4 x i32>* [[TMP72]], align 4
+; Os-NEXT:    [[TMP73:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 60
+; Os-NEXT:    [[TMP74:%.*]] = bitcast i32* [[TMP73]] to <4 x i32>*
+; Os-NEXT:    [[WIDE_LOAD_15:%.*]] = load <4 x i32>, <4 x i32>* [[TMP74]], align 4
+; Os-NEXT:    [[TMP75:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT2]]
+; Os-NEXT:    [[TMP76:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 60
+; Os-NEXT:    [[TMP77:%.*]] = bitcast i32* [[TMP76]] to <4 x i32>*
+; Os-NEXT:    store <4 x i32> [[TMP75]], <4 x i32>* [[TMP77]], align 4
+; Os-NEXT:    [[TMP78:%.*]] = load i32, i32* [[A]], align 4
+; Os-NEXT:    ret i32 [[TMP78]]
+;
+; Oz-LABEL: @nopragma(
+; Oz-NEXT:  entry:
+; Oz-NEXT:    br label [[FOR_BODY:%.*]]
+; Oz:       for.body:
+; Oz-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; Oz-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDVARS_IV]]
+; Oz-NEXT:    [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; Oz-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP0]], [[N:%.*]]
+; Oz-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDVARS_IV]]
+; Oz-NEXT:    store i32 [[ADD]], i32* [[ARRAYIDX2]], align 4
+; Oz-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; Oz-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 64
+; Oz-NEXT:    br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]]
+; Oz:       for.end:
+; Oz-NEXT:    [[TMP1:%.*]] = load i32, i32* [[A]], align 4
+; Oz-NEXT:    ret i32 [[TMP1]]
+;
+; O1VEC-LABEL: @nopragma(
+; O1VEC-NEXT:  entry:
+; O1VEC-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> undef, i32 [[N:%.*]], i32 0
+; O1VEC-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> undef, <4 x i32> zeroinitializer
+; O1VEC-NEXT:    [[TMP0:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>*
+; O1VEC-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; O1VEC-NEXT:    [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT2]]
+; O1VEC-NEXT:    [[TMP2:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>*
+; O1VEC-NEXT:    store <4 x i32> [[TMP1]], <4 x i32>* [[TMP2]], align 4
+; O1VEC-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 4
+; O1VEC-NEXT:    [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>*
+; O1VEC-NEXT:    [[WIDE_LOAD_1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4
+; O1VEC-NEXT:    [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT2]]
+; O1VEC-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 4
+; O1VEC-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
+; O1VEC-NEXT:    store <4 x i32> [[TMP5]], <4 x i32>* [[TMP7]], align 4
+; O1VEC-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 8
+; O1VEC-NEXT:    [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>*
+; O1VEC-NEXT:    [[WIDE_LOAD_2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4
+; O1VEC-NEXT:    [[TMP10:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT2]]
+; O1VEC-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 8
+; O1VEC-NEXT:    [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>*
+; O1VEC-NEXT:    store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4
+; O1VEC-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 12
+; O1VEC-NEXT:    [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <4 x i32>*
+; O1VEC-NEXT:    [[WIDE_LOAD_3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP14]], align 4
+; O1VEC-NEXT:    [[TMP15:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT2]]
+; O1VEC-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 12
+; O1VEC-NEXT:    [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <4 x i32>*
+; O1VEC-NEXT:    store <4 x i32> [[TMP15]], <4 x i32>* [[TMP17]], align 4
+; O1VEC-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 16
+; O1VEC-NEXT:    [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>*
+; O1VEC-NEXT:    [[WIDE_LOAD_4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP19]], align 4
+; O1VEC-NEXT:    [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT2]]
+; O1VEC-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 16
+; O1VEC-NEXT:    [[TMP22:%.*]] = bitcast i32* [[TMP21]] to <4 x i32>*
+; O1VEC-NEXT:    store <4 x i32> [[TMP20]], <4 x i32>* [[TMP22]], align 4
+; O1VEC-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 20
+; O1VEC-NEXT:    [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <4 x i32>*
+; O1VEC-NEXT:    [[WIDE_LOAD_5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP24]], align 4
+; O1VEC-NEXT:    [[TMP25:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT2]]
+; O1VEC-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 20
+; O1VEC-NEXT:    [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>*
+; O1VEC-NEXT:    store <4 x i32> [[TMP25]], <4 x i32>* [[TMP27]], align 4
+; O1VEC-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 24
+; O1VEC-NEXT:    [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>*
+; O1VEC-NEXT:    [[WIDE_LOAD_6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP29]], align 4
+; O1VEC-NEXT:    [[TMP30:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT2]]
+; O1VEC-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 24
+; O1VEC-NEXT:    [[TMP32:%.*]] = bitcast i32* [[TMP31]] to <4 x i32>*
+; O1VEC-NEXT:    store <4 x i32> [[TMP30]], <4 x i32>* [[TMP32]], align 4
+; O1VEC-NEXT:    [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 28
+; O1VEC-NEXT:    [[TMP34:%.*]] = bitcast i32* [[TMP33]] to <4 x i32>*
+; O1VEC-NEXT:    [[WIDE_LOAD_7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP34]], align 4
+; O1VEC-NEXT:    [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT2]]
+; O1VEC-NEXT:    [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 28
+; O1VEC-NEXT:    [[TMP37:%.*]] = bitcast i32* [[TMP36]] to <4 x i32>*
+; O1VEC-NEXT:    store <4 x i32> [[TMP35]], <4 x i32>* [[TMP37]], align 4
+; O1VEC-NEXT:    [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 32
+; O1VEC-NEXT:    [[TMP39:%.*]] = bitcast i32* [[TMP38]] to <4 x i32>*
+; O1VEC-NEXT:    [[WIDE_LOAD_8:%.*]] = load <4 x i32>, <4 x i32>* [[TMP39]], align 4
+; O1VEC-NEXT:    [[TMP40:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT2]]
+; O1VEC-NEXT:    [[TMP41:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 32
+; O1VEC-NEXT:    [[TMP42:%.*]] = bitcast i32* [[TMP41]] to <4 x i32>*
+; O1VEC-NEXT:    store <4 x i32> [[TMP40]], <4 x i32>* [[TMP42]], align 4
+; O1VEC-NEXT:    [[TMP43:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 36
+; O1VEC-NEXT:    [[TMP44:%.*]] = bitcast i32* [[TMP43]] to <4 x i32>*
+; O1VEC-NEXT:    [[WIDE_LOAD_9:%.*]] = load <4 x i32>, <4 x i32>* [[TMP44]], align 4
+; O1VEC-NEXT:    [[TMP45:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT2]]
+; O1VEC-NEXT:    [[TMP46:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 36
+; O1VEC-NEXT:    [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <4 x i32>*
+; O1VEC-NEXT:    store <4 x i32> [[TMP45]], <4 x i32>* [[TMP47]], align 4
+; O1VEC-NEXT:    [[TMP48:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 40
+; O1VEC-NEXT:    [[TMP49:%.*]] = bitcast i32* [[TMP48]] to <4 x i32>*
+; O1VEC-NEXT:    [[WIDE_LOAD_10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP49]], align 4
+; O1VEC-NEXT:    [[TMP50:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT2]]
+; O1VEC-NEXT:    [[TMP51:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 40
+; O1VEC-NEXT:    [[TMP52:%.*]] = bitcast i32* [[TMP51]] to <4 x i32>*
+; O1VEC-NEXT:    store <4 x i32> [[TMP50]], <4 x i32>* [[TMP52]], align 4
+; O1VEC-NEXT:    [[TMP53:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 44
+; O1VEC-NEXT:    [[TMP54:%.*]] = bitcast i32* [[TMP53]] to <4 x i32>*
+; O1VEC-NEXT:    [[WIDE_LOAD_11:%.*]] = load <4 x i32>, <4 x i32>* [[TMP54]], align 4
+; O1VEC-NEXT:    [[TMP55:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT2]]
+; O1VEC-NEXT:    [[TMP56:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 44
+; O1VEC-NEXT:    [[TMP57:%.*]] = bitcast i32* [[TMP56]] to <4 x i32>*
+; O1VEC-NEXT:    store <4 x i32> [[TMP55]], <4 x i32>* [[TMP57]], align 4
+; O1VEC-NEXT:    [[TMP58:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 48
+; O1VEC-NEXT:    [[TMP59:%.*]] = bitcast i32* [[TMP58]] to <4 x i32>*
+; O1VEC-NEXT:    [[WIDE_LOAD_12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP59]], align 4
+; O1VEC-NEXT:    [[TMP60:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT2]]
+; O1VEC-NEXT:    [[TMP61:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 48
+; O1VEC-NEXT:    [[TMP62:%.*]] = bitcast i32* [[TMP61]] to <4 x i32>*
+; O1VEC-NEXT:    store <4 x i32> [[TMP60]], <4 x i32>* [[TMP62]], align 4
+; O1VEC-NEXT:    [[TMP63:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 52
+; O1VEC-NEXT:    [[TMP64:%.*]] = bitcast i32* [[TMP63]] to <4 x i32>*
+; O1VEC-NEXT:    [[WIDE_LOAD_13:%.*]] = load <4 x i32>, <4 x i32>* [[TMP64]], align 4
+; O1VEC-NEXT:    [[TMP65:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT2]]
+; O1VEC-NEXT:    [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 52
+; O1VEC-NEXT:    [[TMP67:%.*]] = bitcast i32* [[TMP66]] to <4 x i32>*
+; O1VEC-NEXT:    store <4 x i32> [[TMP65]], <4 x i32>* [[TMP67]], align 4
+; O1VEC-NEXT:    [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 56
+; O1VEC-NEXT:    [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>*
+; O1VEC-NEXT:    [[WIDE_LOAD_14:%.*]] = load <4 x i32>, <4 x i32>* [[TMP69]], align 4
+; O1VEC-NEXT:    [[TMP70:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT2]]
+; O1VEC-NEXT:    [[TMP71:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 56
+; O1VEC-NEXT:    [[TMP72:%.*]] = bitcast i32* [[TMP71]] to <4 x i32>*
+; O1VEC-NEXT:    store <4 x i32> [[TMP70]], <4 x i32>* [[TMP72]], align 4
+; O1VEC-NEXT:    [[TMP73:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 60
+; O1VEC-NEXT:    [[TMP74:%.*]] = bitcast i32* [[TMP73]] to <4 x i32>*
+; O1VEC-NEXT:    [[WIDE_LOAD_15:%.*]] = load <4 x i32>, <4 x i32>* [[TMP74]], align 4
+; O1VEC-NEXT:    [[TMP75:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT2]]
+; O1VEC-NEXT:    [[TMP76:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 60
+; O1VEC-NEXT:    [[TMP77:%.*]] = bitcast i32* [[TMP76]] to <4 x i32>*
+; O1VEC-NEXT:    store <4 x i32> [[TMP75]], <4 x i32>* [[TMP77]], align 4
+; O1VEC-NEXT:    [[TMP78:%.*]] = load i32, i32* [[A]], align 4
+; O1VEC-NEXT:    ret i32 [[TMP78]]
+;
+; OzVEC-LABEL: @nopragma(
+; OzVEC-NEXT:  entry:
+; OzVEC-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> undef, i32 [[N:%.*]], i32 0
+; OzVEC-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> undef, <4 x i32> zeroinitializer
+; OzVEC-NEXT:    [[TMP0:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>*
+; OzVEC-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; OzVEC-NEXT:    [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT2]]
+; OzVEC-NEXT:    [[TMP2:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>*
+; OzVEC-NEXT:    store <4 x i32> [[TMP1]], <4 x i32>* [[TMP2]], align 4
+; OzVEC-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 4
+; OzVEC-NEXT:    [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>*
+; OzVEC-NEXT:    [[WIDE_LOAD_1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4
+; OzVEC-NEXT:    [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT2]]
+; OzVEC-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 4
+; OzVEC-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
+; OzVEC-NEXT:    store <4 x i32> [[TMP5]], <4 x i32>* [[TMP7]], align 4
+; OzVEC-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 8
+; OzVEC-NEXT:    [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>*
+; OzVEC-NEXT:    [[WIDE_LOAD_2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4
+; OzVEC-NEXT:    [[TMP10:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT2]]
+; OzVEC-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 8
+; OzVEC-NEXT:    [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>*
+; OzVEC-NEXT:    store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4
+; OzVEC-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 12
+; OzVEC-NEXT:    [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <4 x i32>*
+; OzVEC-NEXT:    [[WIDE_LOAD_3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP14]], align 4
+; OzVEC-NEXT:    [[TMP15:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT2]]
+; OzVEC-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 12
+; OzVEC-NEXT:    [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <4 x i32>*
+; OzVEC-NEXT:    store <4 x i32> [[TMP15]], <4 x i32>* [[TMP17]], align 4
+; OzVEC-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 16
+; OzVEC-NEXT:    [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>*
+; OzVEC-NEXT:    [[WIDE_LOAD_4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP19]], align 4
+; OzVEC-NEXT:    [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT2]]
+; OzVEC-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 16
+; OzVEC-NEXT:    [[TMP22:%.*]] = bitcast i32* [[TMP21]] to <4 x i32>*
+; OzVEC-NEXT:    store <4 x i32> [[TMP20]], <4 x i32>* [[TMP22]], align 4
+; OzVEC-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 20
+; OzVEC-NEXT:    [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <4 x i32>*
+; OzVEC-NEXT:    [[WIDE_LOAD_5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP24]], align 4
+; OzVEC-NEXT:    [[TMP25:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT2]]
+; OzVEC-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 20
+; OzVEC-NEXT:    [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>*
+; OzVEC-NEXT:    store <4 x i32> [[TMP25]], <4 x i32>* [[TMP27]], align 4
+; OzVEC-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 24
+; OzVEC-NEXT:    [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>*
+; OzVEC-NEXT:    [[WIDE_LOAD_6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP29]], align 4
+; OzVEC-NEXT:    [[TMP30:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT2]]
+; OzVEC-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 24
+; OzVEC-NEXT:    [[TMP32:%.*]] = bitcast i32* [[TMP31]] to <4 x i32>*
+; OzVEC-NEXT:    store <4 x i32> [[TMP30]], <4 x i32>* [[TMP32]], align 4
+; OzVEC-NEXT:    [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 28
+; OzVEC-NEXT:    [[TMP34:%.*]] = bitcast i32* [[TMP33]] to <4 x i32>*
+; OzVEC-NEXT:    [[WIDE_LOAD_7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP34]], align 4
+; OzVEC-NEXT:    [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT2]]
+; OzVEC-NEXT:    [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 28
+; OzVEC-NEXT:    [[TMP37:%.*]] = bitcast i32* [[TMP36]] to <4 x i32>*
+; OzVEC-NEXT:    store <4 x i32> [[TMP35]], <4 x i32>* [[TMP37]], align 4
+; OzVEC-NEXT:    [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 32
+; OzVEC-NEXT:    [[TMP39:%.*]] = bitcast i32* [[TMP38]] to <4 x i32>*
+; OzVEC-NEXT:    [[WIDE_LOAD_8:%.*]] = load <4 x i32>, <4 x i32>* [[TMP39]], align 4
+; OzVEC-NEXT:    [[TMP40:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT2]]
+; OzVEC-NEXT:    [[TMP41:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 32
+; OzVEC-NEXT:    [[TMP42:%.*]] = bitcast i32* [[TMP41]] to <4 x i32>*
+; OzVEC-NEXT:    store <4 x i32> [[TMP40]], <4 x i32>* [[TMP42]], align 4
+; OzVEC-NEXT:    [[TMP43:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 36
+; OzVEC-NEXT:    [[TMP44:%.*]] = bitcast i32* [[TMP43]] to <4 x i32>*
+; OzVEC-NEXT:    [[WIDE_LOAD_9:%.*]] = load <4 x i32>, <4 x i32>* [[TMP44]], align 4
+; OzVEC-NEXT:    [[TMP45:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT2]]
+; OzVEC-NEXT:    [[TMP46:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 36
+; OzVEC-NEXT:    [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <4 x i32>*
+; OzVEC-NEXT:    store <4 x i32> [[TMP45]], <4 x i32>* [[TMP47]], align 4
+; OzVEC-NEXT:    [[TMP48:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 40
+; OzVEC-NEXT:    [[TMP49:%.*]] = bitcast i32* [[TMP48]] to <4 x i32>*
+; OzVEC-NEXT:    [[WIDE_LOAD_10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP49]], align 4
+; OzVEC-NEXT:    [[TMP50:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT2]]
+; OzVEC-NEXT:    [[TMP51:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 40
+; OzVEC-NEXT:    [[TMP52:%.*]] = bitcast i32* [[TMP51]] to <4 x i32>*
+; OzVEC-NEXT:    store <4 x i32> [[TMP50]], <4 x i32>* [[TMP52]], align 4
+; OzVEC-NEXT:    [[TMP53:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 44
+; OzVEC-NEXT:    [[TMP54:%.*]] = bitcast i32* [[TMP53]] to <4 x i32>*
+; OzVEC-NEXT:    [[WIDE_LOAD_11:%.*]] = load <4 x i32>, <4 x i32>* [[TMP54]], align 4
+; OzVEC-NEXT:    [[TMP55:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT2]]
+; OzVEC-NEXT:    [[TMP56:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 44
+; OzVEC-NEXT:    [[TMP57:%.*]] = bitcast i32* [[TMP56]] to <4 x i32>*
+; OzVEC-NEXT:    store <4 x i32> [[TMP55]], <4 x i32>* [[TMP57]], align 4
+; OzVEC-NEXT:    [[TMP58:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 48
+; OzVEC-NEXT:    [[TMP59:%.*]] = bitcast i32* [[TMP58]] to <4 x i32>*
+; OzVEC-NEXT:    [[WIDE_LOAD_12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP59]], align 4
+; OzVEC-NEXT:    [[TMP60:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT2]]
+; OzVEC-NEXT:    [[TMP61:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 48
+; OzVEC-NEXT:    [[TMP62:%.*]] = bitcast i32* [[TMP61]] to <4 x i32>*
+; OzVEC-NEXT:    store <4 x i32> [[TMP60]], <4 x i32>* [[TMP62]], align 4
+; OzVEC-NEXT:    [[TMP63:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 52
+; OzVEC-NEXT:    [[TMP64:%.*]] = bitcast i32* [[TMP63]] to <4 x i32>*
+; OzVEC-NEXT:    [[WIDE_LOAD_13:%.*]] = load <4 x i32>, <4 x i32>* [[TMP64]], align 4
+; OzVEC-NEXT:    [[TMP65:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT2]]
+; OzVEC-NEXT:    [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 52
+; OzVEC-NEXT:    [[TMP67:%.*]] = bitcast i32* [[TMP66]] to <4 x i32>*
+; OzVEC-NEXT:    store <4 x i32> [[TMP65]], <4 x i32>* [[TMP67]], align 4
+; OzVEC-NEXT:    [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 56
+; OzVEC-NEXT:    [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>*
+; OzVEC-NEXT:    [[WIDE_LOAD_14:%.*]] = load <4 x i32>, <4 x i32>* [[TMP69]], align 4
+; OzVEC-NEXT:    [[TMP70:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT2]]
+; OzVEC-NEXT:    [[TMP71:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 56
+; OzVEC-NEXT:    [[TMP72:%.*]] = bitcast i32* [[TMP71]] to <4 x i32>*
+; OzVEC-NEXT:    store <4 x i32> [[TMP70]], <4 x i32>* [[TMP72]], align 4
+; OzVEC-NEXT:    [[TMP73:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 60
+; OzVEC-NEXT:    [[TMP74:%.*]] = bitcast i32* [[TMP73]] to <4 x i32>*
+; OzVEC-NEXT:    [[WIDE_LOAD_15:%.*]] = load <4 x i32>, <4 x i32>* [[TMP74]], align 4
+; OzVEC-NEXT:    [[TMP75:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT2]]
+; OzVEC-NEXT:    [[TMP76:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 60
+; OzVEC-NEXT:    [[TMP77:%.*]] = bitcast i32* [[TMP76]] to <4 x i32>*
+; OzVEC-NEXT:    store <4 x i32> [[TMP75]], <4 x i32>* [[TMP77]], align 4
+; OzVEC-NEXT:    [[TMP78:%.*]] = load i32, i32* [[A]], align 4
+; OzVEC-NEXT:    ret i32 [[TMP78]]
+;
+; O1VEC2-LABEL: @nopragma(
+; O1VEC2-NEXT:  entry:
+; O1VEC2-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; O1VEC2:       vector.ph:
+; O1VEC2-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> undef, i32 [[N:%.*]], i32 0
+; O1VEC2-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> undef, <4 x i32> zeroinitializer
+; O1VEC2-NEXT:    br label [[VECTOR_BODY:%.*]]
+; O1VEC2:       vector.body:
+; O1VEC2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; O1VEC2-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> undef, i64 [[INDEX]], i32 0
+; O1VEC2-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> undef, <4 x i32> zeroinitializer
+; O1VEC2-NEXT:    [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3>
+; O1VEC2-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; O1VEC2-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[TMP0]]
+; O1VEC2-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0
+; O1VEC2-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
+; O1VEC2-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
+; O1VEC2-NEXT:    [[TMP4:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT2]]
+; O1VEC2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP0]]
+; O1VEC2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i32 0
+; O1VEC2-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
+; O1VEC2-NEXT:    store <4 x i32> [[TMP4]], <4 x i32>* [[TMP7]], align 4
+; O1VEC2-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
+; O1VEC2-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64
+; O1VEC2-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
+; O1VEC2:       middle.block:
+; O1VEC2-NEXT:    [[CMP_N:%.*]] = icmp eq i64 64, 64
+; O1VEC2-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; O1VEC2:       scalar.ph:
+; O1VEC2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 64, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; O1VEC2-NEXT:    br label [[FOR_BODY:%.*]]
+; O1VEC2:       for.body:
+; O1VEC2-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; O1VEC2-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]]
+; O1VEC2-NEXT:    [[TMP9:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; O1VEC2-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP9]], [[N]]
+; O1VEC2-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
+; O1VEC2-NEXT:    store i32 [[ADD]], i32* [[ARRAYIDX2]], align 4
+; O1VEC2-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; O1VEC2-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 64
+; O1VEC2-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !2
+; O1VEC2:       for.end:
+; O1VEC2-NEXT:    [[TMP10:%.*]] = load i32, i32* [[A]], align 4
+; O1VEC2-NEXT:    ret i32 [[TMP10]]
+;
+; OzVEC2-LABEL: @nopragma(
+; OzVEC2-NEXT:  entry:
+; OzVEC2-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; OzVEC2:       vector.ph:
+; OzVEC2-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> undef, i32 [[N:%.*]], i32 0
+; OzVEC2-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> undef, <4 x i32> zeroinitializer
+; OzVEC2-NEXT:    br label [[VECTOR_BODY:%.*]]
+; OzVEC2:       vector.body:
+; OzVEC2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; OzVEC2-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> undef, i64 [[INDEX]], i32 0
+; OzVEC2-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> undef, <4 x i32> zeroinitializer
+; OzVEC2-NEXT:    [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3>
+; OzVEC2-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; OzVEC2-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[TMP0]]
+; OzVEC2-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0
+; OzVEC2-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
+; OzVEC2-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
+; OzVEC2-NEXT:    [[TMP4:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT2]]
+; OzVEC2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP0]]
+; OzVEC2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i32 0
+; OzVEC2-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
+; OzVEC2-NEXT:    store <4 x i32> [[TMP4]], <4 x i32>* [[TMP7]], align 4
+; OzVEC2-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
+; OzVEC2-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64
+; OzVEC2-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
+; OzVEC2:       middle.block:
+; OzVEC2-NEXT:    [[CMP_N:%.*]] = icmp eq i64 64, 64
+; OzVEC2-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; OzVEC2:       scalar.ph:
+; OzVEC2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 64, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; OzVEC2-NEXT:    br label [[FOR_BODY:%.*]]
+; OzVEC2:       for.body:
+; OzVEC2-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; OzVEC2-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]]
+; OzVEC2-NEXT:    [[TMP9:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; OzVEC2-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP9]], [[N]]
+; OzVEC2-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
+; OzVEC2-NEXT:    store i32 [[ADD]], i32* [[ARRAYIDX2]], align 4
+; OzVEC2-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; OzVEC2-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 64
+; OzVEC2-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !2
+; OzVEC2:       for.end:
+; OzVEC2-NEXT:    [[TMP10:%.*]] = load i32, i32* [[A]], align 4
+; OzVEC2-NEXT:    ret i32 [[TMP10]]
+;
+; O3DIS-LABEL: @nopragma(
+; O3DIS-NEXT:  entry:
+; O3DIS-NEXT:    br label [[FOR_BODY:%.*]]
+; O3DIS:       for.body:
+; O3DIS-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; O3DIS-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDVARS_IV]]
+; O3DIS-NEXT:    [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; O3DIS-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP0]], [[N:%.*]]
+; O3DIS-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDVARS_IV]]
+; O3DIS-NEXT:    store i32 [[ADD]], i32* [[ARRAYIDX2]], align 4
+; O3DIS-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; O3DIS-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 64
+; O3DIS-NEXT:    br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]]
+; O3DIS:       for.end:
+; O3DIS-NEXT:    [[TMP1:%.*]] = load i32, i32* [[A]], align 4
+; O3DIS-NEXT:    ret i32 [[TMP1]]
+;
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i32, i32* %b, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %add = add nsw i32 %0, %N
+  %arrayidx2 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+  store i32 %add, i32* %arrayidx2, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 64
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  %1 = load i32, i32* %a, align 4
+  ret i32 %1
+}
+
+define i32 @disabled(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32 %N) {
+; O1-LABEL: @disabled(
+; O1-NEXT:  entry:
+; O1-NEXT:    br label [[FOR_BODY:%.*]]
+; O1:       for.body:
+; O1-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; O1-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDVARS_IV]]
+; O1-NEXT:    [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; O1-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP0]], [[N:%.*]]
+; O1-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDVARS_IV]]
+; O1-NEXT:    store i32 [[ADD]], i32* [[ARRAYIDX2]], align 4
+; O1-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; O1-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 48
+; O1-NEXT:    br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop !0
+; O1:       for.end:
+; O1-NEXT:    [[TMP1:%.*]] = load i32, i32* [[A]], align 4
+; O1-NEXT:    ret i32 [[TMP1]]
+;
+; O2-LABEL: @disabled(
+; O2-NEXT:  entry:
+; O2-NEXT:    br label [[FOR_BODY:%.*]]
+; O2:       for.body:
+; O2-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; O2-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDVARS_IV]]
+; O2-NEXT:    [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; O2-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP0]], [[N:%.*]]
+; O2-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDVARS_IV]]
+; O2-NEXT:    store i32 [[ADD]], i32* [[ARRAYIDX2]], align 4
+; O2-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; O2-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 48
+; O2-NEXT:    br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop !0
+; O2:       for.end:
+; O2-NEXT:    [[TMP1:%.*]] = load i32, i32* [[A]], align 4
+; O2-NEXT:    ret i32 [[TMP1]]
+;
+; O3-LABEL: @disabled(
+; O3-NEXT:  entry:
+; O3-NEXT:    br label [[FOR_BODY:%.*]]
+; O3:       for.body:
+; O3-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; O3-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDVARS_IV]]
+; O3-NEXT:    [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; O3-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP0]], [[N:%.*]]
+; O3-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDVARS_IV]]
+; O3-NEXT:    store i32 [[ADD]], i32* [[ARRAYIDX2]], align 4
+; O3-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; O3-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 48
+; O3-NEXT:    br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop !0
+; O3:       for.end:
+; O3-NEXT:    [[TMP1:%.*]] = load i32, i32* [[A]], align 4
+; O3-NEXT:    ret i32 [[TMP1]]
+;
+; O3DEFAULT-LABEL: @disabled(
+; O3DEFAULT-NEXT:  entry:
+; O3DEFAULT-NEXT:    [[TMP0:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>*
+; O3DEFAULT-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; O3DEFAULT-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> undef, i32 [[N:%.*]], i32 0
+; O3DEFAULT-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <4 x i32> zeroinitializer
+; O3DEFAULT-NEXT:    [[TMP4:%.*]] = add nsw <4 x i32> [[TMP1]], [[TMP3]]
+; O3DEFAULT-NEXT:    [[TMP5:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>*
+; O3DEFAULT-NEXT:    store <4 x i32> [[TMP4]], <4 x i32>* [[TMP5]], align 4
+; O3DEFAULT-NEXT:    [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 4
+; O3DEFAULT-NEXT:    [[ARRAYIDX2_4:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 4
+; O3DEFAULT-NEXT:    [[TMP6:%.*]] = bitcast i32* [[ARRAYIDX_4]] to <4 x i32>*
+; O3DEFAULT-NEXT:    [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP6]], align 4
+; O3DEFAULT-NEXT:    [[TMP8:%.*]] = add nsw <4 x i32> [[TMP7]], [[TMP3]]
+; O3DEFAULT-NEXT:    [[TMP9:%.*]] = bitcast i32* [[ARRAYIDX2_4]] to <4 x i32>*
+; O3DEFAULT-NEXT:    store <4 x i32> [[TMP8]], <4 x i32>* [[TMP9]], align 4
+; O3DEFAULT-NEXT:    [[ARRAYIDX_8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 8
+; O3DEFAULT-NEXT:    [[ARRAYIDX2_8:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 8
+; O3DEFAULT-NEXT:    [[TMP10:%.*]] = bitcast i32* [[ARRAYIDX_8]] to <4 x i32>*
+; O3DEFAULT-NEXT:    [[TMP11:%.*]] = load <4 x i32>, <4 x i32>* [[TMP10]], align 4
+; O3DEFAULT-NEXT:    [[TMP12:%.*]] = add nsw <4 x i32> [[TMP11]], [[TMP3]]
+; O3DEFAULT-NEXT:    [[TMP13:%.*]] = bitcast i32* [[ARRAYIDX2_8]] to <4 x i32>*
+; O3DEFAULT-NEXT:    store <4 x i32> [[TMP12]], <4 x i32>* [[TMP13]], align 4
+; O3DEFAULT-NEXT:    [[ARRAYIDX_12:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 12
+; O3DEFAULT-NEXT:    [[ARRAYIDX2_12:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 12
+; O3DEFAULT-NEXT:    [[TMP14:%.*]] = bitcast i32* [[ARRAYIDX_12]] to <4 x i32>*
+; O3DEFAULT-NEXT:    [[TMP15:%.*]] = load <4 x i32>, <4 x i32>* [[TMP14]], align 4
+; O3DEFAULT-NEXT:    [[TMP16:%.*]] = add nsw <4 x i32> [[TMP15]], [[TMP3]]
+; O3DEFAULT-NEXT:    [[TMP17:%.*]] = bitcast i32* [[ARRAYIDX2_12]] to <4 x i32>*
+; O3DEFAULT-NEXT:    store <4 x i32> [[TMP16]], <4 x i32>* [[TMP17]], align 4
+; O3DEFAULT-NEXT:    [[ARRAYIDX_16:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 16
+; O3DEFAULT-NEXT:    [[ARRAYIDX2_16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 16
+; O3DEFAULT-NEXT:    [[TMP18:%.*]] = bitcast i32* [[ARRAYIDX_16]] to <4 x i32>*
+; O3DEFAULT-NEXT:    [[TMP19:%.*]] = load <4 x i32>, <4 x i32>* [[TMP18]], align 4
+; O3DEFAULT-NEXT:    [[TMP20:%.*]] = add nsw <4 x i32> [[TMP19]], [[TMP3]]
+; O3DEFAULT-NEXT:    [[TMP21:%.*]] = bitcast i32* [[ARRAYIDX2_16]] to <4 x i32>*
+; O3DEFAULT-NEXT:    store <4 x i32> [[TMP20]], <4 x i32>* [[TMP21]], align 4
+; O3DEFAULT-NEXT:    [[ARRAYIDX_20:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 20
+; O3DEFAULT-NEXT:    [[ARRAYIDX2_20:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 20
+; O3DEFAULT-NEXT:    [[TMP22:%.*]] = bitcast i32* [[ARRAYIDX_20]] to <4 x i32>*
+; O3DEFAULT-NEXT:    [[TMP23:%.*]] = load <4 x i32>, <4 x i32>* [[TMP22]], align 4
+; O3DEFAULT-NEXT:    [[TMP24:%.*]] = add nsw <4 x i32> [[TMP23]], [[TMP3]]
+; O3DEFAULT-NEXT:    [[TMP25:%.*]] = bitcast i32* [[ARRAYIDX2_20]] to <4 x i32>*
+; O3DEFAULT-NEXT:    store <4 x i32> [[TMP24]], <4 x i32>* [[TMP25]], align 4
+; O3DEFAULT-NEXT:    [[ARRAYIDX_24:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 24
+; O3DEFAULT-NEXT:    [[ARRAYIDX2_24:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 24
+; O3DEFAULT-NEXT:    [[TMP26:%.*]] = bitcast i32* [[ARRAYIDX_24]] to <4 x i32>*
+; O3DEFAULT-NEXT:    [[TMP27:%.*]] = load <4 x i32>, <4 x i32>* [[TMP26]], align 4
+; O3DEFAULT-NEXT:    [[TMP28:%.*]] = add nsw <4 x i32> [[TMP27]], [[TMP3]]
+; O3DEFAULT-NEXT:    [[TMP29:%.*]] = bitcast i32* [[ARRAYIDX2_24]] to <4 x i32>*
+; O3DEFAULT-NEXT:    store <4 x i32> [[TMP28]], <4 x i32>* [[TMP29]], align 4
+; O3DEFAULT-NEXT:    [[ARRAYIDX_28:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 28
+; O3DEFAULT-NEXT:    [[ARRAYIDX2_28:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 28
+; O3DEFAULT-NEXT:    [[TMP30:%.*]] = bitcast i32* [[ARRAYIDX_28]] to <4 x i32>*
+; O3DEFAULT-NEXT:    [[TMP31:%.*]] = load <4 x i32>, <4 x i32>* [[TMP30]], align 4
+; O3DEFAULT-NEXT:    [[TMP32:%.*]] = add nsw <4 x i32> [[TMP31]], [[TMP3]]
+; O3DEFAULT-NEXT:    [[TMP33:%.*]] = bitcast i32* [[ARRAYIDX2_28]] to <4 x i32>*
+; O3DEFAULT-NEXT:    store <4 x i32> [[TMP32]], <4 x i32>* [[TMP33]], align 4
+; O3DEFAULT-NEXT:    [[ARRAYIDX_32:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 32
+; O3DEFAULT-NEXT:    [[ARRAYIDX2_32:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 32
+; O3DEFAULT-NEXT:    [[TMP34:%.*]] = bitcast i32* [[ARRAYIDX_32]] to <4 x i32>*
+; O3DEFAULT-NEXT:    [[TMP35:%.*]] = load <4 x i32>, <4 x i32>* [[TMP34]], align 4
+; O3DEFAULT-NEXT:    [[TMP36:%.*]] = add nsw <4 x i32> [[TMP35]], [[TMP3]]
+; O3DEFAULT-NEXT:    [[TMP37:%.*]] = bitcast i32* [[ARRAYIDX2_32]] to <4 x i32>*
+; O3DEFAULT-NEXT:    store <4 x i32> [[TMP36]], <4 x i32>* [[TMP37]], align 4
+; O3DEFAULT-NEXT:    [[ARRAYIDX_36:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 36
+; O3DEFAULT-NEXT:    [[ARRAYIDX2_36:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 36
+; O3DEFAULT-NEXT:    [[TMP38:%.*]] = bitcast i32* [[ARRAYIDX_36]] to <4 x i32>*
+; O3DEFAULT-NEXT:    [[TMP39:%.*]] = load <4 x i32>, <4 x i32>* [[TMP38]], align 4
+; O3DEFAULT-NEXT:    [[TMP40:%.*]] = add nsw <4 x i32> [[TMP39]], [[TMP3]]
+; O3DEFAULT-NEXT:    [[TMP41:%.*]] = bitcast i32* [[ARRAYIDX2_36]] to <4 x i32>*
+; O3DEFAULT-NEXT:    store <4 x i32> [[TMP40]], <4 x i32>* [[TMP41]], align 4
+; O3DEFAULT-NEXT:    [[ARRAYIDX_40:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 40
+; O3DEFAULT-NEXT:    [[ARRAYIDX2_40:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 40
+; O3DEFAULT-NEXT:    [[TMP42:%.*]] = bitcast i32* [[ARRAYIDX_40]] to <4 x i32>*
+; O3DEFAULT-NEXT:    [[TMP43:%.*]] = load <4 x i32>, <4 x i32>* [[TMP42]], align 4
+; O3DEFAULT-NEXT:    [[TMP44:%.*]] = add nsw <4 x i32> [[TMP43]], [[TMP3]]
+; O3DEFAULT-NEXT:    [[TMP45:%.*]] = bitcast i32* [[ARRAYIDX2_40]] to <4 x i32>*
+; O3DEFAULT-NEXT:    store <4 x i32> [[TMP44]], <4 x i32>* [[TMP45]], align 4
+; O3DEFAULT-NEXT:    [[ARRAYIDX_44:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 44
+; O3DEFAULT-NEXT:    [[ARRAYIDX2_44:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 44
+; O3DEFAULT-NEXT:    [[TMP46:%.*]] = bitcast i32* [[ARRAYIDX_44]] to <4 x i32>*
+; O3DEFAULT-NEXT:    [[TMP47:%.*]] = load <4 x i32>, <4 x i32>* [[TMP46]], align 4
+; O3DEFAULT-NEXT:    [[TMP48:%.*]] = add nsw <4 x i32> [[TMP47]], [[TMP3]]
+; O3DEFAULT-NEXT:    [[TMP49:%.*]] = bitcast i32* [[ARRAYIDX2_44]] to <4 x i32>*
+; O3DEFAULT-NEXT:    store <4 x i32> [[TMP48]], <4 x i32>* [[TMP49]], align 4
+; O3DEFAULT-NEXT:    [[TMP50:%.*]] = load i32, i32* [[A]], align 4
+; O3DEFAULT-NEXT:    ret i32 [[TMP50]]
+;
+; Os-LABEL: @disabled(
+; Os-NEXT:  entry:
+; Os-NEXT:    br label [[FOR_BODY:%.*]]
+; Os:       for.body:
+; Os-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; Os-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDVARS_IV]]
+; Os-NEXT:    [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; Os-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP0]], [[N:%.*]]
+; Os-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDVARS_IV]]
+; Os-NEXT:    store i32 [[ADD]], i32* [[ARRAYIDX2]], align 4
+; Os-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; Os-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 48
+; Os-NEXT:    br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop !0
+; Os:       for.end:
+; Os-NEXT:    [[TMP1:%.*]] = load i32, i32* [[A]], align 4
+; Os-NEXT:    ret i32 [[TMP1]]
+;
+; Oz-LABEL: @disabled(
+; Oz-NEXT:  entry:
+; Oz-NEXT:    br label [[FOR_BODY:%.*]]
+; Oz:       for.body:
+; Oz-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; Oz-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDVARS_IV]]
+; Oz-NEXT:    [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; Oz-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP0]], [[N:%.*]]
+; Oz-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDVARS_IV]]
+; Oz-NEXT:    store i32 [[ADD]], i32* [[ARRAYIDX2]], align 4
+; Oz-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; Oz-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 48
+; Oz-NEXT:    br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop !0
+; Oz:       for.end:
+; Oz-NEXT:    [[TMP1:%.*]] = load i32, i32* [[A]], align 4
+; Oz-NEXT:    ret i32 [[TMP1]]
+;
+; O1VEC-LABEL: @disabled(
+; O1VEC-NEXT:  entry:
+; O1VEC-NEXT:    br label [[FOR_BODY:%.*]]
+; O1VEC:       for.body:
+; O1VEC-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; O1VEC-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDVARS_IV]]
+; O1VEC-NEXT:    [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; O1VEC-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP0]], [[N:%.*]]
+; O1VEC-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDVARS_IV]]
+; O1VEC-NEXT:    store i32 [[ADD]], i32* [[ARRAYIDX2]], align 4
+; O1VEC-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; O1VEC-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 48
+; O1VEC-NEXT:    br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop !0
+; O1VEC:       for.end:
+; O1VEC-NEXT:    [[TMP1:%.*]] = load i32, i32* [[A]], align 4
+; O1VEC-NEXT:    ret i32 [[TMP1]]
+;
+; OzVEC-LABEL: @disabled(
+; OzVEC-NEXT:  entry:
+; OzVEC-NEXT:    br label [[FOR_BODY:%.*]]
+; OzVEC:       for.body:
+; OzVEC-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; OzVEC-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDVARS_IV]]
+; OzVEC-NEXT:    [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; OzVEC-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP0]], [[N:%.*]]
+; OzVEC-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDVARS_IV]]
+; OzVEC-NEXT:    store i32 [[ADD]], i32* [[ARRAYIDX2]], align 4
+; OzVEC-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; OzVEC-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 48
+; OzVEC-NEXT:    br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop !0
+; OzVEC:       for.end:
+; OzVEC-NEXT:    [[TMP1:%.*]] = load i32, i32* [[A]], align 4
+; OzVEC-NEXT:    ret i32 [[TMP1]]
+;
+; O1VEC2-LABEL: @disabled(
+; O1VEC2-NEXT:  entry:
+; O1VEC2-NEXT:    br label [[FOR_BODY:%.*]]
+; O1VEC2:       for.body:
+; O1VEC2-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; O1VEC2-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDVARS_IV]]
+; O1VEC2-NEXT:    [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; O1VEC2-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP0]], [[N:%.*]]
+; O1VEC2-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDVARS_IV]]
+; O1VEC2-NEXT:    store i32 [[ADD]], i32* [[ARRAYIDX2]], align 4
+; O1VEC2-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; O1VEC2-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 48
+; O1VEC2-NEXT:    br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop !4
+; O1VEC2:       for.end:
+; O1VEC2-NEXT:    [[TMP1:%.*]] = load i32, i32* [[A]], align 4
+; O1VEC2-NEXT:    ret i32 [[TMP1]]
+;
+; OzVEC2-LABEL: @disabled(
+; OzVEC2-NEXT:  entry:
+; OzVEC2-NEXT:    br label [[FOR_BODY:%.*]]
+; OzVEC2:       for.body:
+; OzVEC2-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; OzVEC2-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDVARS_IV]]
+; OzVEC2-NEXT:    [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; OzVEC2-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP0]], [[N:%.*]]
+; OzVEC2-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDVARS_IV]]
+; OzVEC2-NEXT:    store i32 [[ADD]], i32* [[ARRAYIDX2]], align 4
+; OzVEC2-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; OzVEC2-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 48
+; OzVEC2-NEXT:    br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop !4
+; OzVEC2:       for.end:
+; OzVEC2-NEXT:    [[TMP1:%.*]] = load i32, i32* [[A]], align 4
+; OzVEC2-NEXT:    ret i32 [[TMP1]]
+;
+; O3DIS-LABEL: @disabled(
+; O3DIS-NEXT:  entry:
+; O3DIS-NEXT:    br label [[FOR_BODY:%.*]]
+; O3DIS:       for.body:
+; O3DIS-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; O3DIS-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDVARS_IV]]
+; O3DIS-NEXT:    [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; O3DIS-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP0]], [[N:%.*]]
+; O3DIS-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDVARS_IV]]
+; O3DIS-NEXT:    store i32 [[ADD]], i32* [[ARRAYIDX2]], align 4
+; O3DIS-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; O3DIS-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 48
+; O3DIS-NEXT:    br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop !0
+; O3DIS:       for.end:
+; O3DIS-NEXT:    [[TMP1:%.*]] = load i32, i32* [[A]], align 4
+; O3DIS-NEXT:    ret i32 [[TMP1]]
+;
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i32, i32* %b, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %add = add nsw i32 %0, %N
+  %arrayidx2 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+  store i32 %add, i32* %arrayidx2, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 48
+  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !2
+
+for.end:                                          ; preds = %for.body
+  %1 = load i32, i32* %a, align 4
+  ret i32 %1
+}
+
+!0 = !{!0, !1}
+!1 = !{!"llvm.loop.vectorize.enable", i1 1}
+!2 = !{!2, !3}
+!3 = !{!"llvm.loop.vectorize.enable", i1 0}
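For readers less familiar with these hints: the !0/!1 and !2/!3 metadata above are what clang's loop pragmas typically lower to. A minimal C sketch (not part of the test, function names are illustrative, and it assumes the usual "#pragma clang loop" lowering) of sources that would produce the two forms:

  void with_hint(int *a, const int *b, int n) {
  #pragma clang loop vectorize(enable)   /* assumed to lower to !{"llvm.loop.vectorize.enable", i1 1} */
    for (int i = 0; i < 64; i++)
      a[i] = b[i] + n;
  }

  void without_vectorization(int *a, const int *b, int n) {
  #pragma clang loop vectorize(disable)  /* assumed to lower to the i1 0 form used on @disabled */
    for (int i = 0; i < 48; i++)
      a[i] = b[i] + n;
  }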

Added: llvm/trunk/test/Transforms/LoopVectorize/X86/min-trip-count-switch.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/X86/min-trip-count-switch.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/X86/min-trip-count-switch.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/X86/min-trip-count-switch.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,24 @@
+; RUN: opt < %s  -loop-vectorize -force-vector-interleave=1 -vectorizer-min-trip-count=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; CHECK: <4 x float>
+define void @trivial_loop(float* nocapture %a) nounwind uwtable optsize {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %a, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %add = fadd float %0, 1.000000e+00
+  store float %add, float* %arrayidx, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 8
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
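The RUN line above forces vectorization of a loop with a trip count of only 8 by dropping the minimum-trip-count threshold to 1. A rough C equivalent of @trivial_loop (a sketch reconstructed from the IR, not the original source):

  void trivial_loop(float *a) {
    /* only 8 iterations; vectorized here only because
       -vectorizer-min-trip-count=1 lowers the threshold */
    for (int i = 0; i < 8; i++)
      a[i] += 1.0f;
  }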

Added: llvm/trunk/test/Transforms/LoopVectorize/X86/mul_slm_16bit.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/X86/mul_slm_16bit.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/X86/mul_slm_16bit.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/X86/mul_slm_16bit.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,145 @@
+; REQUIRES: asserts
+; RUN: opt < %s -S -debug -loop-vectorize -mcpu=slm 2>&1 | FileCheck %s --check-prefix=SLM
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define i8 @mul_i8(i8* %dataA, i8* %dataB, i32 %N) {
+entry:
+  %cmp12 = icmp eq i32 %N, 0
+  br i1 %cmp12, label %for.cond.cleanup, label %for.body.preheader
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %N to i64
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %for.body
+  %phitmp = trunc i32 %add4 to i8
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  %acc.0.lcssa = phi i8 [ 0, %entry ], [ %phitmp, %for.cond.cleanup.loopexit ]
+  ret i8 %acc.0.lcssa
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
+  %acc.013 = phi i32 [ %add4, %for.body ], [ 0, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i8, i8* %dataA, i64 %indvars.iv
+  %0 = load i8, i8* %arrayidx, align 1
+  %conv = sext i8 %0 to i32
+  %arrayidx2 = getelementptr inbounds i8, i8* %dataB, i64 %indvars.iv
+  %1 = load i8, i8* %arrayidx2, align 1
+  %conv3 = sext i8 %1 to i32
+; sources of the mul are sext\sext from i8
+; use pmullw\sext seq.   
+; SLM:  cost of 3 for VF 4 {{.*}} mul nsw i32  
+  %mul = mul nsw i32 %conv3, %conv
+; sources of the mul are zext\sext from i8
+; use pmulhw\pmullw\pshuf
+; SLM:  cost of 5 for VF 4 {{.*}} mul nsw i32
+  %conv4 = zext i8 %1 to i32
+  %mul2 = mul nsw i32 %conv4, %conv
+  %sum0 = add i32 %mul, %mul2
+; sources of the mul are zext\zext from i8
+; use pmullw\zext
+; SLM:  cost of 3 for VF 4 {{.*}} mul nsw i32
+  %conv5 = zext i8 %0 to i32
+  %mul3 = mul nsw i32 %conv5, %conv4
+  %sum1 = add i32 %sum0, %mul3
+; sources of the mul are sext\-120
+; use pmullw\sext
+; SLM:  cost of 3 for VF 4 {{.*}} mul nsw i32
+  %mul4 = mul nsw i32 -120, %conv3
+  %sum2 = add i32 %sum1, %mul4
+; sources of the mul are sext\250
+; use pmulhw\pmullw\pshuf
+; SLM:  cost of 5 for VF 4 {{.*}} mul nsw i32
+  %mul5 = mul nsw i32 250, %conv3
+  %sum3 = add i32 %sum2, %mul5
+; sources of the mul are zext\-120
+; use pmulhw\pmullw\pshuf
+; SLM:  cost of 5 for VF 4 {{.*}} mul nsw i32
+  %mul6 = mul nsw i32 -120, %conv4
+  %sum4 = add i32 %sum3, %mul6
+; sources of the mul are zext\250
+; use pmullw\zext
+; SLM:  cost of 3 for VF 4 {{.*}} mul nsw i32
+  %mul7 = mul nsw i32 250, %conv4
+  %sum5 = add i32 %sum4, %mul7
+  %add = add i32 %acc.013, 5
+  %add4 = add i32 %add, %sum5
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+define i16 @mul_i16(i16* %dataA, i16* %dataB, i32 %N) {
+entry:
+  %cmp12 = icmp eq i32 %N, 0
+  br i1 %cmp12, label %for.cond.cleanup, label %for.body.preheader
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %N to i64
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %for.body
+  %phitmp = trunc i32 %add4 to i16
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  %acc.0.lcssa = phi i16 [ 0, %entry ], [ %phitmp, %for.cond.cleanup.loopexit ]
+  ret i16 %acc.0.lcssa
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
+  %acc.013 = phi i32 [ %add4, %for.body ], [ 0, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i16, i16* %dataA, i64 %indvars.iv
+  %0 = load i16, i16* %arrayidx, align 1
+  %conv = sext i16 %0 to i32
+  %arrayidx2 = getelementptr inbounds i16, i16* %dataB, i64 %indvars.iv
+  %1 = load i16, i16* %arrayidx2, align 1
+  %conv3 = sext i16 %1 to i32
+; sources of the mul are sext\sext from i16
+; use pmulhw\pmullw\pshuf seq.   
+; SLM:  cost of 5 for VF 4 {{.*}} mul nsw i32  
+  %mul = mul nsw i32 %conv3, %conv
+; sources of the mul are zext\sext from i16
+; use pmulld
+; SLM:  cost of 11 for VF 4 {{.*}} mul nsw i32
+  %conv4 = zext i16 %1 to i32
+  %mul2 = mul nsw i32 %conv4, %conv
+  %sum0 = add i32 %mul, %mul2
+; sources of the mul are zext\zext from i16
+; use pmulhw\pmullw\zext
+; SLM:  cost of 5 for VF 4 {{.*}} mul nsw i32
+  %conv5 = zext i16 %0 to i32
+  %mul3 = mul nsw i32 %conv5, %conv4
+  %sum1 = add i32 %sum0, %mul3
+; sources of the mul are sext\-32000
+; use pmulhw\pmullw\sext
+; SLM:  cost of 5 for VF 4 {{.*}} mul nsw i32
+  %mul4 = mul nsw i32 -32000, %conv3
+  %sum2 = add i32 %sum1, %mul4
+; sources of the mul are sext\64000
+; use pmulld
+; SLM:  cost of 11 for VF 4 {{.*}} mul nsw i32
+  %mul5 = mul nsw i32 64000, %conv3
+  %sum3 = add i32 %sum2, %mul5
+; sources of the mul are zext\-32000
+; use pmulld
+; SLM:  cost of 11 for VF 4 {{.*}} mul nsw i32
+  %mul6 = mul nsw i32 -32000, %conv4
+  %sum4 = add i32 %sum3, %mul6
+; sources of the mul are zext\64000
+; use pmulhw\pmullw\zext
+; SLM:  cost of 5 for VF 4 {{.*}} mul nsw i32
+  %mul7 = mul nsw i32 250, %conv4
+  %sum5 = add i32 %sum4, %mul7
+  %add = add i32 %acc.013, 5
+  %add4 = add i32 %add, %sum5
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+
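The cost-model comments in @mul_i8 are easier to follow next to the source shape they came from. A hedged C reconstruction of the i8 kernel (derived from the IR, not the original source): each product widens its i8 operands differently, and the SLM checks assert that the sext/sext, zext/zext, and in-range-constant cases get the cheaper pmullw-based sequence (cost 3) while the mixed-sign or out-of-range-constant cases need the pmulhw/pmullw/pshuf sequence (cost 5).

  char mul_i8(const char *dataA, const char *dataB, unsigned N) {
    int acc = 0;
    for (unsigned i = 0; i < N; i++) {
      int sa = (signed char)dataA[i];    /* sext */
      int sb = (signed char)dataB[i];    /* sext */
      int zb = (unsigned char)dataB[i];  /* zext */
      int za = (unsigned char)dataA[i];  /* zext */
      int sum = sa * sb        /* sext*sext  -> cost 3 */
              + zb * sa        /* zext*sext  -> cost 5 */
              + za * zb        /* zext*zext  -> cost 3 */
              + (-120) * sb    /* sext*-120  -> cost 3 */
              + 250 * sb       /* sext*250   -> cost 5 */
              + (-120) * zb    /* zext*-120  -> cost 5 */
              + 250 * zb;      /* zext*250   -> cost 3 */
      acc += 5 + sum;
    }
    return (char)acc;
  }

The i16 kernel in @mul_i16 follows the same pattern, with pmulld (cost 11) taking the place of the pshuf-based sequence for the cases that no longer fit a 16-bit multiply.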

Added: llvm/trunk/test/Transforms/LoopVectorize/X86/no-vector.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/X86/no-vector.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/X86/no-vector.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/X86/no-vector.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,22 @@
+; RUN: opt -S -mtriple=i386-unknown-freebsd -mcpu=i486 -loop-vectorize < %s
+
+define i32 @PR14639(i8* nocapture %s, i32 %len) nounwind {
+entry:
+  %cmp4 = icmp sgt i32 %len, 0
+  br i1 %cmp4, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.06 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+  %r.05 = phi i32 [ %xor, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i8, i8* %s, i32 %i.06
+  %0 = load i8, i8* %arrayidx, align 1
+  %conv = sext i8 %0 to i32
+  %xor = xor i32 %conv, %r.05
+  %inc = add nsw i32 %i.06, 1
+  %exitcond = icmp eq i32 %inc, %len
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  %r.0.lcssa = phi i32 [ 0, %entry ], [ %xor, %for.body ]
+  ret i32 %r.0.lcssa
+}
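@PR14639 has no FileCheck lines: the RUN line only checks that opt completes without crashing when targeting an i486, which has no vector registers to vectorize for. A rough C form of the reduction (a sketch from the IR):

  int PR14639(const char *s, int len) {
    int r = 0;
    for (int i = 0; i < len; i++)
      r ^= (signed char)s[i];   /* sign-extended byte folded into an xor reduction */
    return r;
  }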

Added: llvm/trunk/test/Transforms/LoopVectorize/X86/no_fpmath.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/X86/no_fpmath.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/X86/no_fpmath.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/X86/no_fpmath.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,109 @@
+; RUN: opt < %s -loop-vectorize -mtriple=x86_64-unknown-linux -S -pass-remarks='loop-vectorize' -pass-remarks-missed='loop-vectorize' -pass-remarks-analysis='loop-vectorize' 2>&1 | FileCheck %s
+
+; CHECK: remark: no_fpmath.c:6:11: loop not vectorized: cannot prove it is safe to reorder floating-point operations
+; CHECK: remark: no_fpmath.c:6:14: loop not vectorized
+; CHECK: remark: no_fpmath.c:17:14: vectorized loop (vectorization width: 2, interleaved count: 2)
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.10.0"
+
+; Function Attrs: nounwind readonly ssp uwtable
+define double @cond_sum(i32* nocapture readonly %v, i32 %n) #0 !dbg !4 {
+entry:
+  %cmp.7 = icmp sgt i32 %n, 0, !dbg !3
+  br i1 %cmp.7, label %for.body.preheader, label %for.cond.cleanup, !dbg !8
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body, !dbg !9
+
+for.cond.cleanup.loopexit:                        ; preds = %for.body
+  %add.lcssa = phi double [ %add, %for.body ]
+  br label %for.cond.cleanup, !dbg !10
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  %a.0.lcssa = phi double [ 0.000000e+00, %entry ], [ %add.lcssa, %for.cond.cleanup.loopexit ]
+  ret double %a.0.lcssa, !dbg !10
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
+  %a.08 = phi double [ %add, %for.body ], [ 0.000000e+00, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i32, i32* %v, i64 %indvars.iv, !dbg !9
+  %0 = load i32, i32* %arrayidx, align 4, !dbg !9, !tbaa !11
+  %cmp1 = icmp eq i32 %0, 0, !dbg !15
+  %cond = select i1 %cmp1, double 3.400000e+00, double 1.150000e+00, !dbg !9
+  %add = fadd double %a.08, %cond, !dbg !16
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1, !dbg !8
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32, !dbg !8
+  %exitcond = icmp eq i32 %lftr.wideiv, %n, !dbg !8
+  br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body, !dbg !8, !llvm.loop !17
+}
+
+; Function Attrs: nounwind readonly ssp uwtable
+define double @cond_sum_loop_hint(i32* nocapture readonly %v, i32 %n) #0 !dbg !20 {
+entry:
+  %cmp.7 = icmp sgt i32 %n, 0, !dbg !19
+  br i1 %cmp.7, label %for.body.preheader, label %for.cond.cleanup, !dbg !21
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body, !dbg !22
+
+for.cond.cleanup.loopexit:                        ; preds = %for.body
+  %add.lcssa = phi double [ %add, %for.body ]
+  br label %for.cond.cleanup, !dbg !23
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  %a.0.lcssa = phi double [ 0.000000e+00, %entry ], [ %add.lcssa, %for.cond.cleanup.loopexit ]
+  ret double %a.0.lcssa, !dbg !23
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
+  %a.08 = phi double [ %add, %for.body ], [ 0.000000e+00, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i32, i32* %v, i64 %indvars.iv, !dbg !22
+  %0 = load i32, i32* %arrayidx, align 4, !dbg !22, !tbaa !11
+  %cmp1 = icmp eq i32 %0, 0, !dbg !24
+  %cond = select i1 %cmp1, double 3.400000e+00, double 1.150000e+00, !dbg !22
+  %add = fadd double %a.08, %cond, !dbg !25
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1, !dbg !21
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32, !dbg !21
+  %exitcond = icmp eq i32 %lftr.wideiv, %n, !dbg !21
+  br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body, !dbg !21, !llvm.loop !26
+}
+
+attributes #0 = { nounwind }
+
+!llvm.dbg.cu = !{!28}
+!llvm.module.flags = !{!0, !1}
+!llvm.ident = !{!2}
+
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = !{i32 1, !"PIC Level", i32 2}
+!2 = !{!"clang version 3.7.0"}
+!3 = !DILocation(line: 5, column: 20, scope: !4)
+!4 = distinct !DISubprogram(name: "cond_sum", scope: !5, file: !5, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !28, retainedNodes: !7)
+!5 = !DIFile(filename: "no_fpmath.c", directory: "")
+!6 = !DISubroutineType(types: !7)
+!7 = !{}
+!8 = !DILocation(line: 5, column: 3, scope: !4)
+!9 = !DILocation(line: 6, column: 14, scope: !4)
+!10 = !DILocation(line: 9, column: 3, scope: !4)
+!11 = !{!12, !12, i64 0}
+!12 = !{!"int", !13, i64 0}
+!13 = !{!"omnipotent char", !14, i64 0}
+!14 = !{!"Simple C/C++ TBAA"}
+!15 = !DILocation(line: 6, column: 19, scope: !4)
+!16 = !DILocation(line: 6, column: 11, scope: !4)
+!17 = distinct !{!17, !18}
+!18 = !{!"llvm.loop.unroll.disable"}
+!19 = !DILocation(line: 16, column: 20, scope: !20)
+!20 = distinct !DISubprogram(name: "cond_sum_loop_hint", scope: !5, file: !5, line: 12, type: !6, isLocal: false, isDefinition: true, scopeLine: 12, flags: DIFlagPrototyped, isOptimized: true, unit: !28, retainedNodes: !7)
+!21 = !DILocation(line: 16, column: 3, scope: !20)
+!22 = !DILocation(line: 17, column: 14, scope: !20)
+!23 = !DILocation(line: 20, column: 3, scope: !20)
+!24 = !DILocation(line: 17, column: 19, scope: !20)
+!25 = !DILocation(line: 17, column: 11, scope: !20)
+!26 = distinct !{!26, !27, !18}
+!27 = !{!"llvm.loop.vectorize.enable", i1 true}
+!28 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "clang",
+                             file: !5,
+                             isOptimized: true, flags: "-O2",
+                             splitDebugFilename: "abc.debug", emissionKind: 2)
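To connect the checked remarks with the code they point at, here is a hedged reconstruction of no_fpmath.c from the IR and its debug locations (the original source is not included in the commit): the fadd reduction cannot be reordered without a fast-math or loop hint, so the plain loop at line 6 gets the missed-vectorization remark, while the hinted loop at line 17 (llvm.loop.vectorize.enable via !26/!27) is vectorized.

  double cond_sum(const int *v, int n) {
    double a = 0;
    for (int i = 0; i < n; i++)
      a += v[i] == 0 ? 3.4 : 1.15;       /* line 6: FP-reordering remark */
    return a;
  }

  double cond_sum_loop_hint(const int *v, int n) {
    double a = 0;
  #pragma clang loop vectorize(enable)   /* assumed source of !26/!27 */
    for (int i = 0; i < n; i++)
      a += v[i] == 0 ? 3.4 : 1.15;       /* line 17: "vectorized loop" remark */
    return a;
  }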

Added: llvm/trunk/test/Transforms/LoopVectorize/X86/no_fpmath_with_hotness.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/X86/no_fpmath_with_hotness.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/X86/no_fpmath_with_hotness.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/X86/no_fpmath_with_hotness.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,113 @@
+; RUN: opt < %s -loop-vectorize -mtriple=x86_64-unknown-linux -S -pass-remarks=loop-vectorize -pass-remarks-missed=loop-vectorize -pass-remarks-analysis=loop-vectorize -pass-remarks-with-hotness 2>&1 | FileCheck %s
+; RUN: opt < %s -passes=loop-vectorize -mtriple=x86_64-unknown-linux -S -pass-remarks=loop-vectorize -pass-remarks-missed=loop-vectorize -pass-remarks-analysis=loop-vectorize -pass-remarks-with-hotness 2>&1 | FileCheck %s
+
+; CHECK: remark: no_fpmath.c:6:11: loop not vectorized: cannot prove it is safe to reorder floating-point operations (hotness: 300)
+; CHECK: remark: no_fpmath.c:6:14: loop not vectorized
+; CHECK: remark: no_fpmath.c:17:14: vectorized loop (vectorization width: 2, interleaved count: 2) (hotness: 300)
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.10.0"
+
+; Function Attrs: nounwind readonly ssp uwtable
+define double @cond_sum(i32* nocapture readonly %v, i32 %n) #0 !dbg !4 !prof !29 {
+entry:
+  %cmp.7 = icmp sgt i32 %n, 0, !dbg !3
+  br i1 %cmp.7, label %for.body.preheader, label %for.cond.cleanup, !dbg !8, !prof !30
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body, !dbg !9
+
+for.cond.cleanup.loopexit:                        ; preds = %for.body
+  %add.lcssa = phi double [ %add, %for.body ]
+  br label %for.cond.cleanup, !dbg !10
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  %a.0.lcssa = phi double [ 0.000000e+00, %entry ], [ %add.lcssa, %for.cond.cleanup.loopexit ]
+  ret double %a.0.lcssa, !dbg !10
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
+  %a.08 = phi double [ %add, %for.body ], [ 0.000000e+00, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i32, i32* %v, i64 %indvars.iv, !dbg !9
+  %0 = load i32, i32* %arrayidx, align 4, !dbg !9, !tbaa !11
+  %cmp1 = icmp eq i32 %0, 0, !dbg !15
+  %cond = select i1 %cmp1, double 3.400000e+00, double 1.150000e+00, !dbg !9
+  %add = fadd double %a.08, %cond, !dbg !16
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1, !dbg !8
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32, !dbg !8
+  %exitcond = icmp eq i32 %lftr.wideiv, %n, !dbg !8
+  br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body, !dbg !8, !llvm.loop !17, !prof !31
+}
+
+; Function Attrs: nounwind readonly ssp uwtable
+define double @cond_sum_loop_hint(i32* nocapture readonly %v, i32 %n) #0 !dbg !20 !prof !29{
+entry:
+  %cmp.7 = icmp sgt i32 %n, 0, !dbg !19
+  br i1 %cmp.7, label %for.body.preheader, label %for.cond.cleanup, !dbg !21, !prof !30
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body, !dbg !22
+
+for.cond.cleanup.loopexit:                        ; preds = %for.body
+  %add.lcssa = phi double [ %add, %for.body ]
+  br label %for.cond.cleanup, !dbg !23
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  %a.0.lcssa = phi double [ 0.000000e+00, %entry ], [ %add.lcssa, %for.cond.cleanup.loopexit ]
+  ret double %a.0.lcssa, !dbg !23
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
+  %a.08 = phi double [ %add, %for.body ], [ 0.000000e+00, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i32, i32* %v, i64 %indvars.iv, !dbg !22
+  %0 = load i32, i32* %arrayidx, align 4, !dbg !22, !tbaa !11
+  %cmp1 = icmp eq i32 %0, 0, !dbg !24
+  %cond = select i1 %cmp1, double 3.400000e+00, double 1.150000e+00, !dbg !22
+  %add = fadd double %a.08, %cond, !dbg !25
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1, !dbg !21
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32, !dbg !21
+  %exitcond = icmp eq i32 %lftr.wideiv, %n, !dbg !21
+  br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body, !dbg !21, !llvm.loop !26, !prof !31
+}
+
+attributes #0 = { nounwind }
+
+!llvm.dbg.cu = !{!28}
+!llvm.module.flags = !{!0, !1}
+!llvm.ident = !{!2}
+
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = !{i32 1, !"PIC Level", i32 2}
+!2 = !{!"clang version 3.7.0"}
+!3 = !DILocation(line: 5, column: 20, scope: !4)
+!4 = distinct !DISubprogram(name: "cond_sum", scope: !5, file: !5, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !28, retainedNodes: !7)
+!5 = !DIFile(filename: "no_fpmath.c", directory: "")
+!6 = !DISubroutineType(types: !7)
+!7 = !{}
+!8 = !DILocation(line: 5, column: 3, scope: !4)
+!9 = !DILocation(line: 6, column: 14, scope: !4)
+!10 = !DILocation(line: 9, column: 3, scope: !4)
+!11 = !{!12, !12, i64 0}
+!12 = !{!"int", !13, i64 0}
+!13 = !{!"omnipotent char", !14, i64 0}
+!14 = !{!"Simple C/C++ TBAA"}
+!15 = !DILocation(line: 6, column: 19, scope: !4)
+!16 = !DILocation(line: 6, column: 11, scope: !4)
+!17 = distinct !{!17, !18}
+!18 = !{!"llvm.loop.unroll.disable"}
+!19 = !DILocation(line: 16, column: 20, scope: !20)
+!20 = distinct !DISubprogram(name: "cond_sum_loop_hint", scope: !5, file: !5, line: 12, type: !6, isLocal: false, isDefinition: true, scopeLine: 12, flags: DIFlagPrototyped, isOptimized: true, unit: !28, retainedNodes: !7)
+!21 = !DILocation(line: 16, column: 3, scope: !20)
+!22 = !DILocation(line: 17, column: 14, scope: !20)
+!23 = !DILocation(line: 20, column: 3, scope: !20)
+!24 = !DILocation(line: 17, column: 19, scope: !20)
+!25 = !DILocation(line: 17, column: 11, scope: !20)
+!26 = distinct !{!26, !27, !18}
+!27 = !{!"llvm.loop.vectorize.enable", i1 true}
+!28 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "clang",
+                             file: !5,
+                             isOptimized: true, flags: "-O2",
+                             splitDebugFilename: "abc.debug", emissionKind: 2)
+!29 = !{!"function_entry_count", i64 3}
+!30 = !{!"branch_weights", i32 99, i32 1}
+!31 = !{!"branch_weights", i32 1, i32 99}
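As a back-of-the-envelope reading of the profile metadata (not stated in the test itself): !29 gives a function entry count of 3, and the backedge weights in !31 (1 exit : 99 continue) make each entry run the loop body roughly 100 times, so 3 x ~100 is where the "(hotness: 300)" suffix on the checked remarks comes from.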

Added: llvm/trunk/test/Transforms/LoopVectorize/X86/optsize.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/X86/optsize.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/X86/optsize.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/X86/optsize.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,198 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; This test verifies that the loop vectorizer will NOT vectorize loops that
+; would produce a scalar tail loop under the optimize-for-size or minimize-size
+; attributes. This is a target-dependent version of the test.
+; RUN: opt < %s -loop-vectorize -force-vector-width=64 -S -mtriple=x86_64-unknown-linux -mcpu=skx | FileCheck %s
+; RUN: opt < %s -loop-vectorize -S -mtriple=x86_64-unknown-linux -mcpu=skx | FileCheck %s --check-prefix AUTOVF
+
+target datalayout = "E-m:e-p:32:32-i64:32-f64:32:64-a:0:32-n32-S128"
+
+ at tab = common global [32 x i8] zeroinitializer, align 1
+
+define i32 @foo_optsize() #0 {
+; CHECK-LABEL: @foo_optsize(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <64 x i32> undef, i32 [[INDEX]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <64 x i32> [[BROADCAST_SPLATINSERT]], <64 x i32> undef, <64 x i32> zeroinitializer
+; CHECK-NEXT:    [[INDUCTION:%.*]] = add <64 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp ule <64 x i32> [[INDUCTION]], <i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202>
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to <64 x i8>*
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <64 x i8> @llvm.masked.load.v64i8.p0v64i8(<64 x i8>* [[TMP4]], i32 1, <64 x i1> [[TMP2]], <64 x i8> undef)
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq <64 x i8> [[WIDE_MASKED_LOAD]], zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <64 x i1> [[TMP5]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = select <64 x i1> [[TMP5]], <64 x i8> <i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2>, <64 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i8* [[TMP3]] to <64 x i8>*
+; CHECK-NEXT:    call void @llvm.masked.store.v64i8.p0v64i8(<64 x i8> [[TMP7]], <64 x i8>* [[TMP8]], i32 1, <64 x i1> [[TMP2]])
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 64
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256
+; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
+; CHECK:       middle.block:
+; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 256, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[I_08:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[I_08]]
+; CHECK-NEXT:    [[TMP10:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq i8 [[TMP10]], 0
+; CHECK-NEXT:    [[DOT:%.*]] = select i1 [[CMP1]], i8 2, i8 1
+; CHECK-NEXT:    store i8 [[DOT]], i8* [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[INC]] = add nsw i32 [[I_08]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[I_08]], 202
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !2
+; CHECK:       for.end:
+; CHECK-NEXT:    ret i32 0
+;
+
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.08 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %arrayidx = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 %i.08
+  %0 = load i8, i8* %arrayidx, align 1
+  %cmp1 = icmp eq i8 %0, 0
+  %. = select i1 %cmp1, i8 2, i8 1
+  store i8 %., i8* %arrayidx, align 1
+  %inc = add nsw i32 %i.08, 1
+  %exitcond = icmp eq i32 %i.08, 202
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret i32 0
+}
+
+attributes #0 = { optsize }
+
+define i32 @foo_minsize() #1 {
+; CHECK-LABEL: @foo_minsize(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <64 x i32> undef, i32 [[INDEX]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <64 x i32> [[BROADCAST_SPLATINSERT]], <64 x i32> undef, <64 x i32> zeroinitializer
+; CHECK-NEXT:    [[INDUCTION:%.*]] = add <64 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp ule <64 x i32> [[INDUCTION]], <i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202>
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to <64 x i8>*
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <64 x i8> @llvm.masked.load.v64i8.p0v64i8(<64 x i8>* [[TMP4]], i32 1, <64 x i1> [[TMP2]], <64 x i8> undef)
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq <64 x i8> [[WIDE_MASKED_LOAD]], zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <64 x i1> [[TMP5]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = select <64 x i1> [[TMP5]], <64 x i8> <i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2>, <64 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i8* [[TMP3]] to <64 x i8>*
+; CHECK-NEXT:    call void @llvm.masked.store.v64i8.p0v64i8(<64 x i8> [[TMP7]], <64 x i8>* [[TMP8]], i32 1, <64 x i1> [[TMP2]])
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 64
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256
+; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !4
+; CHECK:       middle.block:
+; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 256, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[I_08:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[I_08]]
+; CHECK-NEXT:    [[TMP10:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq i8 [[TMP10]], 0
+; CHECK-NEXT:    [[DOT:%.*]] = select i1 [[CMP1]], i8 2, i8 1
+; CHECK-NEXT:    store i8 [[DOT]], i8* [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[INC]] = add nsw i32 [[I_08]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[I_08]], 202
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !5
+; CHECK:       for.end:
+; CHECK-NEXT:    ret i32 0
+;
+
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.08 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %arrayidx = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 %i.08
+  %0 = load i8, i8* %arrayidx, align 1
+  %cmp1 = icmp eq i8 %0, 0
+  %. = select i1 %cmp1, i8 2, i8 1
+  store i8 %., i8* %arrayidx, align 1
+  %inc = add nsw i32 %i.08, 1
+  %exitcond = icmp eq i32 %i.08, 202
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret i32 0
+}
+
+attributes #1 = { minsize }
+
+
+; We can't vectorize this one because we would have to version for stride==1,
+; even though the trip count (TC) is a multiple of VF.
+; CHECK-LABEL: @scev4stride1
+; CHECK-NOT: vector.scevcheck
+; CHECK-NOT: vector.body:
+; CHECK-LABEL: for.body:
+; AUTOVF-LABEL: @scev4stride1
+; AUTOVF-NOT: vector.scevcheck
+; AUTOVF-NOT: vector.body:
+; AUTOVF-LABEL: for.body:
+define void @scev4stride1(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32 %k) #2 {
+for.body.preheader:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+  %mul = mul nsw i32 %i.07, %k
+  %arrayidx = getelementptr inbounds i32, i32* %b, i32 %mul
+  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds i32, i32* %a, i32 %i.07
+  store i32 %0, i32* %arrayidx1, align 4
+  %inc = add nuw nsw i32 %i.07, 1
+  %exitcond = icmp eq i32 %inc, 256
+  br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:                                 ; preds = %for.body
+  ret void
+}
+
+attributes #2 = { optsize }
+
+
+; PR39497
+; We can't vectorize this one because we would have to version for an overflow
+; check, and the tiny trip count leads to opt-for-size (which could otherwise
+; have folded the tail by masking).
+; CHECK-LABEL: @main
+; CHECK-NOT: vector.scevcheck
+; CHECK-NOT: vector.body:
+; CHECK-LABEL: for.cond:
+; AUTOVF-LABEL: @main
+; AUTOVF-NOT: vector.scevcheck
+; AUTOVF-NOT: vector.body:
+; AUTOVF-LABEL: for.cond:
+define i32 @main() local_unnamed_addr {
+while.cond:
+  br label %for.cond
+
+for.cond:
+  %d.0 = phi i32 [ 0, %while.cond ], [ %add, %for.cond ]
+  %conv = and i32 %d.0, 65535
+  %cmp = icmp ult i32 %conv, 4
+  %add = add nuw nsw i32 %conv, 1
+  br i1 %cmp, label %for.cond, label %while.cond.loopexit
+
+while.cond.loopexit:
+  ret i32 0
+}
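Two hedged C sketches of the last functions in this file, reconstructed from the IR (the original sources, including the PR39497 reproducer, are not in the commit). For @scev4stride1, vectorizing the b[i * k] access needs the loop versioned on a runtime stride == 1 check, and under optsize that extra loop copy is not allowed, which is why neither vector.scevcheck nor vector.body is expected:

  void scev4stride1(int *a, const int *b, int k) {
    for (int i = 0; i < 256; i++)
      a[i] = b[i * k];   /* stride k is unknown at compile time */
  }

For @main, the counter is masked to 16 bits before the compare, so vectorizing would need a versioned overflow (SCEV) check, but the tiny trip count makes the vectorizer treat the loop as optimize-for-size, where that versioning is not allowed, so it stays scalar:

  int main(void) {
    unsigned d = 0;
    for (;;) {
      unsigned c = d & 0xFFFFu;   /* 16-bit wraparound forces the overflow check */
      if (c >= 4)
        break;
      d = c + 1;
    }
    return 0;
  }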

Added: llvm/trunk/test/Transforms/LoopVectorize/X86/outer_loop_test1_no_explicit_vect_width.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/X86/outer_loop_test1_no_explicit_vect_width.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/X86/outer_loop_test1_no_explicit_vect_width.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/X86/outer_loop_test1_no_explicit_vect_width.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,114 @@
+; RUN: opt -S -loop-vectorize -enable-vplan-native-path -mtriple x86_64  < %s | FileCheck %s
+; RUN: opt -S -loop-vectorize -enable-vplan-native-path -mtriple x86_64  -mattr=+avx < %s | FileCheck %s --check-prefix=AVX
+; RUN: opt -S -loop-vectorize -enable-vplan-native-path -mtriple x86_64  -mattr=+avx2 < %s | FileCheck %s --check-prefix=AVX
+
+; extern int arr[8][8];
+; extern int arr2[8];
+;
+; void foo(int n)
+; {
+;   int i1, i2;
+;
+; #pragma clang loop vectorize(enable)
+;   for (i1 = 0; i1 < 8; i1++) {
+;     arr2[i1] = i1;
+;     for (i2 = 0; i2 < 8; i2++)
+;       arr[i2][i1] = i1 + n;
+;   }
+; }
+;
+
+; CHECK-LABEL: vector.ph:
+; CHECK: %[[SplatVal:.*]] = insertelement <4 x i32> undef, i32 %n, i32 0
+; CHECK: %[[Splat:.*]] = shufflevector <4 x i32> %[[SplatVal]], <4 x i32> undef, <4 x i32> zeroinitializer
+
+; CHECK-LABEL: vector.body:
+; CHECK: %[[Ind:.*]] = phi i64 [ 0, %vector.ph ], [ %[[IndNext:.*]], %[[ForInc:.*]] ]
+; CHECK: %[[VecInd:.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %vector.ph ], [ %[[VecIndNext:.*]], %[[ForInc]] ]
+; CHECK: %[[AAddr:.*]] = getelementptr inbounds [8 x i32], [8 x i32]* @arr2, i64 0, <4 x i64> %[[VecInd]]
+; CHECK: %[[VecIndTr:.*]] = trunc <4 x i64> %[[VecInd]] to <4 x i32>
+; CHECK: call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %[[VecIndTr]], <4 x i32*> %[[AAddr]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+; CHECK: %[[VecIndTr2:.*]] = trunc <4 x i64> %[[VecInd]] to <4 x i32>
+; CHECK: %[[StoreVal:.*]] = add nsw <4 x i32> %[[VecIndTr2]], %[[Splat]]
+; CHECK: br label %[[InnerLoop:.+]]
+
+; CHECK: [[InnerLoop]]:
+; CHECK: %[[InnerPhi:.*]] = phi <4 x i64> [ %[[InnerPhiNext:.*]], %[[InnerLoop]] ], [ zeroinitializer, %vector.body ]
+; CHECK: %[[AAddr2:.*]] = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* @arr, i64 0, <4 x i64> %[[InnerPhi]], <4 x i64> %[[VecInd]]
+; CHECK: call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %[[StoreVal]], <4 x i32*> %[[AAddr2]], i32 4, <4 x i1> <i1 true, i1 true, i1 true
+; CHECK: %[[InnerPhiNext]] = add nuw nsw <4 x i64> %[[InnerPhi]], <i64 1, i64 1, i64 1, i64 1>
+; CHECK: %[[VecCond:.*]] = icmp eq <4 x i64> %[[InnerPhiNext]], <i64 8, i64 8, i64 8, i64 8>
+; CHECK: %[[InnerCond:.*]] = extractelement <4 x i1> %[[VecCond]], i32 0
+; CHECK: br i1 %[[InnerCond]], label %[[ForInc]], label %[[InnerLoop]]
+
+; CHECK: [[ForInc]]:
+; CHECK: %[[IndNext]] = add i64 %[[Ind]], 4
+; CHECK: %[[VecIndNext]] = add <4 x i64> %[[VecInd]], <i64 4, i64 4, i64 4, i64 4>
+; CHECK: %[[Cmp:.*]] = icmp eq i64 %[[IndNext]], 8
+; CHECK: br i1 %[[Cmp]], label %middle.block, label %vector.body
+
+; AVX-LABEL: vector.ph:
+; AVX: %[[SplatVal:.*]] = insertelement <8 x i32> undef, i32 %n, i32 0
+; AVX: %[[Splat:.*]] = shufflevector <8 x i32> %[[SplatVal]], <8 x i32> undef, <8 x i32> zeroinitializer
+
+; AVX-LABEL: vector.body:
+; AVX: %[[Ind:.*]] = phi i64 [ 0, %vector.ph ], [ %[[IndNext:.*]], %[[ForInc:.*]] ]
+; AVX: %[[VecInd:.*]] = phi <8 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, %vector.ph ], [ %[[VecIndNext:.*]], %[[ForInc]] ]
+; AVX: %[[AAddr:.*]] = getelementptr inbounds [8 x i32], [8 x i32]* @arr2, i64 0, <8 x i64> %[[VecInd]]
+; AVX: %[[VecIndTr:.*]] = trunc <8 x i64> %[[VecInd]] to <8 x i32>
+; AVX: call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> %[[VecIndTr]], <8 x i32*> %[[AAddr]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+; AVX: %[[VecIndTr2:.*]] = trunc <8 x i64> %[[VecInd]] to <8 x i32>
+; AVX: %[[StoreVal:.*]] = add nsw <8 x i32> %[[VecIndTr2]], %[[Splat]]
+; AVX: br label %[[InnerLoop:.+]]
+
+; AVX: [[InnerLoop]]:
+; AVX: %[[InnerPhi:.*]] = phi <8 x i64> [ %[[InnerPhiNext:.*]], %[[InnerLoop]] ], [ zeroinitializer, %vector.body ]
+; AVX: %[[AAddr2:.*]] = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* @arr, i64 0, <8 x i64> %[[InnerPhi]], <8 x i64> %[[VecInd]]
+; AVX: call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> %[[StoreVal]], <8 x i32*> %[[AAddr2]], i32 4, <8 x i1> <i1 true, i1 true, i1 true
+; AVX: %[[InnerPhiNext]] = add nuw nsw <8 x i64> %[[InnerPhi]], <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
+; AVX: %[[VecCond:.*]] = icmp eq <8 x i64> %[[InnerPhiNext]], <i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8>
+; AVX: %[[InnerCond:.*]] = extractelement <8 x i1> %[[VecCond]], i32 0
+; AVX: br i1 %[[InnerCond]], label %[[ForInc]], label %[[InnerLoop]]
+
+; AVX: [[ForInc]]:
+; AVX: %[[IndNext]] = add i64 %[[Ind]], 8
+; AVX: %[[VecIndNext]] = add <8 x i64> %[[VecInd]], <i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8>
+; AVX: %[[Cmp:.*]] = icmp eq i64 %[[IndNext]], 8
+; AVX: br i1 %[[Cmp]], label %middle.block, label %vector.body
+
+ at arr2 = external global [8 x i32], align 16
+ at arr = external global [8 x [8 x i32]], align 16
+
+; Function Attrs: norecurse nounwind uwtable
+define void @foo(i32 %n) {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.inc8, %entry
+  %indvars.iv21 = phi i64 [ 0, %entry ], [ %indvars.iv.next22, %for.inc8 ]
+  %arrayidx = getelementptr inbounds [8 x i32], [8 x i32]* @arr2, i64 0, i64 %indvars.iv21
+  %0 = trunc i64 %indvars.iv21 to i32
+  store i32 %0, i32* %arrayidx, align 4
+  %1 = trunc i64 %indvars.iv21 to i32
+  %add = add nsw i32 %1, %n
+  br label %for.body3
+
+for.body3:                                        ; preds = %for.body3, %for.body
+  %indvars.iv = phi i64 [ 0, %for.body ], [ %indvars.iv.next, %for.body3 ]
+  %arrayidx7 = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* @arr, i64 0, i64 %indvars.iv, i64 %indvars.iv21
+  store i32 %add, i32* %arrayidx7, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 8
+  br i1 %exitcond, label %for.inc8, label %for.body3
+
+for.inc8:                                         ; preds = %for.body3
+  %indvars.iv.next22 = add nuw nsw i64 %indvars.iv21, 1
+  %exitcond23 = icmp eq i64 %indvars.iv.next22, 8
+  br i1 %exitcond23, label %for.end10, label %for.body, !llvm.loop !1
+
+for.end10:                                        ; preds = %for.inc8
+  ret void
+}
+
+!1 = distinct !{!1, !2}
+!2 = !{!"llvm.loop.vectorize.enable", i1 true}

Added: llvm/trunk/test/Transforms/LoopVectorize/X86/parallel-loops-after-reg2mem.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/X86/parallel-loops-after-reg2mem.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/X86/parallel-loops-after-reg2mem.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/X86/parallel-loops-after-reg2mem.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,50 @@
+; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; The parallel loop has been invalidated by the new memory accesses introduced
+; by reg2mem (Loop::isParallel() starts to return false). Ensure the loop is
+; now non-vectorizable.
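+;
+; For illustration only: reg2mem demotes SSA phis to stack slots, so an
+; induction variable that used to be
+;
+;   %iv = phi i64 [ 0, %entry ], [ %iv.next, %latch ]
+;
+; becomes an alloca plus plain loads and stores around each use, and those new
+; memory accesses carry no llvm.access.group metadata.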
+
+;CHECK-NOT: <4 x i32>
+define void @parallel_loop(i32* nocapture %a, i32* nocapture %b) nounwind uwtable {
+entry:
+  %indvars.iv.next.reg2mem = alloca i64
+  %indvars.iv.reg2mem = alloca i64
+  %"reg2mem alloca point" = bitcast i32 0 to i32
+  store i64 0, i64* %indvars.iv.reg2mem
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.for.body_crit_edge, %entry
+  %indvars.iv.reload = load i64, i64* %indvars.iv.reg2mem
+  %arrayidx = getelementptr inbounds i32, i32* %b, i64 %indvars.iv.reload
+  %0 = load i32, i32* %arrayidx, align 4, !llvm.access.group !4
+  %arrayidx2 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv.reload
+  %1 = load i32, i32* %arrayidx2, align 4, !llvm.access.group !4
+  %idxprom3 = sext i32 %1 to i64
+  %arrayidx4 = getelementptr inbounds i32, i32* %a, i64 %idxprom3
+  store i32 %0, i32* %arrayidx4, align 4, !llvm.access.group !4
+  %indvars.iv.next = add i64 %indvars.iv.reload, 1
+  ; A new store without the parallel metadata here:
+  store i64 %indvars.iv.next, i64* %indvars.iv.next.reg2mem
+  %indvars.iv.next.reload1 = load i64, i64* %indvars.iv.next.reg2mem
+  %arrayidx6 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv.next.reload1
+  %2 = load i32, i32* %arrayidx6, align 4, !llvm.access.group !4
+  store i32 %2, i32* %arrayidx2, align 4, !llvm.access.group !4
+  %indvars.iv.next.reload = load i64, i64* %indvars.iv.next.reg2mem
+  %lftr.wideiv = trunc i64 %indvars.iv.next.reload to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 512
+  br i1 %exitcond, label %for.end, label %for.body.for.body_crit_edge, !llvm.loop !3
+
+for.body.for.body_crit_edge:                      ; preds = %for.body
+  %indvars.iv.next.reload2 = load i64, i64* %indvars.iv.next.reg2mem
+  store i64 %indvars.iv.next.reload2, i64* %indvars.iv.reg2mem
+  br label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+!3 = !{!3, !{!"llvm.loop.parallel_accesses", !4}}
+!4 = distinct !{}

Added: llvm/trunk/test/Transforms/LoopVectorize/X86/parallel-loops.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/X86/parallel-loops.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/X86/parallel-loops.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/X86/parallel-loops.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,115 @@
+; RUN: opt < %s  -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; A tricky loop:
+;
+; void loop(int *a, int *b) {
+;    for (int i = 0; i < 512; ++i) {
+;        a[a[i]] = b[i];
+;        a[i] = b[i+1];
+;    }
+;}
+
+;CHECK-LABEL: @loop(
+;CHECK-NOT: <4 x i32>
+define void @loop(i32* nocapture %a, i32* nocapture %b) nounwind uwtable {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i32, i32* %b, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+  %1 = load i32, i32* %arrayidx2, align 4
+  %idxprom3 = sext i32 %1 to i64
+  %arrayidx4 = getelementptr inbounds i32, i32* %a, i64 %idxprom3
+  store i32 %0, i32* %arrayidx4, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %arrayidx6 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv.next
+  %2 = load i32, i32* %arrayidx6, align 4
+  store i32 %2, i32* %arrayidx2, align 4
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 512
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+; The same loop with parallel loop metadata added to the loop branch
+; and the memory instructions.
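+;
+; A minimal sketch of how the pieces tie together (illustrative only; the real
+; metadata for this test is defined at the end of the file):
+;
+;   %v = load i32, i32* %p, align 4, !llvm.access.group !13    ; access in group !13
+;   br i1 %c, label %exit, label %for.body, !llvm.loop !3      ; loop id !3
+;   !3 = !{!3, !{!"llvm.loop.parallel_accesses", !13}}          ; !13 is parallel in !3
+;   !13 = distinct !{}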
+
+;CHECK-LABEL: @parallel_loop(
+;CHECK: <4 x i32>
+define void @parallel_loop(i32* nocapture %a, i32* nocapture %b) nounwind uwtable {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i32, i32* %b, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4, !llvm.access.group !13
+  %arrayidx2 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+  %1 = load i32, i32* %arrayidx2, align 4, !llvm.access.group !13
+  %idxprom3 = sext i32 %1 to i64
+  %arrayidx4 = getelementptr inbounds i32, i32* %a, i64 %idxprom3
+  ; This store might have originated from inlining a function with a parallel
+  ; loop. Its access group (!15) is listed as parallel both by this loop (!3)
+  ; and by the original loop (!4).
+  store i32 %0, i32* %arrayidx4, align 4, !llvm.access.group !15
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %arrayidx6 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv.next
+  %2 = load i32, i32* %arrayidx6, align 4, !llvm.access.group !13
+  store i32 %2, i32* %arrayidx2, align 4, !llvm.access.group !13
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 512
+  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !3
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+; The same loop with illegal parallel loop metadata: one of the memory
+; accesses refers to a different loop's identifier.
+
+;CHECK-LABEL: @mixed_metadata(
+;CHECK-NOT: <4 x i32>
+
+define void @mixed_metadata(i32* nocapture %a, i32* nocapture %b) nounwind uwtable {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i32, i32* %b, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4, !llvm.access.group !16
+  %arrayidx2 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+  %1 = load i32, i32* %arrayidx2, align 4, !llvm.access.group !16
+  %idxprom3 = sext i32 %1 to i64
+  %arrayidx4 = getelementptr inbounds i32, i32* %a, i64 %idxprom3
+  ; This access group (!17) belongs to the loop marked with !7, which is not
+  ; the loop we are currently in. That should prevent this loop from being
+  ; detected as parallel.
+  store i32 %0, i32* %arrayidx4, align 4, !llvm.access.group !17
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %arrayidx6 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv.next
+  %2 = load i32, i32* %arrayidx6, align 4, !llvm.access.group !16
+  store i32 %2, i32* %arrayidx2, align 4, !llvm.access.group !16
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 512
+  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !6
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+!3 = !{!3, !{!"llvm.loop.parallel_accesses", !13, !15}}
+!4 = !{!4, !{!"llvm.loop.parallel_accesses", !14, !15}}
+!6 = !{!6, !{!"llvm.loop.parallel_accesses", !16}}
+!7 = !{!7, !{!"llvm.loop.parallel_accesses", !17}}
+!13 = distinct !{}
+!14 = distinct !{}
+!15 = distinct !{}
+!16 = distinct !{}
+!17 = distinct !{}

Added: llvm/trunk/test/Transforms/LoopVectorize/X86/powof2div.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/X86/powof2div.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/X86/powof2div.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/X86/powof2div.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,32 @@
+; RUN: opt < %s  -loop-vectorize -mtriple=x86_64-unknown-linux-gnu -S | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+%struct.anon = type { [100 x i32], i32, [100 x i32] }
+
+ at Foo = common global %struct.anon zeroinitializer, align 4
+
+; CHECK-LABEL: @foo(
+; CHECK: load <4 x i32>, <4 x i32>*
+; CHECK: sdiv <4 x i32>
+; CHECK: store <4 x i32>
+
+define void @foo(){
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds %struct.anon, %struct.anon* @Foo, i64 0, i32 2, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %div = sdiv i32 %0, 2
+  %arrayidx2 = getelementptr inbounds %struct.anon, %struct.anon* @Foo, i64 0, i32 0, i64 %indvars.iv
+  store i32 %div, i32* %arrayidx2, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 100
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+

Added: llvm/trunk/test/Transforms/LoopVectorize/X86/pr23997.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/X86/pr23997.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/X86/pr23997.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/X86/pr23997.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,109 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -loop-vectorize -dce -instcombine < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128-ni:1"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Ensure that the 'inbounds' is preserved on the GEPs that feed the load and store in the loop.
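+; Roughly (a sketch reconstructed from the IR, argument names are invented):
+;
+;   void **dst = (void **)(p0 + 16);
+;   void **src = (void **)(p1 + 16);
+;   for (uint64_t i = 0; i < n; i++)   // at least one iteration
+;     dst[i] = src[i];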
+define void @foo(i8 addrspace(1)* align 8 dereferenceable_or_null(16), i8 addrspace(1)* align 8 dereferenceable_or_null(8), i64) #0 {
+; CHECK-LABEL: @foo(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[PREHEADER:%.*]]
+; CHECK:       preheader:
+; CHECK-NEXT:    [[DOT10:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[TMP0:%.*]], i64 16
+; CHECK-NEXT:    [[DOT11:%.*]] = bitcast i8 addrspace(1)* [[DOT10]] to i8 addrspace(1)* addrspace(1)*
+; CHECK-NEXT:    [[DOT12:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[TMP1:%.*]], i64 16
+; CHECK-NEXT:    [[DOT13:%.*]] = bitcast i8 addrspace(1)* [[DOT12]] to i8 addrspace(1)* addrspace(1)*
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ugt i64 [[TMP2:%.*]], 1
+; CHECK-NEXT:    [[UMAX:%.*]] = select i1 [[TMP3]], i64 [[TMP2]], i64 1
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[UMAX]], 16
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
+; CHECK:       vector.memcheck:
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp ugt i64 [[TMP2]], 1
+; CHECK-NEXT:    [[UMAX1:%.*]] = select i1 [[TMP4]], i64 [[TMP2]], i64 1
+; CHECK-NEXT:    [[TMP5:%.*]] = shl i64 [[UMAX1]], 3
+; CHECK-NEXT:    [[TMP6:%.*]] = add i64 [[TMP5]], 8
+; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i8, i8 addrspace(1)* [[TMP0]], i64 [[TMP6]]
+; CHECK-NEXT:    [[SCEVGEP2:%.*]] = getelementptr i8, i8 addrspace(1)* [[TMP1]], i64 [[TMP6]]
+; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ult i8 addrspace(1)* [[DOT10]], [[SCEVGEP2]]
+; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ult i8 addrspace(1)* [[DOT12]], [[SCEVGEP]]
+; CHECK-NEXT:    [[MEMCHECK_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; CHECK-NEXT:    br i1 [[MEMCHECK_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[UMAX]], -16
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* [[DOT13]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i8 addrspace(1)* addrspace(1)* [[TMP7]] to <4 x i8 addrspace(1)*> addrspace(1)*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8 addrspace(1)*>, <4 x i8 addrspace(1)*> addrspace(1)* [[TMP8]], align 8, !alias.scope !0
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* [[TMP7]], i64 4
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i8 addrspace(1)* addrspace(1)* [[TMP9]] to <4 x i8 addrspace(1)*> addrspace(1)*
+; CHECK-NEXT:    [[WIDE_LOAD6:%.*]] = load <4 x i8 addrspace(1)*>, <4 x i8 addrspace(1)*> addrspace(1)* [[TMP10]], align 8, !alias.scope !0
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* [[TMP7]], i64 8
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i8 addrspace(1)* addrspace(1)* [[TMP11]] to <4 x i8 addrspace(1)*> addrspace(1)*
+; CHECK-NEXT:    [[WIDE_LOAD7:%.*]] = load <4 x i8 addrspace(1)*>, <4 x i8 addrspace(1)*> addrspace(1)* [[TMP12]], align 8, !alias.scope !0
+; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* [[TMP7]], i64 12
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast i8 addrspace(1)* addrspace(1)* [[TMP13]] to <4 x i8 addrspace(1)*> addrspace(1)*
+; CHECK-NEXT:    [[WIDE_LOAD8:%.*]] = load <4 x i8 addrspace(1)*>, <4 x i8 addrspace(1)*> addrspace(1)* [[TMP14]], align 8, !alias.scope !0
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* [[DOT11]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast i8 addrspace(1)* addrspace(1)* [[TMP15]] to <4 x i8 addrspace(1)*> addrspace(1)*
+; CHECK-NEXT:    store <4 x i8 addrspace(1)*> [[WIDE_LOAD]], <4 x i8 addrspace(1)*> addrspace(1)* [[TMP16]], align 8, !alias.scope !3, !noalias !0
+; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* [[TMP15]], i64 4
+; CHECK-NEXT:    [[TMP18:%.*]] = bitcast i8 addrspace(1)* addrspace(1)* [[TMP17]] to <4 x i8 addrspace(1)*> addrspace(1)*
+; CHECK-NEXT:    store <4 x i8 addrspace(1)*> [[WIDE_LOAD6]], <4 x i8 addrspace(1)*> addrspace(1)* [[TMP18]], align 8, !alias.scope !3, !noalias !0
+; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* [[TMP15]], i64 8
+; CHECK-NEXT:    [[TMP20:%.*]] = bitcast i8 addrspace(1)* addrspace(1)* [[TMP19]] to <4 x i8 addrspace(1)*> addrspace(1)*
+; CHECK-NEXT:    store <4 x i8 addrspace(1)*> [[WIDE_LOAD7]], <4 x i8 addrspace(1)*> addrspace(1)* [[TMP20]], align 8, !alias.scope !3, !noalias !0
+; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* [[TMP15]], i64 12
+; CHECK-NEXT:    [[TMP22:%.*]] = bitcast i8 addrspace(1)* addrspace(1)* [[TMP21]] to <4 x i8 addrspace(1)*> addrspace(1)*
+; CHECK-NEXT:    store <4 x i8 addrspace(1)*> [[WIDE_LOAD8]], <4 x i8 addrspace(1)*> addrspace(1)* [[TMP22]], align 8, !alias.scope !3, !noalias !0
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 16
+; CHECK-NEXT:    [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !5
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[UMAX]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[PREHEADER]] ], [ 0, [[VECTOR_MEMCHECK]] ]
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[INDVARS_IV3:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT4:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[DOT18:%.*]] = getelementptr inbounds i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* [[DOT13]], i64 [[INDVARS_IV3]]
+; CHECK-NEXT:    [[V:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* [[DOT18]], align 8
+; CHECK-NEXT:    [[DOT20:%.*]] = getelementptr inbounds i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* [[DOT11]], i64 [[INDVARS_IV3]]
+; CHECK-NEXT:    store i8 addrspace(1)* [[V]], i8 addrspace(1)* addrspace(1)* [[DOT20]], align 8
+; CHECK-NEXT:    [[INDVARS_IV_NEXT4]] = add nuw nsw i64 [[INDVARS_IV3]], 1
+; CHECK-NEXT:    [[DOT21:%.*]] = icmp ult i64 [[INDVARS_IV_NEXT4]], [[TMP2]]
+; CHECK-NEXT:    br i1 [[DOT21]], label [[LOOP]], label [[LOOPEXIT]], !llvm.loop !7
+; CHECK:       loopexit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %preheader
+
+preheader:                                       ; preds = %4
+  %.10 = getelementptr inbounds i8, i8 addrspace(1)* %0, i64 16
+  %.11 = bitcast i8 addrspace(1)* %.10 to i8 addrspace(1)* addrspace(1)*
+  %.12 = getelementptr inbounds i8, i8 addrspace(1)* %1, i64 16
+  %.13 = bitcast i8 addrspace(1)* %.12 to i8 addrspace(1)* addrspace(1)*
+  br label %loop
+
+loop:
+  %indvars.iv3 = phi i64 [ 0, %preheader ], [ %indvars.iv.next4, %loop ]
+  %.18 = getelementptr inbounds i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* %.13, i64 %indvars.iv3
+  %v = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* %.18, align 8
+  %.20 = getelementptr inbounds i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* %.11, i64 %indvars.iv3
+  store i8 addrspace(1)* %v, i8 addrspace(1)* addrspace(1)* %.20, align 8
+  %indvars.iv.next4 = add nuw nsw i64 %indvars.iv3, 1
+  %.21 = icmp ult i64 %indvars.iv.next4, %2
+  br i1 %.21, label %loop, label %loopexit
+
+loopexit:
+  ret void
+}
+
+attributes #0 = { uwtable "target-cpu"="skylake" "target-features"="+sse2,+cx16,+sahf,-tbm,-avx512ifma,-sha,-gfni,-fma4,-vpclmulqdq,+prfchw,+bmi2,-cldemote,+fsgsbase,+xsavec,+popcnt,+aes,-avx512bitalg,+xsaves,-avx512er,-avx512vnni,-avx512vpopcntdq,-clwb,-avx512f,-clzero,-pku,+mmx,-lwp,-rdpid,-xop,+rdseed,-waitpkg,-sse4a,-avx512bw,+clflushopt,+xsave,-avx512vbmi2,-avx512vl,-avx512cd,+avx,-vaes,+rtm,+fma,+bmi,+rdrnd,-mwaitx,+sse4.1,+sse4.2,+avx2,-wbnoinvd,+sse,+lzcnt,+pclmul,-prefetchwt1,+f16c,+ssse3,+sgx,-shstk,+cmov,-avx512vbmi,+movbe,+xsaveopt,-avx512dq,+adx,-avx512pf,+sse3" }
+
+!0 = !{i32 0, i32 2147483646}
+!1 = !{}

Added: llvm/trunk/test/Transforms/LoopVectorize/X86/pr34438.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/X86/pr34438.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/X86/pr34438.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/X86/pr34438.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,36 @@
+; PR34438
+; The loop has a short trip count of 8 iterations. It should still be
+; vectorized because no runtime checks or tail loop are necessary.
+; Two cases are tested: AVX (MaxVF=8 = TripCount) and AVX512 (MaxVF=16 > TripCount).
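+; A rough C equivalent of the loop below (a sketch reconstructed from the IR,
+; not taken from the original source):
+;
+;   for (int i = 0; i < 8; i++)
+;     A[i] += B[i];     // fast float add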
+
+; RUN: opt < %s -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx -S | FileCheck %s
+; RUN: opt < %s -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=skylake-avx512 -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+define void @small_tc(float* noalias nocapture %A, float* noalias nocapture readonly %B) {
+; CHECK-LABEL: @small_tc
+; CHECK:    load <8 x float>, <8 x float>*
+; CHECK:    fadd fast <8 x float>
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %B, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4, !llvm.access.group !5
+  %arrayidx2 = getelementptr inbounds float, float* %A, i64 %indvars.iv
+  %1 = load float, float* %arrayidx2, align 4, !llvm.access.group !5
+  %add = fadd fast float %0, %1
+  store float %add, float* %arrayidx2, align 4, !llvm.access.group !5
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 8
+  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !4
+
+for.end:
+  ret void
+}
+
+!3 = !{!3, !{!"llvm.loop.parallel_accesses", !5}}
+!4 = !{!4}
+!5 = distinct !{}
