[llvm] r358552 - Revert "Temporarily Revert "Add basic loop fusion pass.""

Eric Christopher via llvm-commits llvm-commits at lists.llvm.org
Tue Apr 16 21:53:01 PDT 2019


Added: llvm/trunk/test/Transforms/SLPVectorizer/X86/operandorder.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/operandorder.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/operandorder.ll (added)
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/operandorder.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,474 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -basicaa -slp-vectorizer -slp-threshold=-100 -instcombine -dce -S -mtriple=i386-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32-S128"
+
+; Make sure we order the operands of commutative operations so that we get
+; bigger vectorizable trees.
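+;
+; As a rough C-level sketch of shuffle_operands1 (ours, reconstructed from
+; the IR rather than taken from any original source), the second add
+; arrives with its operands swapped and the vectorizer is expected to
+; commute it:
+;   to[0] = from[0] + v1;
+;   to[1] = v2 + from[1];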
+
+define void @shuffle_operands1(double * noalias %from, double * noalias %to, double %v1, double %v2) {
+; CHECK-LABEL: @shuffle_operands1(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast double* [[FROM:%.*]] to <2 x double>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x double> undef, double [[V1:%.*]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[V2:%.*]], i32 1
+; CHECK-NEXT:    [[TMP5:%.*]] = fadd <2 x double> [[TMP2]], [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast double* [[TO:%.*]] to <2 x double>*
+; CHECK-NEXT:    store <2 x double> [[TMP5]], <2 x double>* [[TMP6]], align 4
+; CHECK-NEXT:    ret void
+;
+  %from_1 = getelementptr double, double *%from, i64 1
+  %v0_1 = load double , double * %from
+  %v0_2 = load double , double * %from_1
+  %v1_1 = fadd double %v0_1, %v1
+  %v1_2 = fadd double %v2, %v0_2
+  %to_2 = getelementptr double, double * %to, i64 1
+  store double %v1_1, double *%to
+  store double %v1_2, double *%to_2
+  ret void
+}
+
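+; In the vecload_vs_broadcast* and shuffle_preserve_broadcast* tests that
+; follow, one lane of each fadd pair can be fed by a contiguous vector load
+; of from[0..1] while the other lane reuses a single scalar, so operand
+; reordering must choose between a vector load and a broadcast. A rough
+; scalar reading of the first test (ours, not from any original source):
+;   to[0] = from[0] + p;
+;   to[1] = from[0] + from[1];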
+define void @vecload_vs_broadcast(double * noalias %from, double * noalias %to, double %v1, double %v2) {
+; CHECK-LABEL: @vecload_vs_broadcast(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[LP:%.*]]
+; CHECK:       lp:
+; CHECK-NEXT:    [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast double* [[FROM:%.*]] to <2 x double>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> undef, double [[P]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP1]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP4:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast double* [[TO:%.*]] to <2 x double>*
+; CHECK-NEXT:    store <2 x double> [[TMP4]], <2 x double>* [[TMP5]], align 4
+; CHECK-NEXT:    br i1 undef, label [[LP]], label [[EXT:%.*]]
+; CHECK:       ext:
+; CHECK-NEXT:    ret void
+;
+entry:
+br label %lp
+
+lp:
+  %p = phi double [ 1.000000e+00, %lp ], [ 0.000000e+00, %entry ]
+  %from_1 = getelementptr double, double *%from, i64 1
+  %v0_1 = load double , double * %from
+  %v0_2 = load double , double * %from_1
+  %v1_1 = fadd double %v0_1, %p
+  %v1_2 = fadd double %v0_1, %v0_2
+  %to_2 = getelementptr double, double * %to, i64 1
+  store double %v1_1, double *%to
+  store double %v1_2, double *%to_2
+br i1 undef, label %lp, label %ext
+
+ext:
+  ret void
+}
+
+define void @vecload_vs_broadcast2(double * noalias %from, double * noalias %to, double %v1, double %v2) {
+; CHECK-LABEL: @vecload_vs_broadcast2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[LP:%.*]]
+; CHECK:       lp:
+; CHECK-NEXT:    [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast double* [[FROM:%.*]] to <2 x double>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> undef, double [[P]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP1]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP4:%.*]] = fadd <2 x double> [[TMP3]], [[TMP1]]
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast double* [[TO:%.*]] to <2 x double>*
+; CHECK-NEXT:    store <2 x double> [[TMP4]], <2 x double>* [[TMP5]], align 4
+; CHECK-NEXT:    br i1 undef, label [[LP]], label [[EXT:%.*]]
+; CHECK:       ext:
+; CHECK-NEXT:    ret void
+;
+entry:
+br label %lp
+
+lp:
+  %p = phi double [ 1.000000e+00, %lp ], [ 0.000000e+00, %entry ]
+  %from_1 = getelementptr double, double *%from, i64 1
+  %v0_1 = load double , double * %from
+  %v0_2 = load double , double * %from_1
+  %v1_1 = fadd double %p, %v0_1
+  %v1_2 = fadd double %v0_2, %v0_1
+  %to_2 = getelementptr double, double * %to, i64 1
+  store double %v1_1, double *%to
+  store double %v1_2, double *%to_2
+br i1 undef, label %lp, label %ext
+
+ext:
+  ret void
+}
+
+define void @vecload_vs_broadcast3(double * noalias %from, double * noalias %to, double %v1, double %v2) {
+; CHECK-LABEL: @vecload_vs_broadcast3(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[LP:%.*]]
+; CHECK:       lp:
+; CHECK-NEXT:    [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast double* [[FROM:%.*]] to <2 x double>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> undef, double [[P]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP1]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP4:%.*]] = fadd <2 x double> [[TMP3]], [[TMP1]]
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast double* [[TO:%.*]] to <2 x double>*
+; CHECK-NEXT:    store <2 x double> [[TMP4]], <2 x double>* [[TMP5]], align 4
+; CHECK-NEXT:    br i1 undef, label [[LP]], label [[EXT:%.*]]
+; CHECK:       ext:
+; CHECK-NEXT:    ret void
+;
+entry:
+br label %lp
+
+lp:
+  %p = phi double [ 1.000000e+00, %lp ], [ 0.000000e+00, %entry ]
+  %from_1 = getelementptr double, double *%from, i64 1
+  %v0_1 = load double , double * %from
+  %v0_2 = load double , double * %from_1
+  %v1_1 = fadd double %p, %v0_1
+  %v1_2 = fadd double %v0_1, %v0_2
+  %to_2 = getelementptr double, double * %to, i64 1
+  store double %v1_1, double *%to
+  store double %v1_2, double *%to_2
+br i1 undef, label %lp, label %ext
+
+ext:
+  ret void
+}
+
+define void @shuffle_preserve_broadcast4(double * noalias %from, double * noalias %to, double %v1, double %v2) {
+; CHECK-LABEL: @shuffle_preserve_broadcast4(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[LP:%.*]]
+; CHECK:       lp:
+; CHECK-NEXT:    [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[FROM_1:%.*]] = getelementptr double, double* [[FROM:%.*]], i32 1
+; CHECK-NEXT:    [[V0_1:%.*]] = load double, double* [[FROM]], align 4
+; CHECK-NEXT:    [[V0_2:%.*]] = load double, double* [[FROM_1]], align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x double> undef, double [[V0_2]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[P]], i32 1
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> undef, double [[V0_1]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast double* [[TO:%.*]] to <2 x double>*
+; CHECK-NEXT:    store <2 x double> [[TMP4]], <2 x double>* [[TMP5]], align 4
+; CHECK-NEXT:    br i1 undef, label [[LP]], label [[EXT:%.*]]
+; CHECK:       ext:
+; CHECK-NEXT:    ret void
+;
+entry:
+br label %lp
+
+lp:
+  %p = phi double [ 1.000000e+00, %lp ], [ 0.000000e+00, %entry ]
+  %from_1 = getelementptr double, double *%from, i64 1
+  %v0_1 = load double , double * %from
+  %v0_2 = load double , double * %from_1
+  %v1_1 = fadd double %v0_2, %v0_1
+  %v1_2 = fadd double %p, %v0_1
+  %to_2 = getelementptr double, double * %to, i64 1
+  store double %v1_1, double *%to
+  store double %v1_2, double *%to_2
+br i1 undef, label %lp, label %ext
+
+ext:
+  ret void
+}
+
+define void @vecload_vs_broadcast5(double * noalias %from, double * noalias %to, double %v1, double %v2) {
+; CHECK-LABEL: @vecload_vs_broadcast5(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[LP:%.*]]
+; CHECK:       lp:
+; CHECK-NEXT:    [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast double* [[FROM:%.*]] to <2 x double>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x double> undef, double [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[P]], i32 1
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT:    [[TMP6:%.*]] = fadd <2 x double> [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast double* [[TO:%.*]] to <2 x double>*
+; CHECK-NEXT:    store <2 x double> [[TMP6]], <2 x double>* [[TMP7]], align 4
+; CHECK-NEXT:    br i1 undef, label [[LP]], label [[EXT:%.*]]
+; CHECK:       ext:
+; CHECK-NEXT:    ret void
+;
+entry:
+br label %lp
+
+lp:
+  %p = phi double [ 1.000000e+00, %lp ], [ 0.000000e+00, %entry ]
+  %from_1 = getelementptr double, double *%from, i64 1
+  %v0_1 = load double , double * %from
+  %v0_2 = load double , double * %from_1
+  %v1_1 = fadd double %v0_1, %v0_2
+  %v1_2 = fadd double %p, %v0_1
+  %to_2 = getelementptr double, double * %to, i64 1
+  store double %v1_1, double *%to
+  store double %v1_2, double *%to_2
+br i1 undef, label %lp, label %ext
+
+ext:
+  ret void
+}
+
+
+define void @shuffle_preserve_broadcast6(double * noalias %from, double * noalias %to, double %v1, double %v2) {
+; CHECK-LABEL: @shuffle_preserve_broadcast6(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[LP:%.*]]
+; CHECK:       lp:
+; CHECK-NEXT:    [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[FROM_1:%.*]] = getelementptr double, double* [[FROM:%.*]], i32 1
+; CHECK-NEXT:    [[V0_1:%.*]] = load double, double* [[FROM]], align 4
+; CHECK-NEXT:    [[V0_2:%.*]] = load double, double* [[FROM_1]], align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x double> undef, double [[V0_1]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> undef, double [[V0_2]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[P]], i32 1
+; CHECK-NEXT:    [[TMP4:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast double* [[TO:%.*]] to <2 x double>*
+; CHECK-NEXT:    store <2 x double> [[TMP4]], <2 x double>* [[TMP5]], align 4
+; CHECK-NEXT:    br i1 undef, label [[LP]], label [[EXT:%.*]]
+; CHECK:       ext:
+; CHECK-NEXT:    ret void
+;
+entry:
+br label %lp
+
+lp:
+  %p = phi double [ 1.000000e+00, %lp ], [ 0.000000e+00, %entry ]
+  %from_1 = getelementptr double, double *%from, i64 1
+  %v0_1 = load double , double * %from
+  %v0_2 = load double , double * %from_1
+  %v1_1 = fadd double %v0_1, %v0_2
+  %v1_2 = fadd double %v0_1, %p
+  %to_2 = getelementptr double, double * %to, i64 1
+  store double %v1_1, double *%to
+  store double %v1_2, double *%to_2
+br i1 undef, label %lp, label %ext
+
+ext:
+  ret void
+}
+
+; Make sure that reordering operands does not scramble them and destroy
+; 'good' source order.
+
+@a = common global [32000 x float] zeroinitializer, align 16
+
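+; As a loose sketch (ours, reconstructed from the IR, not from any original
+; source), good_load_order below is roughly
+;   for (i = 0; i < 31995; i++)
+;     a[i] = a[i+1] * a[i];
+; unrolled by five, so the consecutive a[i+1..i+4] loads should stay in
+; source order and feed a single <4 x float> multiply.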
+define void @good_load_order() {
+; CHECK-LABEL: @good_load_order(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[FOR_COND1_PREHEADER:%.*]]
+; CHECK:       for.cond1.preheader:
+; CHECK-NEXT:    [[TMP0:%.*]] = load float, float* getelementptr inbounds ([32000 x float], [32000 x float]* @a, i32 0, i32 0), align 16
+; CHECK-NEXT:    br label [[FOR_BODY3:%.*]]
+; CHECK:       for.body3:
+; CHECK-NEXT:    [[TMP1:%.*]] = phi float [ [[TMP0]], [[FOR_COND1_PREHEADER]] ], [ [[TMP14:%.*]], [[FOR_BODY3]] ]
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_COND1_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY3]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = trunc i64 [[INDVARS_IV]] to i32
+; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[TMP2]], 1
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [32000 x float], [32000 x float]* @a, i32 0, i32 [[TMP3]]
+; CHECK-NEXT:    [[TMP4:%.*]] = trunc i64 [[INDVARS_IV]] to i32
+; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds [32000 x float], [32000 x float]* @a, i32 0, i32 [[TMP4]]
+; CHECK-NEXT:    [[TMP5:%.*]] = trunc i64 [[INDVARS_IV]] to i32
+; CHECK-NEXT:    [[TMP6:%.*]] = add i32 [[TMP5]], 4
+; CHECK-NEXT:    [[ARRAYIDX31:%.*]] = getelementptr inbounds [32000 x float], [32000 x float]* @a, i32 0, i32 [[TMP6]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast float* [[ARRAYIDX]] to <4 x float>*
+; CHECK-NEXT:    [[TMP8:%.*]] = load <4 x float>, <4 x float>* [[TMP7]], align 4
+; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <4 x float> [[TMP9]], <4 x float> [[TMP8]], <4 x i32> <i32 0, i32 4, i32 5, i32 6>
+; CHECK-NEXT:    [[TMP11:%.*]] = fmul <4 x float> [[TMP8]], [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast float* [[ARRAYIDX5]] to <4 x float>*
+; CHECK-NEXT:    store <4 x float> [[TMP11]], <4 x float>* [[TMP12]], align 4
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 5
+; CHECK-NEXT:    [[TMP13:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
+; CHECK-NEXT:    [[ARRAYIDX41:%.*]] = getelementptr inbounds [32000 x float], [32000 x float]* @a, i32 0, i32 [[TMP13]]
+; CHECK-NEXT:    [[TMP14]] = load float, float* [[ARRAYIDX41]], align 4
+; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <4 x float> [[TMP8]], i32 3
+; CHECK-NEXT:    [[MUL45:%.*]] = fmul float [[TMP14]], [[TMP15]]
+; CHECK-NEXT:    store float [[MUL45]], float* [[ARRAYIDX31]], align 4
+; CHECK-NEXT:    [[TMP16:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp slt i32 [[TMP16]], 31995
+; CHECK-NEXT:    br i1 [[CMP2]], label [[FOR_BODY3]], label [[FOR_END:%.*]]
+; CHECK:       for.end:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %for.cond1.preheader
+
+for.cond1.preheader:
+  %0 = load float, float* getelementptr inbounds ([32000 x float], [32000 x float]* @a, i64 0, i64 0), align 16
+  br label %for.body3
+
+for.body3:
+  %1 = phi float [ %0, %for.cond1.preheader ], [ %10, %for.body3 ]
+  %indvars.iv = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next, %for.body3 ]
+  %2 = add nsw i64 %indvars.iv, 1
+  %arrayidx = getelementptr inbounds [32000 x float], [32000 x float]* @a, i64 0, i64 %2
+  %3 = load float, float* %arrayidx, align 4
+  %arrayidx5 = getelementptr inbounds [32000 x float], [32000 x float]* @a, i64 0, i64 %indvars.iv
+  %mul6 = fmul float %3, %1
+  store float %mul6, float* %arrayidx5, align 4
+  %4 = add nsw i64 %indvars.iv, 2
+  %arrayidx11 = getelementptr inbounds [32000 x float], [32000 x float]* @a, i64 0, i64 %4
+  %5 = load float, float* %arrayidx11, align 4
+  %mul15 = fmul float %5, %3
+  store float %mul15, float* %arrayidx, align 4
+  %6 = add nsw i64 %indvars.iv, 3
+  %arrayidx21 = getelementptr inbounds [32000 x float], [32000 x float]* @a, i64 0, i64 %6
+  %7 = load float, float* %arrayidx21, align 4
+  %mul25 = fmul float %7, %5
+  store float %mul25, float* %arrayidx11, align 4
+  %8 = add nsw i64 %indvars.iv, 4
+  %arrayidx31 = getelementptr inbounds [32000 x float], [32000 x float]* @a, i64 0, i64 %8
+  %9 = load float, float* %arrayidx31, align 4
+  %mul35 = fmul float %9, %7
+  store float %mul35, float* %arrayidx21, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 5
+  %arrayidx41 = getelementptr inbounds [32000 x float], [32000 x float]* @a, i64 0, i64 %indvars.iv.next
+  %10 = load float, float* %arrayidx41, align 4
+  %mul45 = fmul float %10, %9
+  store float %mul45, float* %arrayidx31, align 4
+  %11 = trunc i64 %indvars.iv.next to i32
+  %cmp2 = icmp slt i32 %11, 31995
+  br i1 %cmp2, label %for.body3, label %for.end
+
+for.end:
+  ret void
+}
+
+; Check vectorization of the following code for the double data type:
+;  c[0] = a[0]+b[0];
+;  c[1] = b[1]+a[1]; // swapped b[1] and a[1]
+
+define void @load_reorder_double(double* nocapture %c, double* noalias nocapture readonly %a, double* noalias nocapture readonly %b){
+; CHECK-LABEL: @load_reorder_double(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast double* [[B:%.*]] to <2 x double>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast double* [[A:%.*]] to <2 x double>*
+; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x double>, <2 x double>* [[TMP3]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = fadd <2 x double> [[TMP4]], [[TMP2]]
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast double* [[C:%.*]] to <2 x double>*
+; CHECK-NEXT:    store <2 x double> [[TMP5]], <2 x double>* [[TMP6]], align 4
+; CHECK-NEXT:    ret void
+;
+  %1 = load double, double* %a
+  %2 = load double, double* %b
+  %3 = fadd double %1, %2
+  store double %3, double* %c
+  %4 = getelementptr inbounds double, double* %b, i64 1
+  %5 = load double, double* %4
+  %6 = getelementptr inbounds double, double* %a, i64 1
+  %7 = load double, double* %6
+  %8 = fadd double %5, %7
+  %9 = getelementptr inbounds double, double* %c, i64 1
+  store double %8, double* %9
+  ret void
+}
+
+; Check vectorization of the following code for the float data type:
+;  c[0] = a[0]+b[0];
+;  c[1] = b[1]+a[1]; // swapped b[1] and a[1]
+;  c[2] = a[2]+b[2];
+;  c[3] = a[3]+b[3];
+
+define void @load_reorder_float(float* nocapture %c, float* noalias nocapture readonly %a, float* noalias nocapture readonly %b){
+; CHECK-LABEL: @load_reorder_float(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[A:%.*]] to <4 x float>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float* [[B:%.*]] to <4 x float>*
+; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[TMP3]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = fadd <4 x float> [[TMP2]], [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast float* [[C:%.*]] to <4 x float>*
+; CHECK-NEXT:    store <4 x float> [[TMP5]], <4 x float>* [[TMP6]], align 4
+; CHECK-NEXT:    ret void
+;
+  %1 = load float, float* %a
+  %2 = load float, float* %b
+  %3 = fadd float %1, %2
+  store float %3, float* %c
+  %4 = getelementptr inbounds float, float* %b, i64 1
+  %5 = load float, float* %4
+  %6 = getelementptr inbounds float, float* %a, i64 1
+  %7 = load float, float* %6
+  %8 = fadd float %5, %7
+  %9 = getelementptr inbounds float, float* %c, i64 1
+  store float %8, float* %9
+  %10 = getelementptr inbounds float, float* %a, i64 2
+  %11 = load float, float* %10
+  %12 = getelementptr inbounds float, float* %b, i64 2
+  %13 = load float, float* %12
+  %14 = fadd float %11, %13
+  %15 = getelementptr inbounds float, float* %c, i64 2
+  store float %14, float* %15
+  %16 = getelementptr inbounds float, float* %a, i64 3
+  %17 = load float, float* %16
+  %18 = getelementptr inbounds float, float* %b, i64 3
+  %19 = load float, float* %18
+  %20 = fadd float %17, %19
+  %21 = getelementptr inbounds float, float* %c, i64 3
+  store float %20, float* %21
+  ret void
+}
+
+; Check that we properly reorder the code below so that it gets vectorized optimally:
+; a[0] = (b[0]+c[0])+d[0];
+; a[1] = d[1]+(b[1]+c[1]);
+; a[2] = (b[2]+c[2])+d[2];
+; a[3] = (b[3]+c[3])+d[3];
+
+define void @opcode_reorder(float* noalias nocapture %a, float* noalias nocapture readonly %b, float* noalias nocapture readonly %c,float* noalias nocapture readonly %d) {
+; CHECK-LABEL: @opcode_reorder(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[B:%.*]] to <4 x float>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float* [[C:%.*]] to <4 x float>*
+; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[TMP3]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = fadd <4 x float> [[TMP2]], [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast float* [[D:%.*]] to <4 x float>*
+; CHECK-NEXT:    [[TMP7:%.*]] = load <4 x float>, <4 x float>* [[TMP6]], align 4
+; CHECK-NEXT:    [[TMP8:%.*]] = fadd <4 x float> [[TMP5]], [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast float* [[A:%.*]] to <4 x float>*
+; CHECK-NEXT:    store <4 x float> [[TMP8]], <4 x float>* [[TMP9]], align 4
+; CHECK-NEXT:    ret void
+;
+  %1 = load float, float* %b
+  %2 = load float, float* %c
+  %3 = fadd float %1, %2
+  %4 = load float, float* %d
+  %5 = fadd float %3, %4
+  store float %5, float* %a
+  %6 = getelementptr inbounds float, float* %d, i64 1
+  %7 = load float, float* %6
+  %8 = getelementptr inbounds float, float* %b, i64 1
+  %9 = load float, float* %8
+  %10 = getelementptr inbounds float, float* %c, i64 1
+  %11 = load float, float* %10
+  %12 = fadd float %9, %11
+  %13 = fadd float %7, %12
+  %14 = getelementptr inbounds float, float* %a, i64 1
+  store float %13, float* %14
+  %15 = getelementptr inbounds float, float* %b, i64 2
+  %16 = load float, float* %15
+  %17 = getelementptr inbounds float, float* %c, i64 2
+  %18 = load float, float* %17
+  %19 = fadd float %16, %18
+  %20 = getelementptr inbounds float, float* %d, i64 2
+  %21 = load float, float* %20
+  %22 = fadd float %19, %21
+  %23 = getelementptr inbounds float, float* %a, i64 2
+  store float %22, float* %23
+  %24 = getelementptr inbounds float, float* %b, i64 3
+  %25 = load float, float* %24
+  %26 = getelementptr inbounds float, float* %c, i64 3
+  %27 = load float, float* %26
+  %28 = fadd float %25, %27
+  %29 = getelementptr inbounds float, float* %d, i64 3
+  %30 = load float, float* %29
+  %31 = fadd float %28, %30
+  %32 = getelementptr inbounds float, float* %a, i64 3
+  store float %31, float* %32
+  ret void
+}

Added: llvm/trunk/test/Transforms/SLPVectorizer/X86/opt.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/opt.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/opt.ll (added)
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/opt.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,50 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -O3 -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s --check-prefix=SLP
+; RUN: opt < %s -O3 -disable-slp-vectorization -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s --check-prefix=NOSLP
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+; Make sure we can disable SLP vectorization in opt.
+
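+; Paraphrasing the RUN lines above: under plain -O3 the two scalar fmuls
+; below collapse into a single <2 x double> fmul (SLP prefix), while adding
+; -disable-slp-vectorization keeps them as two scalar fmuls (NOSLP prefix).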
+define void @test1(double* %a, double* %b, double* %c) {
+; SLP-LABEL: @test1(
+; SLP-NEXT:  entry:
+; SLP-NEXT:    [[TMP0:%.*]] = bitcast double* [[A:%.*]] to <2 x double>*
+; SLP-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8
+; SLP-NEXT:    [[TMP2:%.*]] = bitcast double* [[B:%.*]] to <2 x double>*
+; SLP-NEXT:    [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 8
+; SLP-NEXT:    [[TMP4:%.*]] = fmul <2 x double> [[TMP1]], [[TMP3]]
+; SLP-NEXT:    [[TMP5:%.*]] = bitcast double* [[C:%.*]] to <2 x double>*
+; SLP-NEXT:    store <2 x double> [[TMP4]], <2 x double>* [[TMP5]], align 8
+; SLP-NEXT:    ret void
+;
+; NOSLP-LABEL: @test1(
+; NOSLP-NEXT:  entry:
+; NOSLP-NEXT:    [[I0:%.*]] = load double, double* [[A:%.*]], align 8
+; NOSLP-NEXT:    [[I1:%.*]] = load double, double* [[B:%.*]], align 8
+; NOSLP-NEXT:    [[MUL:%.*]] = fmul double [[I0]], [[I1]]
+; NOSLP-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[A]], i64 1
+; NOSLP-NEXT:    [[I3:%.*]] = load double, double* [[ARRAYIDX3]], align 8
+; NOSLP-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds double, double* [[B]], i64 1
+; NOSLP-NEXT:    [[I4:%.*]] = load double, double* [[ARRAYIDX4]], align 8
+; NOSLP-NEXT:    [[MUL5:%.*]] = fmul double [[I3]], [[I4]]
+; NOSLP-NEXT:    store double [[MUL]], double* [[C:%.*]], align 8
+; NOSLP-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[C]], i64 1
+; NOSLP-NEXT:    store double [[MUL5]], double* [[ARRAYIDX5]], align 8
+; NOSLP-NEXT:    ret void
+;
+entry:
+  %i0 = load double, double* %a, align 8
+  %i1 = load double, double* %b, align 8
+  %mul = fmul double %i0, %i1
+  %arrayidx3 = getelementptr inbounds double, double* %a, i64 1
+  %i3 = load double, double* %arrayidx3, align 8
+  %arrayidx4 = getelementptr inbounds double, double* %b, i64 1
+  %i4 = load double, double* %arrayidx4, align 8
+  %mul5 = fmul double %i3, %i4
+  store double %mul, double* %c, align 8
+  %arrayidx5 = getelementptr inbounds double, double* %c, i64 1
+  store double %mul5, double* %arrayidx5, align 8
+  ret void
+}

Added: llvm/trunk/test/Transforms/SLPVectorizer/X86/ordering.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/ordering.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/ordering.ll (added)
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/ordering.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,103 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+define void @updateModelQPFrame(i32 %m_Bits) {
+; CHECK-LABEL: @updateModelQPFrame(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %0 = load double, double* undef, align 8
+  %mul = fmul double undef, %0
+  %mul2 = fmul double undef, %mul
+  %mul4 = fmul double %0, %mul2
+  %mul5 = fmul double undef, 4.000000e+00
+  %mul7 = fmul double undef, %mul5
+  %conv = sitofp i32 %m_Bits to double
+  %mul8 = fmul double %conv, %mul7
+  %add = fadd double %mul4, %mul8
+  %cmp11 = fcmp olt double %add, 0.000000e+00
+  ret void
+}
+
+declare i8* @objc_msgSend(i8*, i8*, ...)
+declare i32 @personality_v0(...)
+
+define void @invoketest() personality i8* bitcast (i32 (...)* @personality_v0 to i8*) {
+; CHECK-LABEL: @invoketest(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 undef, label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+; CHECK:       cond.true:
+; CHECK-NEXT:    [[CALL49:%.*]] = invoke double bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to double (i8*, i8*)*)(i8* undef, i8* undef)
+; CHECK-NEXT:    to label [[COND_TRUE54:%.*]] unwind label [[LPAD:%.*]]
+; CHECK:       cond.false:
+; CHECK-NEXT:    [[CALL51:%.*]] = invoke double bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to double (i8*, i8*)*)(i8* undef, i8* undef)
+; CHECK-NEXT:    to label [[COND_FALSE57:%.*]] unwind label [[LPAD]]
+; CHECK:       cond.true54:
+; CHECK-NEXT:    [[CALL56:%.*]] = invoke double bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to double (i8*, i8*)*)(i8* undef, i8* undef)
+; CHECK-NEXT:    to label [[COND_END60:%.*]] unwind label [[LPAD]]
+; CHECK:       cond.false57:
+; CHECK-NEXT:    [[CALL59:%.*]] = invoke double bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to double (i8*, i8*)*)(i8* undef, i8* undef)
+; CHECK-NEXT:    to label [[COND_END60]] unwind label [[LPAD]]
+; CHECK:       cond.end60:
+; CHECK-NEXT:    br i1 undef, label [[IF_END98:%.*]], label [[IF_THEN63:%.*]]
+; CHECK:       if.then63:
+; CHECK-NEXT:    br label [[IF_END98]]
+; CHECK:       lpad:
+; CHECK-NEXT:    [[L:%.*]] = landingpad { i8*, i32 }
+; CHECK-NEXT:    cleanup
+; CHECK-NEXT:    resume { i8*, i32 } [[L]]
+; CHECK:       if.end98:
+; CHECK-NEXT:    br label [[IF_END99:%.*]]
+; CHECK:       if.end99:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br i1 undef, label %cond.true, label %cond.false
+
+cond.true:
+  %call49 = invoke double bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to double (i8*, i8*)*)(i8* undef, i8* undef)
+  to label %cond.true54 unwind label %lpad
+
+cond.false:
+  %call51 = invoke double bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to double (i8*, i8*)*)(i8* undef, i8* undef)
+  to label %cond.false57 unwind label %lpad
+
+cond.true54:
+  %call56 = invoke double bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to double (i8*, i8*)*)(i8* undef, i8* undef)
+  to label %cond.end60 unwind label %lpad
+
+cond.false57:
+  %call59 = invoke double bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to double (i8*, i8*)*)(i8* undef, i8* undef)
+  to label %cond.end60 unwind label %lpad
+
+cond.end60:
+  %cond126 = phi double [ %call49, %cond.true54 ], [ %call51, %cond.false57 ]
+  %cond61 = phi double [ %call56, %cond.true54 ], [ %call59, %cond.false57 ]
+  br i1 undef, label %if.end98, label %if.then63
+
+if.then63:
+  %conv69 = fptrunc double undef to float
+  %conv70 = fpext float %conv69 to double
+  %div71 = fdiv double %cond126, %conv70
+  %conv78 = fptrunc double undef to float
+  %conv79 = fpext float %conv78 to double
+  %div80 = fdiv double %cond61, %conv79
+  br label %if.end98
+
+lpad:
+  %l = landingpad { i8*, i32 }
+  cleanup
+  resume { i8*, i32 } %l
+
+if.end98:
+  %dimensionsResult.sroa.0.0 = phi double [ %div71, %if.then63 ], [ %cond126, %cond.end60 ]
+  %dimensionsResult.sroa.6.0 = phi double [ %div80, %if.then63 ], [ %cond61, %cond.end60 ]
+  br label %if.end99
+
+if.end99:
+  ret void
+}

Added: llvm/trunk/test/Transforms/SLPVectorizer/X86/partail.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/partail.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/partail.ll (added)
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/partail.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,80 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 < %s | FileCheck %s
+
+; Function Attrs: nounwind uwtable
+define void @get_block(i32 %y_pos) local_unnamed_addr #0 {
+; CHECK-LABEL: @get_block(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[LAND_LHS_TRUE:%.*]]
+; CHECK:       land.lhs.true:
+; CHECK-NEXT:    br i1 undef, label [[IF_THEN:%.*]], label [[IF_END:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    unreachable
+; CHECK:       if.end:
+; CHECK-NEXT:    [[SUB14:%.*]] = sub nsw i32 [[Y_POS:%.*]], undef
+; CHECK-NEXT:    [[SHR15:%.*]] = ashr i32 [[SUB14]], 2
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x i32> undef, i32 [[SHR15]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i32> [[TMP0]], i32 [[SUB14]], i32 1
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 1, i32 1>
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp sgt <4 x i32> [[SHUFFLE]], <i32 0, i32 -1, i32 -5, i32 -9>
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32> undef, i32 [[SHR15]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 undef, i32 1
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 undef, i32 2
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 undef, i32 3
+; CHECK-NEXT:    [[TMP7:%.*]] = select <4 x i1> [[TMP2]], <4 x i32> [[TMP6]], <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp slt <4 x i32> [[TMP7]], undef
+; CHECK-NEXT:    [[TMP9:%.*]] = select <4 x i1> [[TMP8]], <4 x i32> [[TMP7]], <4 x i32> undef
+; CHECK-NEXT:    [[TMP10:%.*]] = sext <4 x i32> [[TMP9]] to <4 x i64>
+; CHECK-NEXT:    [[TMP11:%.*]] = trunc <4 x i64> [[TMP10]] to <4 x i32>
+; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <4 x i32> [[TMP11]], i32 0
+; CHECK-NEXT:    [[TMP13:%.*]] = sext i32 [[TMP12]] to i64
+; CHECK-NEXT:    [[ARRAYIDX31:%.*]] = getelementptr inbounds i16*, i16** undef, i64 [[TMP13]]
+; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <4 x i32> [[TMP11]], i32 1
+; CHECK-NEXT:    [[TMP15:%.*]] = sext i32 [[TMP14]] to i64
+; CHECK-NEXT:    [[ARRAYIDX31_1:%.*]] = getelementptr inbounds i16*, i16** undef, i64 [[TMP15]]
+; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <4 x i32> [[TMP11]], i32 2
+; CHECK-NEXT:    [[TMP17:%.*]] = sext i32 [[TMP16]] to i64
+; CHECK-NEXT:    [[ARRAYIDX31_2:%.*]] = getelementptr inbounds i16*, i16** undef, i64 [[TMP17]]
+; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <4 x i32> [[TMP11]], i32 3
+; CHECK-NEXT:    [[TMP19:%.*]] = sext i32 [[TMP18]] to i64
+; CHECK-NEXT:    [[ARRAYIDX31_3:%.*]] = getelementptr inbounds i16*, i16** undef, i64 [[TMP19]]
+; CHECK-NEXT:    unreachable
+;
+entry:
+  br label %land.lhs.true
+
+land.lhs.true:                                    ; preds = %entry
+  br i1 undef, label %if.then, label %if.end
+
+if.then:                                          ; preds = %land.lhs.true
+  unreachable
+
+if.end:                                           ; preds = %land.lhs.true
+  %sub14 = sub nsw i32 %y_pos, undef
+  %shr15 = ashr i32 %sub14, 2
+  %cmp.i.i = icmp sgt i32 %shr15, 0
+  %cond.i.i = select i1 %cmp.i.i, i32 %shr15, i32 0
+  %cmp.i4.i = icmp slt i32 %cond.i.i, undef
+  %cond.i5.i = select i1 %cmp.i4.i, i32 %cond.i.i, i32 undef
+  %idxprom30 = sext i32 %cond.i5.i to i64
+  %arrayidx31 = getelementptr inbounds i16*, i16** undef, i64 %idxprom30
+  %cmp.i.i.1 = icmp sgt i32 %sub14, -1
+  %cond.i.i.1 = select i1 %cmp.i.i.1, i32 undef, i32 0
+  %cmp.i4.i.1 = icmp slt i32 %cond.i.i.1, undef
+  %cond.i5.i.1 = select i1 %cmp.i4.i.1, i32 %cond.i.i.1, i32 undef
+  %idxprom30.1 = sext i32 %cond.i5.i.1 to i64
+  %arrayidx31.1 = getelementptr inbounds i16*, i16** undef, i64 %idxprom30.1
+  %cmp.i.i.2 = icmp sgt i32 %sub14, -5
+  %cond.i.i.2 = select i1 %cmp.i.i.2, i32 undef, i32 0
+  %cmp.i4.i.2 = icmp slt i32 %cond.i.i.2, undef
+  %cond.i5.i.2 = select i1 %cmp.i4.i.2, i32 %cond.i.i.2, i32 undef
+  %idxprom30.2 = sext i32 %cond.i5.i.2 to i64
+  %arrayidx31.2 = getelementptr inbounds i16*, i16** undef, i64 %idxprom30.2
+  %cmp.i.i.3 = icmp sgt i32 %sub14, -9
+  %cond.i.i.3 = select i1 %cmp.i.i.3, i32 undef, i32 0
+  %cmp.i4.i.3 = icmp slt i32 %cond.i.i.3, undef
+  %cond.i5.i.3 = select i1 %cmp.i4.i.3, i32 %cond.i.i.3, i32 undef
+  %idxprom30.3 = sext i32 %cond.i5.i.3 to i64
+  %arrayidx31.3 = getelementptr inbounds i16*, i16** undef, i64 %idxprom30.3
+  unreachable
+}

Added: llvm/trunk/test/Transforms/SLPVectorizer/X86/phi.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/phi.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/phi.ll (added)
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/phi.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,341 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -basicaa -slp-vectorizer -slp-threshold=-100 -dce -S -mtriple=i386-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32-S128"
+target triple = "i386-apple-macosx10.9.0"
+
+;int foo(double *A, int k) {
+;  double A0;
+;  double A1;
+;  if (k) {
+;    A0 = 3;
+;    A1 = 5;
+;  } else {
+;    A0 = A[10];
+;    A1 = A[11];
+;  }
+;  A[0] = A0;
+;  A[1] = A1;
+;}
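+;
+; (The C sketch above falls off the end without returning a value, which is
+; presumably why the IR below ends in "ret i32 undef".)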
+
+
+define i32 @foo(double* nocapture %A, i32 %k) {
+; CHECK-LABEL: @foo(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TOBOOL:%.*]] = icmp eq i32 [[K:%.*]], 0
+; CHECK-NEXT:    br i1 [[TOBOOL]], label [[IF_ELSE:%.*]], label [[IF_END:%.*]]
+; CHECK:       if.else:
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 10
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast double* [[ARRAYIDX]] to <2 x double>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8
+; CHECK-NEXT:    br label [[IF_END]]
+; CHECK:       if.end:
+; CHECK-NEXT:    [[TMP2:%.*]] = phi <2 x double> [ [[TMP1]], [[IF_ELSE]] ], [ <double 3.000000e+00, double 5.000000e+00>, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast double* [[A]] to <2 x double>*
+; CHECK-NEXT:    store <2 x double> [[TMP2]], <2 x double>* [[TMP3]], align 8
+; CHECK-NEXT:    ret i32 undef
+;
+entry:
+  %tobool = icmp eq i32 %k, 0
+  br i1 %tobool, label %if.else, label %if.end
+
+if.else:                                          ; preds = %entry
+  %arrayidx = getelementptr inbounds double, double* %A, i64 10
+  %0 = load double, double* %arrayidx, align 8
+  %arrayidx1 = getelementptr inbounds double, double* %A, i64 11
+  %1 = load double, double* %arrayidx1, align 8
+  br label %if.end
+
+if.end:                                           ; preds = %entry, %if.else
+  %A0.0 = phi double [ %0, %if.else ], [ 3.000000e+00, %entry ]
+  %A1.0 = phi double [ %1, %if.else ], [ 5.000000e+00, %entry ]
+  store double %A0.0, double* %A, align 8
+  %arrayidx3 = getelementptr inbounds double, double* %A, i64 1
+  store double %A1.0, double* %arrayidx3, align 8
+  ret i32 undef
+}
+
+
+;int foo(double * restrict B,  double * restrict A, int n, int m) {
+;  double R=A[1];
+;  double G=A[0];
+;  for (int i=0; i < 100; i++) {
+;    R += 10;
+;    G += 10;
+;    R *= 4;
+;    G *= 4;
+;    R += 4;
+;    G += 4;
+;  }
+;  B[0] = G;
+;  B[1] = R;
+;  return 0;
+;}
+
+define i32 @foo2(double* noalias nocapture %B, double* noalias nocapture %A, i32 %n, i32 %m) #0 {
+; CHECK-LABEL: @foo2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast double* [[A:%.*]] to <2 x double>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[I_019:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = phi <2 x double> [ [[TMP1]], [[ENTRY]] ], [ [[TMP5:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], <double 1.000000e+01, double 1.000000e+01>
+; CHECK-NEXT:    [[TMP4:%.*]] = fmul <2 x double> [[TMP3]], <double 4.000000e+00, double 4.000000e+00>
+; CHECK-NEXT:    [[TMP5]] = fadd <2 x double> [[TMP4]], <double 4.000000e+00, double 4.000000e+00>
+; CHECK-NEXT:    [[INC]] = add nsw i32 [[I_019]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 100
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]]
+; CHECK:       for.end:
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast double* [[B:%.*]] to <2 x double>*
+; CHECK-NEXT:    store <2 x double> [[TMP5]], <2 x double>* [[TMP6]], align 8
+; CHECK-NEXT:    ret i32 0
+;
+entry:
+  %arrayidx = getelementptr inbounds double, double* %A, i64 1
+  %0 = load double, double* %arrayidx, align 8
+  %1 = load double, double* %A, align 8
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.019 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %G.018 = phi double [ %1, %entry ], [ %add5, %for.body ]
+  %R.017 = phi double [ %0, %entry ], [ %add4, %for.body ]
+  %add = fadd double %R.017, 1.000000e+01
+  %add2 = fadd double %G.018, 1.000000e+01
+  %mul = fmul double %add, 4.000000e+00
+  %mul3 = fmul double %add2, 4.000000e+00
+  %add4 = fadd double %mul, 4.000000e+00
+  %add5 = fadd double %mul3, 4.000000e+00
+  %inc = add nsw i32 %i.019, 1
+  %exitcond = icmp eq i32 %inc, 100
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  store double %add5, double* %B, align 8
+  %arrayidx7 = getelementptr inbounds double, double* %B, i64 1
+  store double %add4, double* %arrayidx7, align 8
+  ret i32 0
+}
+
+; float foo3(float *A) {
+;
+;   float R = A[0];
+;   float G = A[1];
+;   float B = A[2];
+;   float Y = A[3];
+;   float P = A[4];
+;   for (int i=0; i < 121; i+=3) {
+;     R+=A[i+0]*7;
+;     G+=A[i+1]*8;
+;     B+=A[i+2]*9;
+;     Y+=A[i+3]*10;
+;     P+=A[i+4]*11;
+;   }
+;
+;   return R+G+B+Y+P;
+; }
+
+define float @foo3(float* nocapture readonly %A) #0 {
+; CHECK-LABEL: @foo3(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load float, float* [[A:%.*]], align 4
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds float, float* [[A]], i64 1
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[ARRAYIDX1]] to <4 x float>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
+; CHECK-NEXT:    [[REORDER_SHUFFLE:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[REORDER_SHUFFLE]], i32 3
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[R_052:%.*]] = phi float [ [[TMP0]], [[ENTRY]] ], [ [[ADD6:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[TMP4:%.*]] = phi float [ [[TMP3]], [[ENTRY]] ], [ [[TMP11:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[TMP5:%.*]] = phi float [ [[TMP0]], [[ENTRY]] ], [ [[TMP13:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[TMP6:%.*]] = phi <4 x float> [ [[REORDER_SHUFFLE]], [[ENTRY]] ], [ [[TMP18:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[MUL:%.*]] = fmul float [[TMP5]], 7.000000e+00
+; CHECK-NEXT:    [[ADD6]] = fadd float [[R_052]], [[MUL]]
+; CHECK-NEXT:    [[TMP7:%.*]] = add nsw i64 [[INDVARS_IV]], 2
+; CHECK-NEXT:    [[ARRAYIDX14:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP7]]
+; CHECK-NEXT:    [[TMP8:%.*]] = load float, float* [[ARRAYIDX14]], align 4
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 3
+; CHECK-NEXT:    [[ARRAYIDX19:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV_NEXT]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast float* [[ARRAYIDX19]] to <2 x float>*
+; CHECK-NEXT:    [[TMP10:%.*]] = load <2 x float>, <2 x float>* [[TMP9]], align 4
+; CHECK-NEXT:    [[REORDER_SHUFFLE1:%.*]] = shufflevector <2 x float> [[TMP10]], <2 x float> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT:    [[TMP11]] = extractelement <2 x float> [[REORDER_SHUFFLE1]], i32 0
+; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <4 x float> undef, float [[TMP11]], i32 0
+; CHECK-NEXT:    [[TMP13]] = extractelement <2 x float> [[REORDER_SHUFFLE1]], i32 1
+; CHECK-NEXT:    [[TMP14:%.*]] = insertelement <4 x float> [[TMP12]], float [[TMP13]], i32 1
+; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <4 x float> [[TMP14]], float [[TMP8]], i32 2
+; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <4 x float> [[TMP15]], float [[TMP4]], i32 3
+; CHECK-NEXT:    [[TMP17:%.*]] = fmul <4 x float> [[TMP16]], <float 1.100000e+01, float 1.000000e+01, float 9.000000e+00, float 8.000000e+00>
+; CHECK-NEXT:    [[TMP18]] = fadd <4 x float> [[TMP6]], [[TMP17]]
+; CHECK-NEXT:    [[TMP19:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[TMP19]], 121
+; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END:%.*]]
+; CHECK:       for.end:
+; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <4 x float> [[TMP18]], i32 3
+; CHECK-NEXT:    [[ADD28:%.*]] = fadd float [[ADD6]], [[TMP20]]
+; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <4 x float> [[TMP18]], i32 2
+; CHECK-NEXT:    [[ADD29:%.*]] = fadd float [[ADD28]], [[TMP21]]
+; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <4 x float> [[TMP18]], i32 1
+; CHECK-NEXT:    [[ADD30:%.*]] = fadd float [[ADD29]], [[TMP22]]
+; CHECK-NEXT:    [[TMP23:%.*]] = extractelement <4 x float> [[TMP18]], i32 0
+; CHECK-NEXT:    [[ADD31:%.*]] = fadd float [[ADD30]], [[TMP23]]
+; CHECK-NEXT:    ret float [[ADD31]]
+;
+entry:
+  %0 = load float, float* %A, align 4
+  %arrayidx1 = getelementptr inbounds float, float* %A, i64 1
+  %1 = load float, float* %arrayidx1, align 4
+  %arrayidx2 = getelementptr inbounds float, float* %A, i64 2
+  %2 = load float, float* %arrayidx2, align 4
+  %arrayidx3 = getelementptr inbounds float, float* %A, i64 3
+  %3 = load float, float* %arrayidx3, align 4
+  %arrayidx4 = getelementptr inbounds float, float* %A, i64 4
+  %4 = load float, float* %arrayidx4, align 4
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %P.056 = phi float [ %4, %entry ], [ %add26, %for.body ]
+  %Y.055 = phi float [ %3, %entry ], [ %add21, %for.body ]
+  %B.054 = phi float [ %2, %entry ], [ %add16, %for.body ]
+  %G.053 = phi float [ %1, %entry ], [ %add11, %for.body ]
+  %R.052 = phi float [ %0, %entry ], [ %add6, %for.body ]
+  %5 = phi float [ %1, %entry ], [ %11, %for.body ]
+  %6 = phi float [ %0, %entry ], [ %9, %for.body ]
+  %mul = fmul float %6, 7.000000e+00
+  %add6 = fadd float %R.052, %mul
+  %mul10 = fmul float %5, 8.000000e+00
+  %add11 = fadd float %G.053, %mul10
+  %7 = add nsw i64 %indvars.iv, 2
+  %arrayidx14 = getelementptr inbounds float, float* %A, i64 %7
+  %8 = load float, float* %arrayidx14, align 4
+  %mul15 = fmul float %8, 9.000000e+00
+  %add16 = fadd float %B.054, %mul15
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 3
+  %arrayidx19 = getelementptr inbounds float, float* %A, i64 %indvars.iv.next
+  %9 = load float, float* %arrayidx19, align 4
+  %mul20 = fmul float %9, 1.000000e+01
+  %add21 = fadd float %Y.055, %mul20
+  %10 = add nsw i64 %indvars.iv, 4
+  %arrayidx24 = getelementptr inbounds float, float* %A, i64 %10
+  %11 = load float, float* %arrayidx24, align 4
+  %mul25 = fmul float %11, 1.100000e+01
+  %add26 = fadd float %P.056, %mul25
+  %12 = trunc i64 %indvars.iv.next to i32
+  %cmp = icmp slt i32 %12, 121
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  %add28 = fadd float %add6, %add11
+  %add29 = fadd float %add28, %add16
+  %add30 = fadd float %add29, %add21
+  %add31 = fadd float %add30, %add26
+  ret float %add31
+}
+
+; Make sure the order of phi nodes of different types does not prevent
+; vectorization of phi nodes of the same type.
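+; The i64 induction-variable phi is deliberately interleaved with the float
+; phis; sorting candidates by type should still let the four float phis
+; become a single <4 x float> phi (our reading of the CHECK lines below).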
+define float @sort_phi_type(float* nocapture readonly %A) {
+; CHECK-LABEL: @sort_phi_type(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = phi <4 x float> [ <float 1.000000e+01, float 1.000000e+01, float 1.000000e+01, float 1.000000e+01>, [[ENTRY]] ], [ [[TMP9:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x float> [[TMP2]], float [[TMP3]], i32 1
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x float> [[TMP4]], float [[TMP5]], i32 2
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <4 x float> [[TMP6]], float [[TMP7]], i32 3
+; CHECK-NEXT:    [[TMP9]] = fmul <4 x float> [[TMP8]], <float 8.000000e+00, float 9.000000e+00, float 1.000000e+02, float 1.110000e+02>
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], 4
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i64 [[INDVARS_IV_NEXT]], 128
+; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END:%.*]]
+; CHECK:       for.end:
+; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <4 x float> [[TMP9]], i32 0
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <4 x float> [[TMP9]], i32 1
+; CHECK-NEXT:    [[ADD29:%.*]] = fadd float [[TMP10]], [[TMP11]]
+; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <4 x float> [[TMP9]], i32 2
+; CHECK-NEXT:    [[ADD30:%.*]] = fadd float [[ADD29]], [[TMP12]]
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <4 x float> [[TMP9]], i32 3
+; CHECK-NEXT:    [[ADD31:%.*]] = fadd float [[ADD30]], [[TMP13]]
+; CHECK-NEXT:    ret float [[ADD31]]
+;
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %Y = phi float [ 1.000000e+01, %entry ], [ %mul10, %for.body ]
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %B = phi float [ 1.000000e+01, %entry ], [ %mul15, %for.body ]
+  %G = phi float [ 1.000000e+01, %entry ], [ %mul20, %for.body ]
+  %R = phi float [ 1.000000e+01, %entry ], [ %mul25, %for.body ]
+  %mul10 = fmul float %Y, 8.000000e+00
+  %mul15 = fmul float %B, 9.000000e+00
+  %mul20 = fmul float %R, 10.000000e+01
+  %mul25 = fmul float %G, 11.100000e+01
+  %indvars.iv.next = add nsw i64 %indvars.iv, 4
+  %cmp = icmp slt i64 %indvars.iv.next, 128
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  %add28 = fadd float 1.000000e+01, %mul10
+  %add29 = fadd float %mul10, %mul15
+  %add30 = fadd float %add29, %mul20
+  %add31 = fadd float %add30, %mul25
+  ret float %add31
+}
+
+define void @test(x86_fp80* %i1, x86_fp80* %i2, x86_fp80* %o) {
+; CHECK-LABEL: @test(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[I1_0:%.*]] = load x86_fp80, x86_fp80* [[I1:%.*]], align 16
+; CHECK-NEXT:    [[I1_GEP1:%.*]] = getelementptr x86_fp80, x86_fp80* [[I1]], i64 1
+; CHECK-NEXT:    [[I1_1:%.*]] = load x86_fp80, x86_fp80* [[I1_GEP1]], align 16
+; CHECK-NEXT:    br i1 undef, label [[THEN:%.*]], label [[END:%.*]]
+; CHECK:       then:
+; CHECK-NEXT:    [[I2_GEP0:%.*]] = getelementptr inbounds x86_fp80, x86_fp80* [[I2:%.*]], i64 0
+; CHECK-NEXT:    [[I2_0:%.*]] = load x86_fp80, x86_fp80* [[I2_GEP0]], align 16
+; CHECK-NEXT:    [[I2_GEP1:%.*]] = getelementptr inbounds x86_fp80, x86_fp80* [[I2]], i64 1
+; CHECK-NEXT:    [[I2_1:%.*]] = load x86_fp80, x86_fp80* [[I2_GEP1]], align 16
+; CHECK-NEXT:    br label [[END]]
+; CHECK:       end:
+; CHECK-NEXT:    [[PHI0:%.*]] = phi x86_fp80 [ [[I1_0]], [[ENTRY:%.*]] ], [ [[I2_0]], [[THEN]] ]
+; CHECK-NEXT:    [[PHI1:%.*]] = phi x86_fp80 [ [[I1_1]], [[ENTRY]] ], [ [[I2_1]], [[THEN]] ]
+; CHECK-NEXT:    store x86_fp80 [[PHI0]], x86_fp80* [[O:%.*]], align 16
+; CHECK-NEXT:    [[O_GEP1:%.*]] = getelementptr inbounds x86_fp80, x86_fp80* [[O]], i64 1
+; CHECK-NEXT:    store x86_fp80 [[PHI1]], x86_fp80* [[O_GEP1]], align 16
+; CHECK-NEXT:    ret void
+;
+; Test that we correctly recognize the discontiguous memory in arrays where
+; the size is less than the alignment, and through various GEP formations.
+; We disable the vectorization of x86_fp80 for now.
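+;
+; A quick size check (ours): with f80:128:128 in the datalayout, each
+; x86_fp80 occupies a 16-byte slot but carries only 10 bytes of data, so
+; i1[0] and i1[1] are 16 bytes apart and cannot be loaded as one contiguous
+; <2 x x86_fp80> vector.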
+
+entry:
+  %i1.0 = load x86_fp80, x86_fp80* %i1, align 16
+  %i1.gep1 = getelementptr x86_fp80, x86_fp80* %i1, i64 1
+  %i1.1 = load x86_fp80, x86_fp80* %i1.gep1, align 16
+  br i1 undef, label %then, label %end
+
+then:
+  %i2.gep0 = getelementptr inbounds x86_fp80, x86_fp80* %i2, i64 0
+  %i2.0 = load x86_fp80, x86_fp80* %i2.gep0, align 16
+  %i2.gep1 = getelementptr inbounds x86_fp80, x86_fp80* %i2, i64 1
+  %i2.1 = load x86_fp80, x86_fp80* %i2.gep1, align 16
+  br label %end
+
+end:
+  %phi0 = phi x86_fp80 [ %i1.0, %entry ], [ %i2.0, %then ]
+  %phi1 = phi x86_fp80 [ %i1.1, %entry ], [ %i2.1, %then ]
+  store x86_fp80 %phi0, x86_fp80* %o, align 16
+  %o.gep1 = getelementptr inbounds x86_fp80, x86_fp80* %o, i64 1
+  store x86_fp80 %phi1, x86_fp80* %o.gep1, align 16
+  ret void
+}

Added: llvm/trunk/test/Transforms/SLPVectorizer/X86/phi3.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/phi3.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/phi3.ll (added)
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/phi3.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,54 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+%struct.GPar.0.16.26 = type { [0 x double], double }
+
+@d = external global double, align 8
+
+declare %struct.GPar.0.16.26* @Rf_gpptr(...)
+
+define void @Rf_GReset() {
+; CHECK-LABEL: @Rf_GReset(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load double, double* @d, align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> undef, double [[TMP0]], i32 1
+; CHECK-NEXT:    [[TMP2:%.*]] = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, [[TMP1]]
+; CHECK-NEXT:    br i1 icmp eq (%struct.GPar.0.16.26* (...)* inttoptr (i64 115 to %struct.GPar.0.16.26* (...)*), %struct.GPar.0.16.26* (...)* @Rf_gpptr), label [[IF_THEN:%.*]], label [[IF_END7:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    [[TMP3:%.*]] = fsub <2 x double> [[TMP2]], undef
+; CHECK-NEXT:    [[TMP4:%.*]] = fdiv <2 x double> [[TMP3]], undef
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x double> [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x double> [[TMP4]], i32 1
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp ogt double [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[IF_THEN6:%.*]], label [[IF_END7]]
+; CHECK:       if.then6:
+; CHECK-NEXT:    br label [[IF_END7]]
+; CHECK:       if.end7:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %sub = fsub double -0.000000e+00, undef
+  %0 = load double, double* @d, align 8
+  %sub1 = fsub double -0.000000e+00, %0
+  br i1 icmp eq (%struct.GPar.0.16.26* (...)* inttoptr (i64 115 to %struct.GPar.0.16.26* (...)*), %struct.GPar.0.16.26* (...)* @Rf_gpptr), label %if.then, label %if.end7
+
+if.then:                                          ; preds = %entry
+  %sub2 = fsub double %sub, undef
+  %div.i = fdiv double %sub2, undef
+  %sub4 = fsub double %sub1, undef
+  %div.i16 = fdiv double %sub4, undef
+  %cmp = fcmp ogt double %div.i, %div.i16
+  br i1 %cmp, label %if.then6, label %if.end7
+
+if.then6:                                         ; preds = %if.then
+  br label %if.end7
+
+if.end7:                                          ; preds = %if.then6, %if.then, %entry
+  %g.0 = phi double [ 0.000000e+00, %if.then6 ], [ %sub, %if.then ], [ %sub, %entry ]
+  ret void
+}
+
+

Added: llvm/trunk/test/Transforms/SLPVectorizer/X86/phi_landingpad.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/phi_landingpad.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/phi_landingpad.ll (added)
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/phi_landingpad.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,48 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -slp-vectorizer -mtriple=x86_64-apple-macosx10.9.0 -S -o - | FileCheck %s
+
+target datalayout = "f64:64:64-v64:64:64"
+
+define void @test_phi_in_landingpad() personality i8*
+; CHECK-LABEL: @test_phi_in_landingpad(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    invoke void @foo()
+; CHECK-NEXT:    to label [[INNER:%.*]] unwind label [[LPAD:%.*]]
+; CHECK:       inner:
+; CHECK-NEXT:    invoke void @foo()
+; CHECK-NEXT:    to label [[DONE:%.*]] unwind label [[LPAD]]
+; CHECK:       lpad:
+; CHECK-NEXT:    [[TMP0:%.*]] = phi <2 x double> [ undef, [[ENTRY:%.*]] ], [ undef, [[INNER]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = landingpad { i8*, i32 }
+; CHECK-NEXT:    catch i8* null
+; CHECK-NEXT:    br label [[DONE]]
+; CHECK:       done:
+; CHECK-NEXT:    [[TMP2:%.*]] = phi <2 x double> [ undef, [[INNER]] ], [ [[TMP0]], [[LPAD]] ]
+; CHECK-NEXT:    ret void
+;
+  bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {
+entry:
+  invoke void @foo()
+  to label %inner unwind label %lpad
+
+inner:
+  %x0 = fsub double undef, undef
+  %y0 = fsub double undef, undef
+  invoke void @foo()
+  to label %done unwind label %lpad
+
+lpad:
+  %x1 = phi double [ undef, %entry ], [ undef, %inner ]
+  %y1 = phi double [ undef, %entry ], [ undef, %inner ]
+  landingpad { i8*, i32 } catch i8* null
+  br label %done
+
+done:
+  phi double [ %x0, %inner ], [ %x1, %lpad ]
+  phi double [ %y0, %inner ], [ %y1, %lpad ]
+  ret void
+}
+
+declare void @foo()
+
+declare i32 @__gxx_personality_v0(...)

Added: llvm/trunk/test/Transforms/SLPVectorizer/X86/phi_overalignedtype.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/phi_overalignedtype.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/phi_overalignedtype.ll (added)
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/phi_overalignedtype.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,58 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -basicaa -slp-vectorizer -slp-threshold=-100 -dce -S -mtriple=i386-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+
+; We purposely over-align f64 to 128 bits here.
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:128:128-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32-S128"
+target triple = "i386-apple-macosx10.9.0"
+
+
+define void @test(double* %i1, double* %i2, double* %o) {
+; CHECK-LABEL: @test(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[I1_0:%.*]] = load double, double* [[I1:%.*]], align 16
+; CHECK-NEXT:    [[I1_GEP1:%.*]] = getelementptr double, double* [[I1]], i64 1
+; CHECK-NEXT:    [[I1_1:%.*]] = load double, double* [[I1_GEP1]], align 16
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x double> undef, double [[I1_0]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[I1_1]], i32 1
+; CHECK-NEXT:    br i1 undef, label [[THEN:%.*]], label [[END:%.*]]
+; CHECK:       then:
+; CHECK-NEXT:    [[I2_GEP0:%.*]] = getelementptr inbounds double, double* [[I2:%.*]], i64 0
+; CHECK-NEXT:    [[I2_0:%.*]] = load double, double* [[I2_GEP0]], align 16
+; CHECK-NEXT:    [[I2_GEP1:%.*]] = getelementptr inbounds double, double* [[I2]], i64 1
+; CHECK-NEXT:    [[I2_1:%.*]] = load double, double* [[I2_GEP1]], align 16
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> undef, double [[I2_0]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[I2_1]], i32 1
+; CHECK-NEXT:    br label [[END]]
+; CHECK:       end:
+; CHECK-NEXT:    [[TMP4:%.*]] = phi <2 x double> [ [[TMP1]], [[ENTRY:%.*]] ], [ [[TMP3]], [[THEN]] ]
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x double> [[TMP4]], i32 0
+; CHECK-NEXT:    store double [[TMP5]], double* [[O:%.*]], align 16
+; CHECK-NEXT:    [[O_GEP1:%.*]] = getelementptr inbounds double, double* [[O]], i64 1
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x double> [[TMP4]], i32 1
+; CHECK-NEXT:    store double [[TMP6]], double* [[O_GEP1]], align 16
+; CHECK-NEXT:    ret void
+;
+; Test that we correctly recognize discontiguous memory in arrays where the
+; element size is less than the alignment, across several different GEP
+; formations.
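+; (Editorial sketch of the layout being assumed: with f64:128:128, the alloc
+; size of a double is 16 bytes, so
+;   %i1.gep1 = getelementptr double, double* %i1, i64 1
+; advances the pointer by 16 bytes, while a <2 x double> packs its elements
+; only 8 bytes apart. The loads and stores therefore stay scalar and only
+; the phis are vectorized, as the CHECK lines above show.)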
+
+entry:
+  %i1.0 = load double, double* %i1, align 16
+  %i1.gep1 = getelementptr double, double* %i1, i64 1
+  %i1.1 = load double, double* %i1.gep1, align 16
+  br i1 undef, label %then, label %end
+
+then:
+  %i2.gep0 = getelementptr inbounds double, double* %i2, i64 0
+  %i2.0 = load double, double* %i2.gep0, align 16
+  %i2.gep1 = getelementptr inbounds double, double* %i2, i64 1
+  %i2.1 = load double, double* %i2.gep1, align 16
+  br label %end
+
+end:
+  %phi0 = phi double [ %i1.0, %entry ], [ %i2.0, %then ]
+  %phi1 = phi double [ %i1.1, %entry ], [ %i2.1, %then ]
+  store double %phi0, double* %o, align 16
+  %o.gep1 = getelementptr inbounds double, double* %o, i64 1
+  store double %phi1, double* %o.gep1, align 16
+  ret void
+}

Added: llvm/trunk/test/Transforms/SLPVectorizer/X86/powof2div.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/powof2div.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/powof2div.ll (added)
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/powof2div.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,147 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -basicaa -slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu -mattr=+avx  | FileCheck %s --check-prefixes=CHECK,AVX1
+; RUN: opt < %s -basicaa -slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX2
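+; Editorial note (an assumption, not stated in the original commit): the
+; AVX1 and AVX2 prefixes diverge only for @powof2div_nonuniform below. A
+; non-uniform "sdiv <4 x i32> %v, <i32 2, i32 4, i32 8, i32 16>" lowers to
+; per-lane variable shifts, which AVX2 provides (e.g. vpsrlvd) but AVX1
+; does not, so only the AVX2 run vectorizes that function.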
+
+define void @powof2div_uniform(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32* noalias nocapture readonly %c){
+; CHECK-LABEL: @powof2div_uniform(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 1
+; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i64 1
+; CHECK-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 1
+; CHECK-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 2
+; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 2
+; CHECK-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 2
+; CHECK-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[B]] to <4 x i32>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; CHECK-NEXT:    [[ARRAYIDX14:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 3
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i32* [[C]] to <4 x i32>*
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = add nsw <4 x i32> [[TMP3]], [[TMP1]]
+; CHECK-NEXT:    [[TMP5:%.*]] = sdiv <4 x i32> [[TMP4]], <i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT:    [[ARRAYIDX17:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 3
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i32* [[A]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %0 = load i32, i32* %b, align 4
+  %1 = load i32, i32* %c, align 4
+  %add = add nsw i32 %1, %0
+  %div = sdiv i32 %add, 2
+  store i32 %div, i32* %a, align 4
+  %arrayidx3 = getelementptr inbounds i32, i32* %b, i64 1
+  %2 = load i32, i32* %arrayidx3, align 4
+  %arrayidx4 = getelementptr inbounds i32, i32* %c, i64 1
+  %3 = load i32, i32* %arrayidx4, align 4
+  %add5 = add nsw i32 %3, %2
+  %div6 = sdiv i32 %add5, 2
+  %arrayidx7 = getelementptr inbounds i32, i32* %a, i64 1
+  store i32 %div6, i32* %arrayidx7, align 4
+  %arrayidx8 = getelementptr inbounds i32, i32* %b, i64 2
+  %4 = load i32, i32* %arrayidx8, align 4
+  %arrayidx9 = getelementptr inbounds i32, i32* %c, i64 2
+  %5 = load i32, i32* %arrayidx9, align 4
+  %add10 = add nsw i32 %5, %4
+  %div11 = sdiv i32 %add10, 2
+  %arrayidx12 = getelementptr inbounds i32, i32* %a, i64 2
+  store i32 %div11, i32* %arrayidx12, align 4
+  %arrayidx13 = getelementptr inbounds i32, i32* %b, i64 3
+  %6 = load i32, i32* %arrayidx13, align 4
+  %arrayidx14 = getelementptr inbounds i32, i32* %c, i64 3
+  %7 = load i32, i32* %arrayidx14, align 4
+  %add15 = add nsw i32 %7, %6
+  %div16 = sdiv i32 %add15, 2
+  %arrayidx17 = getelementptr inbounds i32, i32* %a, i64 3
+  store i32 %div16, i32* %arrayidx17, align 4
+  ret void
+}
+
+define void @powof2div_nonuniform(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32* noalias nocapture readonly %c){
+; AVX1-LABEL: @powof2div_nonuniform(
+; AVX1-NEXT:  entry:
+; AVX1-NEXT:    [[TMP0:%.*]] = load i32, i32* [[B:%.*]], align 4
+; AVX1-NEXT:    [[TMP1:%.*]] = load i32, i32* [[C:%.*]], align 4
+; AVX1-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP1]], [[TMP0]]
+; AVX1-NEXT:    [[DIV:%.*]] = sdiv i32 [[ADD]], 2
+; AVX1-NEXT:    store i32 [[DIV]], i32* [[A:%.*]], align 4
+; AVX1-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 1
+; AVX1-NEXT:    [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX3]], align 4
+; AVX1-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 1
+; AVX1-NEXT:    [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX4]], align 4
+; AVX1-NEXT:    [[ADD5:%.*]] = add nsw i32 [[TMP3]], [[TMP2]]
+; AVX1-NEXT:    [[DIV6:%.*]] = sdiv i32 [[ADD5]], 4
+; AVX1-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 1
+; AVX1-NEXT:    store i32 [[DIV6]], i32* [[ARRAYIDX7]], align 4
+; AVX1-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 2
+; AVX1-NEXT:    [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX8]], align 4
+; AVX1-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 2
+; AVX1-NEXT:    [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX9]], align 4
+; AVX1-NEXT:    [[ADD10:%.*]] = add nsw i32 [[TMP5]], [[TMP4]]
+; AVX1-NEXT:    [[DIV11:%.*]] = sdiv i32 [[ADD10]], 8
+; AVX1-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 2
+; AVX1-NEXT:    store i32 [[DIV11]], i32* [[ARRAYIDX12]], align 4
+; AVX1-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3
+; AVX1-NEXT:    [[TMP6:%.*]] = load i32, i32* [[ARRAYIDX13]], align 4
+; AVX1-NEXT:    [[ARRAYIDX14:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 3
+; AVX1-NEXT:    [[TMP7:%.*]] = load i32, i32* [[ARRAYIDX14]], align 4
+; AVX1-NEXT:    [[ADD15:%.*]] = add nsw i32 [[TMP7]], [[TMP6]]
+; AVX1-NEXT:    [[DIV16:%.*]] = sdiv i32 [[ADD15]], 16
+; AVX1-NEXT:    [[ARRAYIDX17:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 3
+; AVX1-NEXT:    store i32 [[DIV16]], i32* [[ARRAYIDX17]], align 4
+; AVX1-NEXT:    ret void
+;
+; AVX2-LABEL: @powof2div_nonuniform(
+; AVX2-NEXT:  entry:
+; AVX2-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 1
+; AVX2-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i64 1
+; AVX2-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 1
+; AVX2-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 2
+; AVX2-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 2
+; AVX2-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 2
+; AVX2-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3
+; AVX2-NEXT:    [[TMP0:%.*]] = bitcast i32* [[B]] to <4 x i32>*
+; AVX2-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; AVX2-NEXT:    [[ARRAYIDX14:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 3
+; AVX2-NEXT:    [[TMP2:%.*]] = bitcast i32* [[C]] to <4 x i32>*
+; AVX2-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4
+; AVX2-NEXT:    [[TMP4:%.*]] = add nsw <4 x i32> [[TMP3]], [[TMP1]]
+; AVX2-NEXT:    [[TMP5:%.*]] = sdiv <4 x i32> [[TMP4]], <i32 2, i32 4, i32 8, i32 16>
+; AVX2-NEXT:    [[ARRAYIDX17:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 3
+; AVX2-NEXT:    [[TMP6:%.*]] = bitcast i32* [[A]] to <4 x i32>*
+; AVX2-NEXT:    store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4
+; AVX2-NEXT:    ret void
+;
+entry:
+  %0 = load i32, i32* %b, align 4
+  %1 = load i32, i32* %c, align 4
+  %add = add nsw i32 %1, %0
+  %div = sdiv i32 %add, 2
+  store i32 %div, i32* %a, align 4
+  %arrayidx3 = getelementptr inbounds i32, i32* %b, i64 1
+  %2 = load i32, i32* %arrayidx3, align 4
+  %arrayidx4 = getelementptr inbounds i32, i32* %c, i64 1
+  %3 = load i32, i32* %arrayidx4, align 4
+  %add5 = add nsw i32 %3, %2
+  %div6 = sdiv i32 %add5, 4
+  %arrayidx7 = getelementptr inbounds i32, i32* %a, i64 1
+  store i32 %div6, i32* %arrayidx7, align 4
+  %arrayidx8 = getelementptr inbounds i32, i32* %b, i64 2
+  %4 = load i32, i32* %arrayidx8, align 4
+  %arrayidx9 = getelementptr inbounds i32, i32* %c, i64 2
+  %5 = load i32, i32* %arrayidx9, align 4
+  %add10 = add nsw i32 %5, %4
+  %div11 = sdiv i32 %add10, 8
+  %arrayidx12 = getelementptr inbounds i32, i32* %a, i64 2
+  store i32 %div11, i32* %arrayidx12, align 4
+  %arrayidx13 = getelementptr inbounds i32, i32* %b, i64 3
+  %6 = load i32, i32* %arrayidx13, align 4
+  %arrayidx14 = getelementptr inbounds i32, i32* %c, i64 3
+  %7 = load i32, i32* %arrayidx14, align 4
+  %add15 = add nsw i32 %7, %6
+  %div16 = sdiv i32 %add15, 16
+  %arrayidx17 = getelementptr inbounds i32, i32* %a, i64 3
+  store i32 %div16, i32* %arrayidx17, align 4
+  ret void
+}
+

Added: llvm/trunk/test/Transforms/SLPVectorizer/X86/pr16571.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/pr16571.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/pr16571.ll (added)
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/pr16571.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,22 @@
+; RUN: opt < %s -slp-vectorizer -S -mtriple=i686-pc-win32 -mcpu=corei7-avx
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32-n8:16:32-S32"
+target triple = "i686-pc-win32"
+
+define hidden fastcc void @"System.PrimitiveTypesParser.TryParseIEEE754<char>(char*,uint,double&)"() unnamed_addr {
+"@0":
+  br i1 undef, label %"@38.lr.ph", label %"@37"
+
+"@37":                                            ; preds = %"@38.lr.ph", %"@44", %"@0"
+  ret void
+
+"@44":                                            ; preds = %"@38.lr.ph"
+  %0 = add i64 undef, undef
+  %1 = add i32 %mainPartDigits.loc.0.ph45, 1
+  br i1 undef, label %"@38.lr.ph", label %"@37"
+
+"@38.lr.ph":                                      ; preds = %"@44", %"@0"
+  %mainDoublePart.loc.0.ph46 = phi i64 [ %0, %"@44" ], [ 0, %"@0" ]
+  %mainPartDigits.loc.0.ph45 = phi i32 [ %1, %"@44" ], [ 0, %"@0" ]
+  br i1 undef, label %"@44", label %"@37"
+}

Added: llvm/trunk/test/Transforms/SLPVectorizer/X86/pr16628.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/pr16628.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/pr16628.ll (added)
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/pr16628.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,43 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.9.0"
+
+ at c = common global i32 0, align 4
+ at a = common global i16 0, align 2
+ at b = common global i16 0, align 2
+
+; Function Attrs: nounwind ssp uwtable
+define void @f() {
+; CHECK-LABEL: @f(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CALL:%.*]] = tail call i32 (...) @g()
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* @c, align 4
+; CHECK-NEXT:    [[LNOT:%.*]] = icmp eq i32 [[TMP0]], 0
+; CHECK-NEXT:    [[LNOT_EXT:%.*]] = zext i1 [[LNOT]] to i32
+; CHECK-NEXT:    [[TMP1:%.*]] = load i16, i16* @a, align 2
+; CHECK-NEXT:    [[LNOT2:%.*]] = icmp eq i16 [[TMP1]], 0
+; CHECK-NEXT:    [[LNOT_EXT3:%.*]] = zext i1 [[LNOT2]] to i32
+; CHECK-NEXT:    [[OR:%.*]] = or i32 [[LNOT_EXT3]], [[LNOT_EXT]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], [[OR]]
+; CHECK-NEXT:    [[CONV4:%.*]] = zext i1 [[CMP]] to i16
+; CHECK-NEXT:    store i16 [[CONV4]], i16* @b, align 2
+; CHECK-NEXT:    ret void
+;
+entry:
+  %call = tail call i32 (...) @g()
+  %0 = load i32, i32* @c, align 4
+  %lnot = icmp eq i32 %0, 0
+  %lnot.ext = zext i1 %lnot to i32
+  %1 = load i16, i16* @a, align 2
+  %lnot2 = icmp eq i16 %1, 0
+  %lnot.ext3 = zext i1 %lnot2 to i32
+  %or = or i32 %lnot.ext3, %lnot.ext
+  %cmp = icmp eq i32 %call, %or
+  %conv4 = zext i1 %cmp to i16
+  store i16 %conv4, i16* @b, align 2
+  ret void
+}
+
+declare i32 @g(...)

Added: llvm/trunk/test/Transforms/SLPVectorizer/X86/pr16899.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/pr16899.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/pr16899.ll (added)
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/pr16899.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,46 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s  -slp-vectorizer -S -mtriple=i386--netbsd -mcpu=i486 | FileCheck %s
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32-n8:16:32-S128"
+target triple = "i386--netbsd"
+
+ at a = common global i32* null, align 4
+
+; Function Attrs: noreturn nounwind readonly
+define i32 @fn1() #0 {
+; CHECK-LABEL: @fn1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32*, i32** @a, align 4, !tbaa !0
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4, !tbaa !4
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i32 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX1]], align 4, !tbaa !4
+; CHECK-NEXT:    br label [[DO_BODY:%.*]]
+; CHECK:       do.body:
+; CHECK-NEXT:    [[C_0:%.*]] = phi i32 [ [[TMP2]], [[ENTRY:%.*]] ], [ [[ADD2:%.*]], [[DO_BODY]] ]
+; CHECK-NEXT:    [[B_0:%.*]] = phi i32 [ [[TMP1]], [[ENTRY]] ], [ [[ADD:%.*]], [[DO_BODY]] ]
+; CHECK-NEXT:    [[ADD]] = add nsw i32 [[B_0]], [[C_0]]
+; CHECK-NEXT:    [[ADD2]] = add nsw i32 [[ADD]], 1
+; CHECK-NEXT:    br label [[DO_BODY]]
+;
+entry:
+  %0 = load i32*, i32** @a, align 4, !tbaa !4
+  %1 = load i32, i32* %0, align 4, !tbaa !5
+  %arrayidx1 = getelementptr inbounds i32, i32* %0, i32 1
+  %2 = load i32, i32* %arrayidx1, align 4, !tbaa !5
+  br label %do.body
+
+do.body:                                          ; preds = %do.body, %entry
+  %c.0 = phi i32 [ %2, %entry ], [ %add2, %do.body ]
+  %b.0 = phi i32 [ %1, %entry ], [ %add, %do.body ]
+  %add = add nsw i32 %b.0, %c.0
+  %add2 = add nsw i32 %add, 1
+  br label %do.body
+}
+
+attributes #0 = { noreturn nounwind readonly "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!0 = !{!"any pointer", !1}
+!1 = !{!"omnipotent char", !2}
+!2 = !{!"Simple C/C++ TBAA"}
+!3 = !{!"int", !1}
+!4 = !{!0, !0, i64 0}
+!5 = !{!3, !3, i64 0}

Added: llvm/trunk/test/Transforms/SLPVectorizer/X86/pr18060.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/pr18060.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/pr18060.ll (added)
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/pr18060.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,81 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -slp-vectorizer -S -mtriple=i386-pc-linux | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32-n8:16:32-S128"
+target triple = "i386-pc-linux"
+
+; Function Attrs: nounwind
+define i32 @_Z16adjustFixupValueyj(i64 %Value, i32 %Kind) {
+; CHECK-LABEL: @_Z16adjustFixupValueyj(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[EXTRACT_T:%.*]] = trunc i64 [[VALUE:%.*]] to i32
+; CHECK-NEXT:    [[EXTRACT:%.*]] = lshr i64 [[VALUE]], 12
+; CHECK-NEXT:    [[EXTRACT_T6:%.*]] = trunc i64 [[EXTRACT]] to i32
+; CHECK-NEXT:    switch i32 [[KIND:%.*]], label [[SW_DEFAULT:%.*]] [
+; CHECK-NEXT:    i32 0, label [[RETURN:%.*]]
+; CHECK-NEXT:    i32 1, label [[RETURN]]
+; CHECK-NEXT:    i32 129, label [[SW_BB1:%.*]]
+; CHECK-NEXT:    i32 130, label [[SW_BB2:%.*]]
+; CHECK-NEXT:    ]
+; CHECK:       sw.default:
+; CHECK-NEXT:    call void @_Z25llvm_unreachable_internalv()
+; CHECK-NEXT:    unreachable
+; CHECK:       sw.bb1:
+; CHECK-NEXT:    [[SHR:%.*]] = lshr i64 [[VALUE]], 16
+; CHECK-NEXT:    [[EXTRACT_T5:%.*]] = trunc i64 [[SHR]] to i32
+; CHECK-NEXT:    [[EXTRACT7:%.*]] = lshr i64 [[VALUE]], 28
+; CHECK-NEXT:    [[EXTRACT_T8:%.*]] = trunc i64 [[EXTRACT7]] to i32
+; CHECK-NEXT:    br label [[SW_BB2]]
+; CHECK:       sw.bb2:
+; CHECK-NEXT:    [[VALUE_ADDR_0_OFF0:%.*]] = phi i32 [ [[EXTRACT_T]], [[ENTRY:%.*]] ], [ [[EXTRACT_T5]], [[SW_BB1]] ]
+; CHECK-NEXT:    [[VALUE_ADDR_0_OFF12:%.*]] = phi i32 [ [[EXTRACT_T6]], [[ENTRY]] ], [ [[EXTRACT_T8]], [[SW_BB1]] ]
+; CHECK-NEXT:    [[CONV6:%.*]] = and i32 [[VALUE_ADDR_0_OFF0]], 4095
+; CHECK-NEXT:    [[CONV4:%.*]] = shl i32 [[VALUE_ADDR_0_OFF12]], 16
+; CHECK-NEXT:    [[SHL:%.*]] = and i32 [[CONV4]], 983040
+; CHECK-NEXT:    [[OR:%.*]] = or i32 [[SHL]], [[CONV6]]
+; CHECK-NEXT:    [[OR11:%.*]] = or i32 [[OR]], 8388608
+; CHECK-NEXT:    br label [[RETURN]]
+; CHECK:       return:
+; CHECK-NEXT:    [[RETVAL_0:%.*]] = phi i32 [ [[OR11]], [[SW_BB2]] ], [ [[EXTRACT_T]], [[ENTRY]] ], [ [[EXTRACT_T]], [[ENTRY]] ]
+; CHECK-NEXT:    ret i32 [[RETVAL_0]]
+;
+entry:
+  %extract.t = trunc i64 %Value to i32
+  %extract = lshr i64 %Value, 12
+  %extract.t6 = trunc i64 %extract to i32
+  switch i32 %Kind, label %sw.default [
+  i32 0, label %return
+  i32 1, label %return
+  i32 129, label %sw.bb1
+  i32 130, label %sw.bb2
+  ]
+
+sw.default:                                       ; preds = %entry
+  call void @_Z25llvm_unreachable_internalv()
+  unreachable
+
+sw.bb1:                                           ; preds = %entry
+  %shr = lshr i64 %Value, 16
+  %extract.t5 = trunc i64 %shr to i32
+  %extract7 = lshr i64 %Value, 28
+  %extract.t8 = trunc i64 %extract7 to i32
+  br label %sw.bb2
+
+sw.bb2:                                           ; preds = %sw.bb1, %entry
+  %Value.addr.0.off0 = phi i32 [ %extract.t, %entry ], [ %extract.t5, %sw.bb1 ]
+  %Value.addr.0.off12 = phi i32 [ %extract.t6, %entry ], [ %extract.t8, %sw.bb1 ]
+  %conv6 = and i32 %Value.addr.0.off0, 4095
+  %conv4 = shl i32 %Value.addr.0.off12, 16
+  %shl = and i32 %conv4, 983040
+  %or = or i32 %shl, %conv6
+  %or11 = or i32 %or, 8388608
+  br label %return
+
+return:                                           ; preds = %sw.bb2, %entry, %entry
+  %retval.0 = phi i32 [ %or11, %sw.bb2 ], [ %extract.t, %entry ], [ %extract.t, %entry ]
+  ret i32 %retval.0
+}
+
+; Function Attrs: noreturn
+declare void @_Z25llvm_unreachable_internalv()
+

Added: llvm/trunk/test/Transforms/SLPVectorizer/X86/pr19657.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/pr19657.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/pr19657.ll (added)
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/pr19657.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,60 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -basicaa -slp-vectorizer -S -mcpu=corei7-avx | FileCheck %s
+; RUN: opt < %s -basicaa -slp-vectorizer -slp-max-reg-size=128 -S -mcpu=corei7-avx | FileCheck %s --check-prefix=V128
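+; Editorial note: the second RUN line caps the vector register width at 128
+; bits, so where the default run builds a single <4 x double> tree for @foo,
+; the V128 checks show the same work split into two <2 x double> trees.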
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define void @foo(double* %x) {
+; CHECK-LABEL: @foo(
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds double, double* [[X:%.*]], i64 1
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds double, double* [[X]], i64 2
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds double, double* [[X]], i64 3
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast double* [[X]] to <4 x double>*
+; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x double>, <4 x double>* [[TMP4]], align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = fadd <4 x double> [[TMP5]], [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = fadd <4 x double> [[TMP6]], [[TMP5]]
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast double* [[X]] to <4 x double>*
+; CHECK-NEXT:    store <4 x double> [[TMP7]], <4 x double>* [[TMP8]], align 8
+; CHECK-NEXT:    ret void
+;
+; V128-LABEL: @foo(
+; V128-NEXT:    [[TMP1:%.*]] = getelementptr inbounds double, double* [[X:%.*]], i64 1
+; V128-NEXT:    [[TMP2:%.*]] = bitcast double* [[X]] to <2 x double>*
+; V128-NEXT:    [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 8
+; V128-NEXT:    [[TMP4:%.*]] = fadd <2 x double> [[TMP3]], [[TMP3]]
+; V128-NEXT:    [[TMP5:%.*]] = fadd <2 x double> [[TMP4]], [[TMP3]]
+; V128-NEXT:    [[TMP6:%.*]] = bitcast double* [[X]] to <2 x double>*
+; V128-NEXT:    store <2 x double> [[TMP5]], <2 x double>* [[TMP6]], align 8
+; V128-NEXT:    [[TMP7:%.*]] = getelementptr inbounds double, double* [[X]], i64 2
+; V128-NEXT:    [[TMP8:%.*]] = getelementptr inbounds double, double* [[X]], i64 3
+; V128-NEXT:    [[TMP9:%.*]] = bitcast double* [[TMP7]] to <2 x double>*
+; V128-NEXT:    [[TMP10:%.*]] = load <2 x double>, <2 x double>* [[TMP9]], align 8
+; V128-NEXT:    [[TMP11:%.*]] = fadd <2 x double> [[TMP10]], [[TMP10]]
+; V128-NEXT:    [[TMP12:%.*]] = fadd <2 x double> [[TMP11]], [[TMP10]]
+; V128-NEXT:    [[TMP13:%.*]] = bitcast double* [[TMP7]] to <2 x double>*
+; V128-NEXT:    store <2 x double> [[TMP12]], <2 x double>* [[TMP13]], align 8
+; V128-NEXT:    ret void
+;
+  %1 = load double, double* %x, align 8
+  %2 = fadd double %1, %1
+  %3 = fadd double %2, %1
+  store double %3, double* %x, align 8
+  %4 = getelementptr inbounds double, double* %x, i64 1
+  %5 = load double, double* %4, align 8
+  %6 = fadd double %5, %5
+  %7 = fadd double %6, %5
+  store double %7, double* %4, align 8
+  %8 = getelementptr inbounds double, double* %x, i64 2
+  %9 = load double, double* %8, align 8
+  %10 = fadd double %9, %9
+  %11 = fadd double %10, %9
+  store double %11, double* %8, align 8
+  %12 = getelementptr inbounds double, double* %x, i64 3
+  %13 = load double, double* %12, align 8
+  %14 = fadd double %13, %13
+  %15 = fadd double %14, %13
+  store double %15, double* %12, align 8
+  ret void
+}
+

Added: llvm/trunk/test/Transforms/SLPVectorizer/X86/pr23510.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/pr23510.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/pr23510.ll (added)
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/pr23510.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,59 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; PR23510
+; RUN: opt < %s -basicaa -slp-vectorizer -S | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+ at total = global i64 0, align 8
+
+define void @_Z3fooPml(i64* nocapture %a, i64 %i) {
+; CHECK-LABEL: @_Z3fooPml(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, i64* [[A:%.*]], i64 1
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i64* [[A]] to <2 x i64>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* [[TMP0]], align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = lshr <2 x i64> [[TMP1]], <i64 4, i64 4>
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i64* [[A]] to <2 x i64>*
+; CHECK-NEXT:    store <2 x i64> [[TMP2]], <2 x i64>* [[TMP3]], align 8
+; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[I:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, i64* [[ARRAYIDX3]], align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, i64* @total, align 8
+; CHECK-NEXT:    [[ADD:%.*]] = add i64 [[TMP3]], [[TMP2]]
+; CHECK-NEXT:    store i64 [[ADD]], i64* @total, align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i64* [[A]] to <2 x i64>*
+; CHECK-NEXT:    [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[TMP4]], align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = lshr <2 x i64> [[TMP5]], <i64 4, i64 4>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i64* [[A]] to <2 x i64>*
+; CHECK-NEXT:    store <2 x i64> [[TMP6]], <2 x i64>* [[TMP7]], align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i64, i64* [[ARRAYIDX3]], align 8
+; CHECK-NEXT:    [[TMP7:%.*]] = load i64, i64* @total, align 8
+; CHECK-NEXT:    [[ADD9:%.*]] = add i64 [[TMP7]], [[TMP6]]
+; CHECK-NEXT:    store i64 [[ADD9]], i64* @total, align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %tmp = load i64, i64* %a, align 8
+  %shr = lshr i64 %tmp, 4
+  store i64 %shr, i64* %a, align 8
+  %arrayidx1 = getelementptr inbounds i64, i64* %a, i64 1
+  %tmp1 = load i64, i64* %arrayidx1, align 8
+  %shr2 = lshr i64 %tmp1, 4
+  store i64 %shr2, i64* %arrayidx1, align 8
+  %arrayidx3 = getelementptr inbounds i64, i64* %a, i64 %i
+  %tmp2 = load i64, i64* %arrayidx3, align 8
+  %tmp3 = load i64, i64* @total, align 8
+  %add = add i64 %tmp3, %tmp2
+  store i64 %add, i64* @total, align 8
+  %tmp4 = load i64, i64* %a, align 8
+  %shr5 = lshr i64 %tmp4, 4
+  store i64 %shr5, i64* %a, align 8
+  %tmp5 = load i64, i64* %arrayidx1, align 8
+  %shr7 = lshr i64 %tmp5, 4
+  store i64 %shr7, i64* %arrayidx1, align 8
+  %tmp6 = load i64, i64* %arrayidx3, align 8
+  %tmp7 = load i64, i64* @total, align 8
+  %add9 = add i64 %tmp7, %tmp6
+  store i64 %add9, i64* @total, align 8
+  ret void
+}

Added: llvm/trunk/test/Transforms/SLPVectorizer/X86/pr27163.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/pr27163.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/pr27163.ll (added)
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/pr27163.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,66 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -slp-vectorizer -S < %s | FileCheck %s
+target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-pc-windows-msvc18.0.0"
+
+%struct.B = type { i64, i64 }
+
+define void @test1(%struct.B* %p) personality i32 (...)* @__CxxFrameHandler3 {
+; CHECK-LABEL: @test1(
+; CHECK-NEXT:  invoke.cont:
+; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr inbounds [[STRUCT_B:%.*]], %struct.B* [[P:%.*]], i64 0, i32 0
+; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr inbounds [[STRUCT_B]], %struct.B* [[P]], i64 0, i32 1
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i64* [[GEP1]] to <2 x i64>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* [[TMP0]], align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i64> [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i64* [[GEP1]] to <2 x i64>*
+; CHECK-NEXT:    store <2 x i64> [[TMP1]], <2 x i64>* [[TMP3]], align 8
+; CHECK-NEXT:    invoke void @throw()
+; CHECK-NEXT:    to label [[UNREACHABLE:%.*]] unwind label [[CATCH_DISPATCH:%.*]]
+; CHECK:       catch.dispatch:
+; CHECK-NEXT:    [[CS:%.*]] = catchswitch within none [label %invoke.cont1] unwind label [[EHCLEANUP:%.*]]
+; CHECK:       invoke.cont1:
+; CHECK-NEXT:    [[CATCH:%.*]] = catchpad within [[CS]] [i8* null, i32 64, i8* null]
+; CHECK-NEXT:    invoke void @throw() [ "funclet"(token [[CATCH]]) ]
+; CHECK-NEXT:    to label [[UNREACHABLE]] unwind label [[EHCLEANUP]]
+; CHECK:       ehcleanup:
+; CHECK-NEXT:    [[PHI:%.*]] = phi i64 [ [[TMP2]], [[CATCH_DISPATCH]] ], [ 9, [[INVOKE_CONT1:%.*]] ]
+; CHECK-NEXT:    [[CLEANUP:%.*]] = cleanuppad within none []
+; CHECK-NEXT:    call void @release(i64 [[PHI]]) [ "funclet"(token [[CLEANUP]]) ]
+; CHECK-NEXT:    cleanupret from [[CLEANUP]] unwind to caller
+; CHECK:       unreachable:
+; CHECK-NEXT:    unreachable
+;
+invoke.cont:
+  %gep1 = getelementptr inbounds %struct.B, %struct.B* %p, i64 0, i32 0
+  %gep2 = getelementptr inbounds %struct.B, %struct.B* %p, i64 0, i32 1
+  %load1 = load i64, i64* %gep1, align 8
+  %load2 = load i64, i64* %gep2, align 8
+  store i64 %load1, i64* %gep1, align 8
+  store i64 %load2, i64* %gep2, align 8
+  invoke void @throw()
+  to label %unreachable unwind label %catch.dispatch
+
+catch.dispatch:                                   ; preds = %invoke.cont
+  %cs = catchswitch within none [label %invoke.cont1] unwind label %ehcleanup
+
+invoke.cont1:                                     ; preds = %catch.dispatch
+  %catch = catchpad within %cs [i8* null, i32 64, i8* null]
+  invoke void @throw() [ "funclet"(token %catch) ]
+  to label %unreachable unwind label %ehcleanup
+
+ehcleanup:                                        ; preds = %invoke.cont1, %catch.dispatch
+  %phi = phi i64 [ %load1, %catch.dispatch ], [ 9, %invoke.cont1 ]
+  %cleanup = cleanuppad within none []
+  call void @release(i64 %phi) [ "funclet"(token %cleanup) ]
+  cleanupret from %cleanup unwind to caller
+
+unreachable:                                      ; preds = %invoke.cont1, %invoke.cont
+  unreachable
+}
+
+declare i32 @__CxxFrameHandler3(...)
+
+declare void @throw()
+
+declare void @release(i64)

Added: llvm/trunk/test/Transforms/SLPVectorizer/X86/pr31599.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/pr31599.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/pr31599.ll (added)
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/pr31599.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,30 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s
+
+define <2 x float> @foo() {
+; CHECK-LABEL: @foo(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[SOURCE:%.*]] = insertelement <2 x float> undef, float undef, i32 0
+; CHECK-NEXT:    [[TMP0:%.*]] = fsub <2 x float> [[SOURCE]], [[SOURCE]]
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x float> [[TMP0]], i32 0
+; CHECK-NEXT:    [[RES1:%.*]] = insertelement <2 x float> undef, float [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x float> [[TMP0]], i32 1
+; CHECK-NEXT:    [[RES2:%.*]] = insertelement <2 x float> [[RES1]], float [[TMP2]], i32 1
+; CHECK-NEXT:    ret <2 x float> [[RES2]]
+;
+entry:
+  %source = insertelement <2 x float> undef, float undef, i32 0
+  %e0 = extractelement <2 x float> %source, i32 0
+  %e0.dup = extractelement <2 x float> %source, i32 0
+  %sub1 = fsub float %e0, %e0.dup
+  %e1 = extractelement <2 x float> %source, i32 1
+  %e1.dup = extractelement <2 x float> %source, i32 1
+  %sub2 = fsub float %e1, %e1.dup
+  %res1 = insertelement <2 x float> undef, float %sub1, i32 0
+  %res2 = insertelement <2 x float> %res1, float %sub2, i32 1
+  ret <2 x float> %res2
+}
+
+!llvm.ident = !{!0, !0}
+
+!0 = !{!"clang version 4.0.0 "}

Added: llvm/trunk/test/Transforms/SLPVectorizer/X86/pr35497.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/pr35497.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/pr35497.ll (added)
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/pr35497.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,105 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s
+
+%class.1 = type { %class.2 }
+%class.2 = type { %"class.3" }
+%"class.3" = type { %"struct.1", i64 }
+%"struct.1" = type { [8 x i64] }
+
+$_ZN1C10SwitchModeEv = comdat any
+
+; Function Attrs: uwtable
+define void @_ZN1C10SwitchModeEv() local_unnamed_addr #0 comdat align 2 {
+; CHECK-LABEL: @_ZN1C10SwitchModeEv(
+; CHECK-NEXT:  for.body.lr.ph.i:
+; CHECK-NEXT:    [[OR_1:%.*]] = or i64 undef, 1
+; CHECK-NEXT:    store i64 [[OR_1]], i64* undef, align 8
+; CHECK-NEXT:    [[FOO_1:%.*]] = getelementptr inbounds [[CLASS_1:%.*]], %class.1* undef, i64 0, i32 0, i32 0, i32 0, i32 0, i64 0
+; CHECK-NEXT:    [[FOO_2:%.*]] = getelementptr inbounds [[CLASS_1]], %class.1* undef, i64 0, i32 0, i32 0, i32 0, i32 0, i64 1
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i64* [[FOO_1]] to <2 x i64>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* [[TMP0]], align 8
+; CHECK-NEXT:    [[BAR5:%.*]] = load i64, i64* undef, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x i64> undef, i64 [[OR_1]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[BAR5]], i32 1
+; CHECK-NEXT:    [[TMP4:%.*]] = and <2 x i64> [[TMP3]], [[TMP1]]
+; CHECK-NEXT:    [[BAR3:%.*]] = getelementptr inbounds [[CLASS_2:%.*]], %class.2* undef, i64 0, i32 0, i32 0, i32 0, i64 0
+; CHECK-NEXT:    [[BAR4:%.*]] = getelementptr inbounds [[CLASS_2]], %class.2* undef, i64 0, i32 0, i32 0, i32 0, i64 1
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i64* [[BAR3]] to <2 x i64>*
+; CHECK-NEXT:    store <2 x i64> [[TMP4]], <2 x i64>* [[TMP5]], align 8
+; CHECK-NEXT:    ret void
+;
+for.body.lr.ph.i:
+  %or.1 = or i64 undef, 1
+  store i64 %or.1, i64* undef, align 8
+  %foo.1 = getelementptr inbounds %class.1, %class.1* undef, i64 0, i32 0, i32 0, i32 0, i32 0, i64 0
+  %foo.3 = load i64, i64* %foo.1, align 8
+  %foo.2 = getelementptr inbounds %class.1, %class.1* undef, i64 0, i32 0, i32 0, i32 0, i32 0, i64 1
+  %foo.4 = load i64, i64* %foo.2, align 8
+  %bar5 = load i64, i64* undef, align 8
+  %and.2 = and i64 %or.1, %foo.3
+  %and.1 = and i64 %bar5, %foo.4
+  %bar3 = getelementptr inbounds %class.2, %class.2* undef, i64 0, i32 0, i32 0, i32 0, i64 0
+  store i64 %and.2, i64* %bar3, align 8
+  %bar4 = getelementptr inbounds %class.2, %class.2* undef, i64 0, i32 0, i32 0, i32 0, i64 1
+  store i64 %and.1, i64* %bar4, align 8
+  ret void
+}
+
+; Function Attrs: norecurse nounwind uwtable
+define void @pr35497() local_unnamed_addr #0 {
+; CHECK-LABEL: @pr35497(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i64, i64* undef, align 1
+; CHECK-NEXT:    [[ADD:%.*]] = add i64 undef, undef
+; CHECK-NEXT:    store i64 [[ADD]], i64* undef, align 1
+; CHECK-NEXT:    [[ARRAYIDX2_1:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 5
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i64> undef, i64 [[TMP0]], i32 1
+; CHECK-NEXT:    [[TMP2:%.*]] = shl <2 x i64> [[TMP1]], <i64 2, i64 2>
+; CHECK-NEXT:    [[TMP3:%.*]] = and <2 x i64> [[TMP2]], <i64 20, i64 20>
+; CHECK-NEXT:    [[ARRAYIDX2_2:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 4
+; CHECK-NEXT:    [[TMP4:%.*]] = add nuw nsw <2 x i64> [[TMP3]], zeroinitializer
+; CHECK-NEXT:    [[ARRAYIDX2_5:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 1
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i64> undef, i64 [[TMP5]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x i64> [[TMP6]], i64 [[ADD]], i32 1
+; CHECK-NEXT:    [[TMP8:%.*]] = shl <2 x i64> [[TMP7]], <i64 2, i64 2>
+; CHECK-NEXT:    [[TMP9:%.*]] = and <2 x i64> [[TMP8]], <i64 20, i64 20>
+; CHECK-NEXT:    [[ARRAYIDX2_6:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 0
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i64* [[ARRAYIDX2_6]] to <2 x i64>*
+; CHECK-NEXT:    store <2 x i64> [[TMP4]], <2 x i64>* [[TMP10]], align 1
+; CHECK-NEXT:    [[TMP11:%.*]] = lshr <2 x i64> [[TMP4]], <i64 6, i64 6>
+; CHECK-NEXT:    [[TMP12:%.*]] = add nuw nsw <2 x i64> [[TMP9]], [[TMP11]]
+; CHECK-NEXT:    [[TMP13:%.*]] = bitcast i64* [[ARRAYIDX2_2]] to <2 x i64>*
+; CHECK-NEXT:    store <2 x i64> [[TMP12]], <2 x i64>* [[TMP13]], align 1
+; CHECK-NEXT:    ret void
+;
+entry:
+  %0 = load i64, i64* undef, align 1
+  %and = shl i64 %0, 2
+  %shl = and i64 %and, 20
+  %add = add i64 undef, undef
+  store i64 %add, i64* undef, align 1
+  %arrayidx2.1 = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 5
+  %and.1 = shl i64 undef, 2
+  %shl.1 = and i64 %and.1, 20
+  %shr.1 = lshr i64 undef, 6
+  %add.1 = add nuw nsw i64 %shl, %shr.1
+  %arrayidx2.2 = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 4
+  %shr.2 = lshr i64 undef, 6
+  %add.2 = add nuw nsw i64 %shl.1, %shr.2
+  %and.4 = shl i64 %add, 2
+  %shl.4 = and i64 %and.4, 20
+  %arrayidx2.5 = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 1
+  store i64 %add.1, i64* %arrayidx2.5, align 1
+  %and.5 = shl nuw nsw i64 %add.1, 2
+  %shl.5 = and i64 %and.5, 20
+  %shr.5 = lshr i64 %add.1, 6
+  %add.5 = add nuw nsw i64 %shl.4, %shr.5
+  store i64 %add.5, i64* %arrayidx2.1, align 1
+  %arrayidx2.6 = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 0
+  store i64 %add.2, i64* %arrayidx2.6, align 1
+  %shr.6 = lshr i64 %add.2, 6
+  %add.6 = add nuw nsw i64 %shl.5, %shr.6
+  store i64 %add.6, i64* %arrayidx2.2, align 1
+  ret void
+}

Added: llvm/trunk/test/Transforms/SLPVectorizer/X86/propagate_ir_flags.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/propagate_ir_flags.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/propagate_ir_flags.ll (added)
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/propagate_ir_flags.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,607 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -basicaa -slp-vectorizer -S | FileCheck %s
+
+; Check propagation of optional IR flags (PR20802). For a flag to
+; propagate from scalar instructions to their vector replacement,
+; *all* scalar instructions must have the flag.
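+; For instance (illustrative sketch, not one of the tests below): four
+;   %op = lshr exact i32 %load, 1
+; lanes become "lshr exact <4 x i32>" only when every lane carries "exact";
+; in @not_exact a single lane lacks the flag, so the vectorized lshr is
+; emitted without it.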
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-unknown"
+
+define void @exact(i32* %x) {
+; CHECK-LABEL: @exact(
+; CHECK-NEXT:    [[IDX1:%.*]] = getelementptr inbounds i32, i32* [[X:%.*]], i64 0
+; CHECK-NEXT:    [[IDX2:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 1
+; CHECK-NEXT:    [[IDX3:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 2
+; CHECK-NEXT:    [[IDX4:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 3
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[IDX1]] to <4 x i32>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = lshr exact <4 x i32> [[TMP2]], <i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32* [[IDX1]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[TMP3]], <4 x i32>* [[TMP4]], align 4
+; CHECK-NEXT:    ret void
+;
+  %idx1 = getelementptr inbounds i32, i32* %x, i64 0
+  %idx2 = getelementptr inbounds i32, i32* %x, i64 1
+  %idx3 = getelementptr inbounds i32, i32* %x, i64 2
+  %idx4 = getelementptr inbounds i32, i32* %x, i64 3
+
+  %load1 = load i32, i32* %idx1, align 4
+  %load2 = load i32, i32* %idx2, align 4
+  %load3 = load i32, i32* %idx3, align 4
+  %load4 = load i32, i32* %idx4, align 4
+
+  %op1 = lshr exact i32 %load1, 1
+  %op2 = lshr exact i32 %load2, 1
+  %op3 = lshr exact i32 %load3, 1
+  %op4 = lshr exact i32 %load4, 1
+
+  store i32 %op1, i32* %idx1, align 4
+  store i32 %op2, i32* %idx2, align 4
+  store i32 %op3, i32* %idx3, align 4
+  store i32 %op4, i32* %idx4, align 4
+
+  ret void
+}
+
+define void @not_exact(i32* %x) {
+; CHECK-LABEL: @not_exact(
+; CHECK-NEXT:    [[IDX1:%.*]] = getelementptr inbounds i32, i32* [[X:%.*]], i64 0
+; CHECK-NEXT:    [[IDX2:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 1
+; CHECK-NEXT:    [[IDX3:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 2
+; CHECK-NEXT:    [[IDX4:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 3
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[IDX1]] to <4 x i32>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = lshr <4 x i32> [[TMP2]], <i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32* [[IDX1]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[TMP3]], <4 x i32>* [[TMP4]], align 4
+; CHECK-NEXT:    ret void
+;
+  %idx1 = getelementptr inbounds i32, i32* %x, i64 0
+  %idx2 = getelementptr inbounds i32, i32* %x, i64 1
+  %idx3 = getelementptr inbounds i32, i32* %x, i64 2
+  %idx4 = getelementptr inbounds i32, i32* %x, i64 3
+
+  %load1 = load i32, i32* %idx1, align 4
+  %load2 = load i32, i32* %idx2, align 4
+  %load3 = load i32, i32* %idx3, align 4
+  %load4 = load i32, i32* %idx4, align 4
+
+  %op1 = lshr exact i32 %load1, 1
+  %op2 = lshr i32 %load2, 1
+  %op3 = lshr exact i32 %load3, 1
+  %op4 = lshr exact i32 %load4, 1
+
+  store i32 %op1, i32* %idx1, align 4
+  store i32 %op2, i32* %idx2, align 4
+  store i32 %op3, i32* %idx3, align 4
+  store i32 %op4, i32* %idx4, align 4
+
+  ret void
+}
+
+define void @nsw(i32* %x) {
+; CHECK-LABEL: @nsw(
+; CHECK-NEXT:    [[IDX1:%.*]] = getelementptr inbounds i32, i32* [[X:%.*]], i64 0
+; CHECK-NEXT:    [[IDX2:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 1
+; CHECK-NEXT:    [[IDX3:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 2
+; CHECK-NEXT:    [[IDX4:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 3
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[IDX1]] to <4 x i32>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = add nsw <4 x i32> [[TMP2]], <i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32* [[IDX1]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[TMP3]], <4 x i32>* [[TMP4]], align 4
+; CHECK-NEXT:    ret void
+;
+  %idx1 = getelementptr inbounds i32, i32* %x, i64 0
+  %idx2 = getelementptr inbounds i32, i32* %x, i64 1
+  %idx3 = getelementptr inbounds i32, i32* %x, i64 2
+  %idx4 = getelementptr inbounds i32, i32* %x, i64 3
+
+  %load1 = load i32, i32* %idx1, align 4
+  %load2 = load i32, i32* %idx2, align 4
+  %load3 = load i32, i32* %idx3, align 4
+  %load4 = load i32, i32* %idx4, align 4
+
+  %op1 = add nsw i32 %load1, 1
+  %op2 = add nsw i32 %load2, 1
+  %op3 = add nsw i32 %load3, 1
+  %op4 = add nsw i32 %load4, 1
+
+  store i32 %op1, i32* %idx1, align 4
+  store i32 %op2, i32* %idx2, align 4
+  store i32 %op3, i32* %idx3, align 4
+  store i32 %op4, i32* %idx4, align 4
+
+  ret void
+}
+
+define void @not_nsw(i32* %x) {
+; CHECK-LABEL: @not_nsw(
+; CHECK-NEXT:    [[IDX1:%.*]] = getelementptr inbounds i32, i32* [[X:%.*]], i64 0
+; CHECK-NEXT:    [[IDX2:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 1
+; CHECK-NEXT:    [[IDX3:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 2
+; CHECK-NEXT:    [[IDX4:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 3
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[IDX1]] to <4 x i32>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = add <4 x i32> [[TMP2]], <i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32* [[IDX1]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[TMP3]], <4 x i32>* [[TMP4]], align 4
+; CHECK-NEXT:    ret void
+;
+  %idx1 = getelementptr inbounds i32, i32* %x, i64 0
+  %idx2 = getelementptr inbounds i32, i32* %x, i64 1
+  %idx3 = getelementptr inbounds i32, i32* %x, i64 2
+  %idx4 = getelementptr inbounds i32, i32* %x, i64 3
+
+  %load1 = load i32, i32* %idx1, align 4
+  %load2 = load i32, i32* %idx2, align 4
+  %load3 = load i32, i32* %idx3, align 4
+  %load4 = load i32, i32* %idx4, align 4
+
+  %op1 = add nsw i32 %load1, 1
+  %op2 = add nsw i32 %load2, 1
+  %op3 = add nsw i32 %load3, 1
+  %op4 = add i32 %load4, 1
+
+  store i32 %op1, i32* %idx1, align 4
+  store i32 %op2, i32* %idx2, align 4
+  store i32 %op3, i32* %idx3, align 4
+  store i32 %op4, i32* %idx4, align 4
+
+  ret void
+}
+
+define void @nuw(i32* %x) {
+; CHECK-LABEL: @nuw(
+; CHECK-NEXT:    [[IDX1:%.*]] = getelementptr inbounds i32, i32* [[X:%.*]], i64 0
+; CHECK-NEXT:    [[IDX2:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 1
+; CHECK-NEXT:    [[IDX3:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 2
+; CHECK-NEXT:    [[IDX4:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 3
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[IDX1]] to <4 x i32>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = add nuw <4 x i32> [[TMP2]], <i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32* [[IDX1]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[TMP3]], <4 x i32>* [[TMP4]], align 4
+; CHECK-NEXT:    ret void
+;
+  %idx1 = getelementptr inbounds i32, i32* %x, i64 0
+  %idx2 = getelementptr inbounds i32, i32* %x, i64 1
+  %idx3 = getelementptr inbounds i32, i32* %x, i64 2
+  %idx4 = getelementptr inbounds i32, i32* %x, i64 3
+
+  %load1 = load i32, i32* %idx1, align 4
+  %load2 = load i32, i32* %idx2, align 4
+  %load3 = load i32, i32* %idx3, align 4
+  %load4 = load i32, i32* %idx4, align 4
+
+  %op1 = add nuw i32 %load1, 1
+  %op2 = add nuw i32 %load2, 1
+  %op3 = add nuw i32 %load3, 1
+  %op4 = add nuw i32 %load4, 1
+
+  store i32 %op1, i32* %idx1, align 4
+  store i32 %op2, i32* %idx2, align 4
+  store i32 %op3, i32* %idx3, align 4
+  store i32 %op4, i32* %idx4, align 4
+
+  ret void
+}
+
+define void @not_nuw(i32* %x) {
+; CHECK-LABEL: @not_nuw(
+; CHECK-NEXT:    [[IDX1:%.*]] = getelementptr inbounds i32, i32* [[X:%.*]], i64 0
+; CHECK-NEXT:    [[IDX2:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 1
+; CHECK-NEXT:    [[IDX3:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 2
+; CHECK-NEXT:    [[IDX4:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 3
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[IDX1]] to <4 x i32>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = add <4 x i32> [[TMP2]], <i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32* [[IDX1]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[TMP3]], <4 x i32>* [[TMP4]], align 4
+; CHECK-NEXT:    ret void
+;
+  %idx1 = getelementptr inbounds i32, i32* %x, i64 0
+  %idx2 = getelementptr inbounds i32, i32* %x, i64 1
+  %idx3 = getelementptr inbounds i32, i32* %x, i64 2
+  %idx4 = getelementptr inbounds i32, i32* %x, i64 3
+
+  %load1 = load i32, i32* %idx1, align 4
+  %load2 = load i32, i32* %idx2, align 4
+  %load3 = load i32, i32* %idx3, align 4
+  %load4 = load i32, i32* %idx4, align 4
+
+  %op1 = add nuw i32 %load1, 1
+  %op2 = add i32 %load2, 1
+  %op3 = add i32 %load3, 1
+  %op4 = add nuw i32 %load4, 1
+
+  store i32 %op1, i32* %idx1, align 4
+  store i32 %op2, i32* %idx2, align 4
+  store i32 %op3, i32* %idx3, align 4
+  store i32 %op4, i32* %idx4, align 4
+
+  ret void
+}
+
+define void @nnan(float* %x) {
+; CHECK-LABEL: @nnan(
+; CHECK-NEXT:    [[IDX1:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i64 0
+; CHECK-NEXT:    [[IDX2:%.*]] = getelementptr inbounds float, float* [[X]], i64 1
+; CHECK-NEXT:    [[IDX3:%.*]] = getelementptr inbounds float, float* [[X]], i64 2
+; CHECK-NEXT:    [[IDX4:%.*]] = getelementptr inbounds float, float* [[X]], i64 3
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[IDX1]] to <4 x float>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = fadd nnan <4 x float> [[TMP2]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast float* [[IDX1]] to <4 x float>*
+; CHECK-NEXT:    store <4 x float> [[TMP3]], <4 x float>* [[TMP4]], align 4
+; CHECK-NEXT:    ret void
+;
+  %idx1 = getelementptr inbounds float, float* %x, i64 0
+  %idx2 = getelementptr inbounds float, float* %x, i64 1
+  %idx3 = getelementptr inbounds float, float* %x, i64 2
+  %idx4 = getelementptr inbounds float, float* %x, i64 3
+
+  %load1 = load float, float* %idx1, align 4
+  %load2 = load float, float* %idx2, align 4
+  %load3 = load float, float* %idx3, align 4
+  %load4 = load float, float* %idx4, align 4
+
+  %op1 = fadd fast nnan float %load1, 1.0
+  %op2 = fadd nnan ninf float %load2, 1.0
+  %op3 = fadd nsz nnan float %load3, 1.0
+  %op4 = fadd arcp nnan float %load4, 1.0
+
+  store float %op1, float* %idx1, align 4
+  store float %op2, float* %idx2, align 4
+  store float %op3, float* %idx3, align 4
+  store float %op4, float* %idx4, align 4
+
+  ret void
+}
+
+define void @not_nnan(float* %x) {
+; CHECK-LABEL: @not_nnan(
+; CHECK-NEXT:    [[IDX1:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i64 0
+; CHECK-NEXT:    [[IDX2:%.*]] = getelementptr inbounds float, float* [[X]], i64 1
+; CHECK-NEXT:    [[IDX3:%.*]] = getelementptr inbounds float, float* [[X]], i64 2
+; CHECK-NEXT:    [[IDX4:%.*]] = getelementptr inbounds float, float* [[X]], i64 3
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[IDX1]] to <4 x float>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = fadd <4 x float> [[TMP2]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast float* [[IDX1]] to <4 x float>*
+; CHECK-NEXT:    store <4 x float> [[TMP3]], <4 x float>* [[TMP4]], align 4
+; CHECK-NEXT:    ret void
+;
+  %idx1 = getelementptr inbounds float, float* %x, i64 0
+  %idx2 = getelementptr inbounds float, float* %x, i64 1
+  %idx3 = getelementptr inbounds float, float* %x, i64 2
+  %idx4 = getelementptr inbounds float, float* %x, i64 3
+
+  %load1 = load float, float* %idx1, align 4
+  %load2 = load float, float* %idx2, align 4
+  %load3 = load float, float* %idx3, align 4
+  %load4 = load float, float* %idx4, align 4
+
+  %op1 = fadd nnan float %load1, 1.0
+  %op2 = fadd ninf float %load2, 1.0
+  %op3 = fadd nsz float %load3, 1.0
+  %op4 = fadd arcp float %load4, 1.0
+
+  store float %op1, float* %idx1, align 4
+  store float %op2, float* %idx2, align 4
+  store float %op3, float* %idx3, align 4
+  store float %op4, float* %idx4, align 4
+
+  ret void
+}
+
+define void @only_fast(float* %x) {
+; CHECK-LABEL: @only_fast(
+; CHECK-NEXT:    [[IDX1:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i64 0
+; CHECK-NEXT:    [[IDX2:%.*]] = getelementptr inbounds float, float* [[X]], i64 1
+; CHECK-NEXT:    [[IDX3:%.*]] = getelementptr inbounds float, float* [[X]], i64 2
+; CHECK-NEXT:    [[IDX4:%.*]] = getelementptr inbounds float, float* [[X]], i64 3
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[IDX1]] to <4 x float>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = fadd fast <4 x float> [[TMP2]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast float* [[IDX1]] to <4 x float>*
+; CHECK-NEXT:    store <4 x float> [[TMP3]], <4 x float>* [[TMP4]], align 4
+; CHECK-NEXT:    ret void
+;
+  %idx1 = getelementptr inbounds float, float* %x, i64 0
+  %idx2 = getelementptr inbounds float, float* %x, i64 1
+  %idx3 = getelementptr inbounds float, float* %x, i64 2
+  %idx4 = getelementptr inbounds float, float* %x, i64 3
+
+  %load1 = load float, float* %idx1, align 4
+  %load2 = load float, float* %idx2, align 4
+  %load3 = load float, float* %idx3, align 4
+  %load4 = load float, float* %idx4, align 4
+
+  %op1 = fadd fast nnan float %load1, 1.0
+  %op2 = fadd fast nnan ninf float %load2, 1.0
+  %op3 = fadd fast nsz nnan float %load3, 1.0
+  %op4 = fadd arcp nnan fast float %load4, 1.0
+
+  store float %op1, float* %idx1, align 4
+  store float %op2, float* %idx2, align 4
+  store float %op3, float* %idx3, align 4
+  store float %op4, float* %idx4, align 4
+
+  ret void
+}
+
+define void @only_arcp(float* %x) {
+; CHECK-LABEL: @only_arcp(
+; CHECK-NEXT:    [[IDX1:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i64 0
+; CHECK-NEXT:    [[IDX2:%.*]] = getelementptr inbounds float, float* [[X]], i64 1
+; CHECK-NEXT:    [[IDX3:%.*]] = getelementptr inbounds float, float* [[X]], i64 2
+; CHECK-NEXT:    [[IDX4:%.*]] = getelementptr inbounds float, float* [[X]], i64 3
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[IDX1]] to <4 x float>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = fadd arcp <4 x float> [[TMP2]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast float* [[IDX1]] to <4 x float>*
+; CHECK-NEXT:    store <4 x float> [[TMP3]], <4 x float>* [[TMP4]], align 4
+; CHECK-NEXT:    ret void
+;
+  %idx1 = getelementptr inbounds float, float* %x, i64 0
+  %idx2 = getelementptr inbounds float, float* %x, i64 1
+  %idx3 = getelementptr inbounds float, float* %x, i64 2
+  %idx4 = getelementptr inbounds float, float* %x, i64 3
+
+  %load1 = load float, float* %idx1, align 4
+  %load2 = load float, float* %idx2, align 4
+  %load3 = load float, float* %idx3, align 4
+  %load4 = load float, float* %idx4, align 4
+
+  %op1 = fadd fast float %load1, 1.0
+  %op2 = fadd fast float %load2, 1.0
+  %op3 = fadd fast float %load3, 1.0
+  %op4 = fadd arcp float %load4, 1.0
+
+  store float %op1, float* %idx1, align 4
+  store float %op2, float* %idx2, align 4
+  store float %op3, float* %idx3, align 4
+  store float %op4, float* %idx4, align 4
+
+  ret void
+}
+
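+; Every add and sub in @addsub_all_nsw is 'nsw', so both vector operations
+; keep the flag.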
+define void @addsub_all_nsw(i32* %x) {
+; CHECK-LABEL: @addsub_all_nsw(
+; CHECK-NEXT:    [[IDX1:%.*]] = getelementptr inbounds i32, i32* [[X:%.*]], i64 0
+; CHECK-NEXT:    [[IDX2:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 1
+; CHECK-NEXT:    [[IDX3:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 2
+; CHECK-NEXT:    [[IDX4:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 3
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[IDX1]] to <4 x i32>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = add nsw <4 x i32> [[TMP2]], <i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT:    [[TMP4:%.*]] = sub nsw <4 x i32> [[TMP2]], <i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i32* [[IDX1]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4
+; CHECK-NEXT:    ret void
+;
+  %idx1 = getelementptr inbounds i32, i32* %x, i64 0
+  %idx2 = getelementptr inbounds i32, i32* %x, i64 1
+  %idx3 = getelementptr inbounds i32, i32* %x, i64 2
+  %idx4 = getelementptr inbounds i32, i32* %x, i64 3
+
+  %load1 = load i32, i32* %idx1, align 4
+  %load2 = load i32, i32* %idx2, align 4
+  %load3 = load i32, i32* %idx3, align 4
+  %load4 = load i32, i32* %idx4, align 4
+
+  %op1 = add nsw i32 %load1, 1
+  %op2 = sub nsw i32 %load2, 1
+  %op3 = add nsw i32 %load3, 1
+  %op4 = sub nsw i32 %load4, 1
+
+  store i32 %op1, i32* %idx1, align 4
+  store i32 %op2, i32* %idx2, align 4
+  store i32 %op3, i32* %idx3, align 4
+  store i32 %op4, i32* %idx4, align 4
+
+  ret void
+}
+
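+; In @addsub_some_nsw the last sub lacks 'nsw', so the vector sub must drop
+; it, while the vector add (both scalar adds are 'nsw') keeps it.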
+define void @addsub_some_nsw(i32* %x) {
+; CHECK-LABEL: @addsub_some_nsw(
+; CHECK-NEXT:    [[IDX1:%.*]] = getelementptr inbounds i32, i32* [[X:%.*]], i64 0
+; CHECK-NEXT:    [[IDX2:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 1
+; CHECK-NEXT:    [[IDX3:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 2
+; CHECK-NEXT:    [[IDX4:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 3
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[IDX1]] to <4 x i32>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = add nsw <4 x i32> [[TMP2]], <i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT:    [[TMP4:%.*]] = sub <4 x i32> [[TMP2]], <i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i32* [[IDX1]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4
+; CHECK-NEXT:    ret void
+;
+  %idx1 = getelementptr inbounds i32, i32* %x, i64 0
+  %idx2 = getelementptr inbounds i32, i32* %x, i64 1
+  %idx3 = getelementptr inbounds i32, i32* %x, i64 2
+  %idx4 = getelementptr inbounds i32, i32* %x, i64 3
+
+  %load1 = load i32, i32* %idx1, align 4
+  %load2 = load i32, i32* %idx2, align 4
+  %load3 = load i32, i32* %idx3, align 4
+  %load4 = load i32, i32* %idx4, align 4
+
+  %op1 = add nsw i32 %load1, 1
+  %op2 = sub nsw i32 %load2, 1
+  %op3 = add nsw i32 %load3, 1
+  %op4 = sub i32 %load4, 1
+
+  store i32 %op1, i32* %idx1, align 4
+  store i32 %op2, i32* %idx2, align 4
+  store i32 %op3, i32* %idx3, align 4
+  store i32 %op4, i32* %idx4, align 4
+
+  ret void
+}
+
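+; In @addsub_no_nsw the first add and the last sub lack 'nsw', so neither
+; vector operation may carry it.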
+define void @addsub_no_nsw(i32* %x) {
+; CHECK-LABEL: @addsub_no_nsw(
+; CHECK-NEXT:    [[IDX1:%.*]] = getelementptr inbounds i32, i32* [[X:%.*]], i64 0
+; CHECK-NEXT:    [[IDX2:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 1
+; CHECK-NEXT:    [[IDX3:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 2
+; CHECK-NEXT:    [[IDX4:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 3
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[IDX1]] to <4 x i32>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = add <4 x i32> [[TMP2]], <i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT:    [[TMP4:%.*]] = sub <4 x i32> [[TMP2]], <i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i32* [[IDX1]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4
+; CHECK-NEXT:    ret void
+;
+  %idx1 = getelementptr inbounds i32, i32* %x, i64 0
+  %idx2 = getelementptr inbounds i32, i32* %x, i64 1
+  %idx3 = getelementptr inbounds i32, i32* %x, i64 2
+  %idx4 = getelementptr inbounds i32, i32* %x, i64 3
+
+  %load1 = load i32, i32* %idx1, align 4
+  %load2 = load i32, i32* %idx2, align 4
+  %load3 = load i32, i32* %idx3, align 4
+  %load4 = load i32, i32* %idx4, align 4
+
+  %op1 = add i32 %load1, 1
+  %op2 = sub nsw i32 %load2, 1
+  %op3 = add nsw i32 %load3, 1
+  %op4 = sub i32 %load4, 1
+
+  store i32 %op1, i32* %idx1, align 4
+  store i32 %op2, i32* %idx2, align 4
+  store i32 %op3, i32* %idx3, align 4
+  store i32 %op4, i32* %idx4, align 4
+
+  ret void
+}
+
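+; @fcmp_fast implements fabs by hand as select(x >= 0, x, -x); both fcmps and
+; both fsubs are 'fast', so the vectorized compare and subtract keep the flag.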
+define void @fcmp_fast(double* %x) #1 {
+; CHECK-LABEL: @fcmp_fast(
+; CHECK-NEXT:    [[IDX1:%.*]] = getelementptr inbounds double, double* [[X:%.*]], i64 0
+; CHECK-NEXT:    [[IDX2:%.*]] = getelementptr inbounds double, double* [[X]], i64 1
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast double* [[IDX1]] to <2 x double>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = fcmp fast oge <2 x double> [[TMP2]], zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = fsub fast <2 x double> <double -0.000000e+00, double -0.000000e+00>, [[TMP2]]
+; CHECK-NEXT:    [[TMP5:%.*]] = select <2 x i1> [[TMP3]], <2 x double> [[TMP2]], <2 x double> [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast double* [[IDX1]] to <2 x double>*
+; CHECK-NEXT:    store <2 x double> [[TMP5]], <2 x double>* [[TMP6]], align 8
+; CHECK-NEXT:    ret void
+;
+  %idx1 = getelementptr inbounds double, double* %x, i64 0
+  %idx2 = getelementptr inbounds double, double* %x, i64 1
+
+  %load1 = load double, double* %idx1, align 8
+  %load2 = load double, double* %idx2, align 8
+
+  %cmp1 = fcmp fast oge double %load1, 0.000000e+00
+  %cmp2 = fcmp fast oge double %load2, 0.000000e+00
+
+  %sub1 = fsub fast double -0.000000e+00, %load1
+  %sub2 = fsub fast double -0.000000e+00, %load2
+
+  %sel1 = select i1 %cmp1, double %load1, double %sub1
+  %sel2 = select i1 %cmp2, double %load2, double %sub2
+
+  store double %sel1, double* %idx1, align 8
+  store double %sel2, double* %idx2, align 8
+
+  ret void
+}
+
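+; Same fabs idiom as above, but the second fcmp and fsub lack 'fast', so the
+; vectorized operations must not carry it.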
+define void @fcmp_no_fast(double* %x) #1 {
+; CHECK-LABEL: @fcmp_no_fast(
+; CHECK-NEXT:    [[IDX1:%.*]] = getelementptr inbounds double, double* [[X:%.*]], i64 0
+; CHECK-NEXT:    [[IDX2:%.*]] = getelementptr inbounds double, double* [[X]], i64 1
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast double* [[IDX1]] to <2 x double>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = fcmp oge <2 x double> [[TMP2]], zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, [[TMP2]]
+; CHECK-NEXT:    [[TMP5:%.*]] = select <2 x i1> [[TMP3]], <2 x double> [[TMP2]], <2 x double> [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast double* [[IDX1]] to <2 x double>*
+; CHECK-NEXT:    store <2 x double> [[TMP5]], <2 x double>* [[TMP6]], align 8
+; CHECK-NEXT:    ret void
+;
+  %idx1 = getelementptr inbounds double, double* %x, i64 0
+  %idx2 = getelementptr inbounds double, double* %x, i64 1
+
+  %load1 = load double, double* %idx1, align 8
+  %load2 = load double, double* %idx2, align 8
+
+  %cmp1 = fcmp fast oge double %load1, 0.000000e+00
+  %cmp2 = fcmp oge double %load2, 0.000000e+00
+
+  %sub1 = fsub fast double -0.000000e+00, %load1
+  %sub2 = fsub double -0.000000e+00, %load2
+
+  %sel1 = select i1 %cmp1, double %load1, double %sub1
+  %sel2 = select i1 %cmp2, double %load2, double %sub2
+
+  store double %sel1, double* %idx1, align 8
+  store double %sel2, double* %idx2, align 8
+
+  ret void
+}
+
+declare double @llvm.fabs.f64(double) nounwind readnone
+
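+; Both scalar fabs calls are 'fast', so the vectorized @llvm.fabs.v2f64 call
+; keeps the flag.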
+define void @call_fast(double* %x) {
+; CHECK-LABEL: @call_fast(
+; CHECK-NEXT:    [[IDX1:%.*]] = getelementptr inbounds double, double* [[X:%.*]], i64 0
+; CHECK-NEXT:    [[IDX2:%.*]] = getelementptr inbounds double, double* [[X]], i64 1
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast double* [[IDX1]] to <2 x double>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = call fast <2 x double> @llvm.fabs.v2f64(<2 x double> [[TMP2]])
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast double* [[IDX1]] to <2 x double>*
+; CHECK-NEXT:    store <2 x double> [[TMP3]], <2 x double>* [[TMP4]], align 8
+; CHECK-NEXT:    ret void
+;
+  %idx1 = getelementptr inbounds double, double* %x, i64 0
+  %idx2 = getelementptr inbounds double, double* %x, i64 1
+
+  %load1 = load double, double* %idx1, align 8
+  %load2 = load double, double* %idx2, align 8
+
+  %call1 = tail call fast double @llvm.fabs.f64(double %load1) nounwind readnone
+  %call2 = tail call fast double @llvm.fabs.f64(double %load2) nounwind readnone
+
+  store double %call1, double* %idx1, align 8
+  store double %call2, double* %idx2, align 8
+
+  ret void
+}
+
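+; The second fabs call lacks 'fast', so the flag must not survive
+; vectorization.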
+define void @call_no_fast(double* %x) {
+; CHECK-LABEL: @call_no_fast(
+; CHECK-NEXT:    [[IDX1:%.*]] = getelementptr inbounds double, double* [[X:%.*]], i64 0
+; CHECK-NEXT:    [[IDX2:%.*]] = getelementptr inbounds double, double* [[X]], i64 1
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast double* [[IDX1]] to <2 x double>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = call <2 x double> @llvm.fabs.v2f64(<2 x double> [[TMP2]])
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast double* [[IDX1]] to <2 x double>*
+; CHECK-NEXT:    store <2 x double> [[TMP3]], <2 x double>* [[TMP4]], align 8
+; CHECK-NEXT:    ret void
+;
+  %idx1 = getelementptr inbounds double, double* %x, i64 0
+  %idx2 = getelementptr inbounds double, double* %x, i64 1
+
+  %load1 = load double, double* %idx1, align 8
+  %load2 = load double, double* %idx2, align 8
+
+  %call1 = tail call fast double @llvm.fabs.f64(double %load1) nounwind readnone
+  %call2 = tail call double @llvm.fabs.f64(double %load2) nounwind readnone
+
+  store double %call1, double* %idx1, align 8
+  store double %call2, double* %idx2, align 8
+
+  ret void
+}
+
+attributes #1 = { "target-features"="+avx" }

Added: llvm/trunk/test/Transforms/SLPVectorizer/X86/reassociated-loads.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/reassociated-loads.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/reassociated-loads.ll (added)
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/reassociated-loads.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,118 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -reassociate -slp-vectorizer -slp-vectorize-hor -slp-vectorize-hor-store -S < %s -mtriple=x86_64-apple-macosx -mcpu=corei7-avx -mattr=+avx2 | FileCheck %s
+
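+; @Foo sums all 32 lanes of a <32 x i8> via a chain of extractelement/add.
+; After -reassociate, the SLP vectorizer should recognize the chain as a
+; horizontal reduction and emit the log2(32) = 5 shuffle/add steps matched by
+; the CHECK lines.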
+define signext i8 @Foo(<32 x i8>* %__v) {
+; CHECK-LABEL: @Foo(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load <32 x i8>, <32 x i8>* [[__V:%.*]], align 32
+; CHECK-NEXT:    [[ADD_I_1_I:%.*]] = add i8 undef, undef
+; CHECK-NEXT:    [[ADD_I_2_I:%.*]] = add i8 [[ADD_I_1_I]], undef
+; CHECK-NEXT:    [[ADD_I_3_I:%.*]] = add i8 [[ADD_I_2_I]], undef
+; CHECK-NEXT:    [[ADD_I_4_I:%.*]] = add i8 [[ADD_I_3_I]], undef
+; CHECK-NEXT:    [[ADD_I_5_I:%.*]] = add i8 [[ADD_I_4_I]], undef
+; CHECK-NEXT:    [[ADD_I_6_I:%.*]] = add i8 [[ADD_I_5_I]], undef
+; CHECK-NEXT:    [[ADD_I_7_I:%.*]] = add i8 [[ADD_I_6_I]], undef
+; CHECK-NEXT:    [[ADD_I_8_I:%.*]] = add i8 [[ADD_I_7_I]], undef
+; CHECK-NEXT:    [[ADD_I_9_I:%.*]] = add i8 [[ADD_I_8_I]], undef
+; CHECK-NEXT:    [[ADD_I_10_I:%.*]] = add i8 [[ADD_I_9_I]], undef
+; CHECK-NEXT:    [[ADD_I_11_I:%.*]] = add i8 [[ADD_I_10_I]], undef
+; CHECK-NEXT:    [[ADD_I_12_I:%.*]] = add i8 [[ADD_I_11_I]], undef
+; CHECK-NEXT:    [[ADD_I_13_I:%.*]] = add i8 [[ADD_I_12_I]], undef
+; CHECK-NEXT:    [[ADD_I_14_I:%.*]] = add i8 [[ADD_I_13_I]], undef
+; CHECK-NEXT:    [[ADD_I_15_I:%.*]] = add i8 [[ADD_I_14_I]], undef
+; CHECK-NEXT:    [[ADD_I_16_I:%.*]] = add i8 [[ADD_I_15_I]], undef
+; CHECK-NEXT:    [[ADD_I_17_I:%.*]] = add i8 [[ADD_I_16_I]], undef
+; CHECK-NEXT:    [[ADD_I_18_I:%.*]] = add i8 [[ADD_I_17_I]], undef
+; CHECK-NEXT:    [[ADD_I_19_I:%.*]] = add i8 [[ADD_I_18_I]], undef
+; CHECK-NEXT:    [[ADD_I_20_I:%.*]] = add i8 [[ADD_I_19_I]], undef
+; CHECK-NEXT:    [[ADD_I_21_I:%.*]] = add i8 [[ADD_I_20_I]], undef
+; CHECK-NEXT:    [[ADD_I_22_I:%.*]] = add i8 [[ADD_I_21_I]], undef
+; CHECK-NEXT:    [[ADD_I_23_I:%.*]] = add i8 [[ADD_I_22_I]], undef
+; CHECK-NEXT:    [[ADD_I_24_I:%.*]] = add i8 [[ADD_I_23_I]], undef
+; CHECK-NEXT:    [[ADD_I_25_I:%.*]] = add i8 [[ADD_I_24_I]], undef
+; CHECK-NEXT:    [[ADD_I_26_I:%.*]] = add i8 [[ADD_I_25_I]], undef
+; CHECK-NEXT:    [[ADD_I_27_I:%.*]] = add i8 [[ADD_I_26_I]], undef
+; CHECK-NEXT:    [[ADD_I_28_I:%.*]] = add i8 [[ADD_I_27_I]], undef
+; CHECK-NEXT:    [[ADD_I_29_I:%.*]] = add i8 [[ADD_I_28_I]], undef
+; CHECK-NEXT:    [[ADD_I_30_I:%.*]] = add i8 [[ADD_I_29_I]], undef
+; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <32 x i8> [[TMP0]], <32 x i8> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = add <32 x i8> [[TMP0]], [[RDX_SHUF]]
+; CHECK-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <32 x i8> [[BIN_RDX]], <32 x i8> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX2:%.*]] = add <32 x i8> [[BIN_RDX]], [[RDX_SHUF1]]
+; CHECK-NEXT:    [[RDX_SHUF3:%.*]] = shufflevector <32 x i8> [[BIN_RDX2]], <32 x i8> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX4:%.*]] = add <32 x i8> [[BIN_RDX2]], [[RDX_SHUF3]]
+; CHECK-NEXT:    [[RDX_SHUF5:%.*]] = shufflevector <32 x i8> [[BIN_RDX4]], <32 x i8> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX6:%.*]] = add <32 x i8> [[BIN_RDX4]], [[RDX_SHUF5]]
+; CHECK-NEXT:    [[RDX_SHUF7:%.*]] = shufflevector <32 x i8> [[BIN_RDX6]], <32 x i8> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX8:%.*]] = add <32 x i8> [[BIN_RDX6]], [[RDX_SHUF7]]
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <32 x i8> [[BIN_RDX8]], i32 0
+; CHECK-NEXT:    [[ADD_I_31_I:%.*]] = add i8 [[ADD_I_30_I]], undef
+; CHECK-NEXT:    ret i8 [[TMP1]]
+;
+entry:
+  %0 = load <32 x i8>, <32 x i8>* %__v, align 32
+  %vecext.i.i.i = extractelement <32 x i8> %0, i64 0
+  %vecext.i.i.1.i = extractelement <32 x i8> %0, i64 1
+  %add.i.1.i = add i8 %vecext.i.i.1.i, %vecext.i.i.i
+  %vecext.i.i.2.i = extractelement <32 x i8> %0, i64 2
+  %add.i.2.i = add i8 %vecext.i.i.2.i, %add.i.1.i
+  %vecext.i.i.3.i = extractelement <32 x i8> %0, i64 3
+  %add.i.3.i = add i8 %vecext.i.i.3.i, %add.i.2.i
+  %vecext.i.i.4.i = extractelement <32 x i8> %0, i64 4
+  %add.i.4.i = add i8 %vecext.i.i.4.i, %add.i.3.i
+  %vecext.i.i.5.i = extractelement <32 x i8> %0, i64 5
+  %add.i.5.i = add i8 %vecext.i.i.5.i, %add.i.4.i
+  %vecext.i.i.6.i = extractelement <32 x i8> %0, i64 6
+  %add.i.6.i = add i8 %vecext.i.i.6.i, %add.i.5.i
+  %vecext.i.i.7.i = extractelement <32 x i8> %0, i64 7
+  %add.i.7.i = add i8 %vecext.i.i.7.i, %add.i.6.i
+  %vecext.i.i.8.i = extractelement <32 x i8> %0, i64 8
+  %add.i.8.i = add i8 %vecext.i.i.8.i, %add.i.7.i
+  %vecext.i.i.9.i = extractelement <32 x i8> %0, i64 9
+  %add.i.9.i = add i8 %vecext.i.i.9.i, %add.i.8.i
+  %vecext.i.i.10.i = extractelement <32 x i8> %0, i64 10
+  %add.i.10.i = add i8 %vecext.i.i.10.i, %add.i.9.i
+  %vecext.i.i.11.i = extractelement <32 x i8> %0, i64 11
+  %add.i.11.i = add i8 %vecext.i.i.11.i, %add.i.10.i
+  %vecext.i.i.12.i = extractelement <32 x i8> %0, i64 12
+  %add.i.12.i = add i8 %vecext.i.i.12.i, %add.i.11.i
+  %vecext.i.i.13.i = extractelement <32 x i8> %0, i64 13
+  %add.i.13.i = add i8 %vecext.i.i.13.i, %add.i.12.i
+  %vecext.i.i.14.i = extractelement <32 x i8> %0, i64 14
+  %add.i.14.i = add i8 %vecext.i.i.14.i, %add.i.13.i
+  %vecext.i.i.15.i = extractelement <32 x i8> %0, i64 15
+  %add.i.15.i = add i8 %vecext.i.i.15.i, %add.i.14.i
+  %vecext.i.i.16.i = extractelement <32 x i8> %0, i64 16
+  %add.i.16.i = add i8 %vecext.i.i.16.i, %add.i.15.i
+  %vecext.i.i.17.i = extractelement <32 x i8> %0, i64 17
+  %add.i.17.i = add i8 %vecext.i.i.17.i, %add.i.16.i
+  %vecext.i.i.18.i = extractelement <32 x i8> %0, i64 18
+  %add.i.18.i = add i8 %vecext.i.i.18.i, %add.i.17.i
+  %vecext.i.i.19.i = extractelement <32 x i8> %0, i64 19
+  %add.i.19.i = add i8 %vecext.i.i.19.i, %add.i.18.i
+  %vecext.i.i.20.i = extractelement <32 x i8> %0, i64 20
+  %add.i.20.i = add i8 %vecext.i.i.20.i, %add.i.19.i
+  %vecext.i.i.21.i = extractelement <32 x i8> %0, i64 21
+  %add.i.21.i = add i8 %vecext.i.i.21.i, %add.i.20.i
+  %vecext.i.i.22.i = extractelement <32 x i8> %0, i64 22
+  %add.i.22.i = add i8 %vecext.i.i.22.i, %add.i.21.i
+  %vecext.i.i.23.i = extractelement <32 x i8> %0, i64 23
+  %add.i.23.i = add i8 %vecext.i.i.23.i, %add.i.22.i
+  %vecext.i.i.24.i = extractelement <32 x i8> %0, i64 24
+  %add.i.24.i = add i8 %vecext.i.i.24.i, %add.i.23.i
+  %vecext.i.i.25.i = extractelement <32 x i8> %0, i64 25
+  %add.i.25.i = add i8 %vecext.i.i.25.i, %add.i.24.i
+  %vecext.i.i.26.i = extractelement <32 x i8> %0, i64 26
+  %add.i.26.i = add i8 %vecext.i.i.26.i, %add.i.25.i
+  %vecext.i.i.27.i = extractelement <32 x i8> %0, i64 27
+  %add.i.27.i = add i8 %vecext.i.i.27.i, %add.i.26.i
+  %vecext.i.i.28.i = extractelement <32 x i8> %0, i64 28
+  %add.i.28.i = add i8 %vecext.i.i.28.i, %add.i.27.i
+  %vecext.i.i.29.i = extractelement <32 x i8> %0, i64 29
+  %add.i.29.i = add i8 %vecext.i.i.29.i, %add.i.28.i
+  %vecext.i.i.30.i = extractelement <32 x i8> %0, i64 30
+  %add.i.30.i = add i8 %vecext.i.i.30.i, %add.i.29.i
+  %vecext.i.i.31.i = extractelement <32 x i8> %0, i64 31
+  %add.i.31.i = add i8 %vecext.i.i.31.i, %add.i.30.i
+  ret i8 %add.i.31.i
+}

Added: llvm/trunk/test/Transforms/SLPVectorizer/X86/reduction.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/reduction.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/reduction.ll (added)
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/reduction.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,70 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=i386-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32-S128"
+target triple = "i386-apple-macosx10.8.0"
+
+; int reduce(double *A, int n, int m) {
+;   double sum = 0;
+;   for (int i=0; i < n; ++i)
+;     sum += 7*A[i*2] + 7*A[i*2+1];
+;   return sum;
+; }
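+;
+; The loop body loads the adjacent pair A[i*2] and A[i*2+1], so the expected
+; vectorized shape is, as a sketch (the real value names are matched by the
+; CHECK lines below):
+;
+;   %v  = load <2 x double>, <2 x double>* %pair
+;   %m  = fmul <2 x double> %v, <double 7.0, double 7.0>
+;   %lo = extractelement <2 x double> %m, i32 0
+;   %hi = extractelement <2 x double> %m, i32 1
+;   %s  = fadd double %lo, %hi        ; horizontal add, then folded into sum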
+
+define i32 @reduce(double* nocapture %A, i32 %n, i32 %m) {
+; CHECK-LABEL: @reduce(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP13:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; CHECK-NEXT:    br i1 [[CMP13]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[I_015:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[SUM_014:%.*]] = phi double [ [[ADD6:%.*]], [[FOR_BODY]] ], [ 0.000000e+00, [[ENTRY]] ]
+; CHECK-NEXT:    [[MUL:%.*]] = shl nsw i32 [[I_015]], 1
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i32 [[MUL]]
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast double* [[ARRAYIDX]] to <2 x double>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = fmul <2 x double> [[TMP1]], <double 7.000000e+00, double 7.000000e+00>
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x double> [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x double> [[TMP2]], i32 1
+; CHECK-NEXT:    [[ADD5:%.*]] = fadd double [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    [[ADD6]] = fadd double [[SUM_014]], [[ADD5]]
+; CHECK-NEXT:    [[INC]] = add nsw i32 [[I_015]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_FOR_END_CRIT_EDGE:%.*]], label [[FOR_BODY]]
+; CHECK:       for.cond.for.end_crit_edge:
+; CHECK-NEXT:    [[PHITMP:%.*]] = fptosi double [[ADD6]] to i32
+; CHECK-NEXT:    br label [[FOR_END]]
+; CHECK:       for.end:
+; CHECK-NEXT:    [[SUM_0_LCSSA:%.*]] = phi i32 [ [[PHITMP]], [[FOR_COND_FOR_END_CRIT_EDGE]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT:    ret i32 [[SUM_0_LCSSA]]
+;
+entry:
+  %cmp13 = icmp sgt i32 %n, 0
+  br i1 %cmp13, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.015 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+  %sum.014 = phi double [ %add6, %for.body ], [ 0.000000e+00, %entry ]
+  %mul = shl nsw i32 %i.015, 1
+  %arrayidx = getelementptr inbounds double, double* %A, i32 %mul
+  %0 = load double, double* %arrayidx, align 4
+  %mul1 = fmul double %0, 7.000000e+00
+  %add12 = or i32 %mul, 1
+  %arrayidx3 = getelementptr inbounds double, double* %A, i32 %add12
+  %1 = load double, double* %arrayidx3, align 4
+  %mul4 = fmul double %1, 7.000000e+00
+  %add5 = fadd double %mul1, %mul4
+  %add6 = fadd double %sum.014, %add5
+  %inc = add nsw i32 %i.015, 1
+  %exitcond = icmp eq i32 %inc, %n
+  br i1 %exitcond, label %for.cond.for.end_crit_edge, label %for.body
+
+for.cond.for.end_crit_edge:                       ; preds = %for.body
+  %phitmp = fptosi double %add6 to i32
+  br label %for.end
+
+for.end:                                          ; preds = %for.cond.for.end_crit_edge, %entry
+  %sum.0.lcssa = phi i32 [ %phitmp, %for.cond.for.end_crit_edge ], [ 0, %entry ]
+  ret i32 %sum.0.lcssa
+}
+

Added: llvm/trunk/test/Transforms/SLPVectorizer/X86/reduction2.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/reduction2.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/reduction2.ll (added)
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/reduction2.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,51 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=i386-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32-S128"
+target triple = "i386-apple-macosx10.8.0"
+
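+; A C sketch of the kernel (names invented for illustration):
+;
+; double foo(double *D) {
+;   double sum = 0;
+;   for (int i = 0; i < 100; ++i) {
+;     double a = D[2*i], b = D[2*i+1];
+;     sum += a*a*a*a + b*b*b*b;
+;   }
+;   return sum;
+; }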
+define double @foo(double* nocapture %D) {
+; CHECK-LABEL: @foo(
+; CHECK-NEXT:    br label [[TMP1:%.*]]
+; CHECK:         [[I_02:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP12:%.*]], [[TMP1]] ]
+; CHECK-NEXT:    [[SUM_01:%.*]] = phi double [ 0.000000e+00, [[TMP0]] ], [ [[TMP11:%.*]], [[TMP1]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = shl nsw i32 [[I_02]], 1
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds double, double* [[D:%.*]], i32 [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast double* [[TMP3]] to <2 x double>*
+; CHECK-NEXT:    [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[TMP4]], align 4
+; CHECK-NEXT:    [[TMP6:%.*]] = fmul <2 x double> [[TMP5]], [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = fmul <2 x double> [[TMP6]], [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x double> [[TMP7]], i32 0
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x double> [[TMP7]], i32 1
+; CHECK-NEXT:    [[TMP10:%.*]] = fadd double [[TMP8]], [[TMP9]]
+; CHECK-NEXT:    [[TMP11]] = fadd double [[SUM_01]], [[TMP10]]
+; CHECK-NEXT:    [[TMP12]] = add nsw i32 [[I_02]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[TMP12]], 100
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[TMP13:%.*]], label [[TMP1]]
+; CHECK:         ret double [[TMP11]]
+;
+  br label %1
+
+; <label>:1                                       ; preds = %1, %0
+  %i.02 = phi i32 [ 0, %0 ], [ %10, %1 ]
+  %sum.01 = phi double [ 0.000000e+00, %0 ], [ %9, %1 ]
+  %2 = shl nsw i32 %i.02, 1
+  %3 = getelementptr inbounds double, double* %D, i32 %2
+  %4 = load double, double* %3, align 4
+  %A4 = fmul double %4, %4
+  %A42 = fmul double %A4, %A4
+  %5 = or i32 %2, 1
+  %6 = getelementptr inbounds double, double* %D, i32 %5
+  %7 = load double, double* %6, align 4
+  %A7 = fmul double %7, %7
+  %A72 = fmul double %A7, %A7
+  %8 = fadd double %A42, %A72
+  %9 = fadd double %sum.01, %8
+  %10 = add nsw i32 %i.02, 1
+  %exitcond = icmp eq i32 %10, 100
+  br i1 %exitcond, label %11, label %1
+
+; <label>:11                                      ; preds = %1
+  ret double %9
+}
+

Added: llvm/trunk/test/Transforms/SLPVectorizer/X86/reduction_loads.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/reduction_loads.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/reduction_loads.ll (added)
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/reduction_loads.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,81 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -slp-vectorizer -S -mtriple=x86_64-apple-macosx10.10.0 -mattr=+sse4.2 | FileCheck %s
+
+
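+; The loop body below is an unrolled sum of p[0..7], each scaled by 42; the
+; backedge is 'br i1 true', so the body only runs once, but the reduction
+; shape is what matters here. Roughly:
+;
+; int test(const int *p) {
+;   int sum = 0;
+;   for (int i = 0; i < 8; i++)
+;     sum += p[i] * 42;
+;   return sum;
+; }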
+define i32 @test(i32* nocapture readonly %p) {
+; CHECK-LABEL: @test(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1
+; CHECK-NEXT:    [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 2
+; CHECK-NEXT:    [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 3
+; CHECK-NEXT:    [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 4
+; CHECK-NEXT:    [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 5
+; CHECK-NEXT:    [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 6
+; CHECK-NEXT:    [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[SUM:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[OP_EXTRA:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[P]] to <8 x i32>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = mul <8 x i32> [[TMP1]], <i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42>
+; CHECK-NEXT:    [[ADD:%.*]] = add i32 undef, [[SUM]]
+; CHECK-NEXT:    [[ADD_1:%.*]] = add i32 undef, [[ADD]]
+; CHECK-NEXT:    [[ADD_2:%.*]] = add i32 undef, [[ADD_1]]
+; CHECK-NEXT:    [[ADD_3:%.*]] = add i32 undef, [[ADD_2]]
+; CHECK-NEXT:    [[ADD_4:%.*]] = add i32 undef, [[ADD_3]]
+; CHECK-NEXT:    [[ADD_5:%.*]] = add i32 undef, [[ADD_4]]
+; CHECK-NEXT:    [[ADD_6:%.*]] = add i32 undef, [[ADD_5]]
+; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = add <8 x i32> [[TMP2]], [[RDX_SHUF]]
+; CHECK-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX2:%.*]] = add <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
+; CHECK-NEXT:    [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX4:%.*]] = add <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]]
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0
+; CHECK-NEXT:    [[OP_EXTRA]] = add i32 [[TMP3]], [[SUM]]
+; CHECK-NEXT:    [[ADD_7:%.*]] = add i32 undef, [[ADD_6]]
+; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[FOR_BODY]]
+; CHECK:       for.end:
+; CHECK-NEXT:    ret i32 [[OP_EXTRA]]
+;
+entry:
+  %arrayidx.1 = getelementptr inbounds i32, i32* %p, i64 1
+  %arrayidx.2 = getelementptr inbounds i32, i32* %p, i64 2
+  %arrayidx.3 = getelementptr inbounds i32, i32* %p, i64 3
+  %arrayidx.4 = getelementptr inbounds i32, i32* %p, i64 4
+  %arrayidx.5 = getelementptr inbounds i32, i32* %p, i64 5
+  %arrayidx.6 = getelementptr inbounds i32, i32* %p, i64 6
+  %arrayidx.7 = getelementptr inbounds i32, i32* %p, i64 7
+  br label %for.body
+
+for.body:
+  %sum = phi i32 [ 0, %entry ], [ %add.7, %for.body ]
+  %tmp = load i32, i32* %p, align 4
+  %mul = mul i32 %tmp, 42
+  %add = add i32 %mul, %sum
+  %tmp5 = load i32, i32* %arrayidx.1, align 4
+  %mul.1 = mul i32 %tmp5, 42
+  %add.1 = add i32 %mul.1, %add
+  %tmp6 = load i32, i32* %arrayidx.2, align 4
+  %mul.2 = mul i32 %tmp6, 42
+  %add.2 = add i32 %mul.2, %add.1
+  %tmp7 = load i32, i32* %arrayidx.3, align 4
+  %mul.3 = mul i32 %tmp7, 42
+  %add.3 = add i32 %mul.3, %add.2
+  %tmp8 = load i32, i32* %arrayidx.4, align 4
+  %mul.4 = mul i32 %tmp8, 42
+  %add.4 = add i32 %mul.4, %add.3
+  %tmp9 = load i32, i32* %arrayidx.5, align 4
+  %mul.5 = mul i32 %tmp9, 42
+  %add.5 = add i32 %mul.5, %add.4
+  %tmp10 = load i32, i32* %arrayidx.6, align 4
+  %mul.6 = mul i32 %tmp10, 42
+  %add.6 = add i32 %mul.6, %add.5
+  %tmp11 = load i32, i32* %arrayidx.7, align 4
+  %mul.7 = mul i32 %tmp11, 42
+  %add.7 = add i32 %mul.7, %add.6
+  br i1 true, label %for.end, label %for.body
+
+for.end:
+  ret i32 %add.7
+}

Added: llvm/trunk/test/Transforms/SLPVectorizer/X86/reduction_unrolled.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/reduction_unrolled.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/reduction_unrolled.ll (added)
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/reduction_unrolled.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,352 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -slp-vectorizer -slp-vectorize-hor -S -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 -debug < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX
+; RUN: opt -slp-vectorizer -slp-vectorize-hor -S -mtriple=x86_64-unknown-linux-gnu -mcpu=core2 -debug < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,SSE
+; REQUIRES: asserts
+
+; int test_add(unsigned int *p) {
+;   int result = 0;
+;   for (int i = 0; i < 8; i++)
+;     result += p[i];
+;   return result;
+; }
+
+; Vector cost is 5, Scalar cost is 7
+; AVX: Adding cost -2 for reduction that starts with   %7 = load i32, i32* %arrayidx.7, align 4 (It is a splitting reduction)
+; Vector cost is 6, Scalar cost is 7
+; SSE: Adding cost -1 for reduction that starts with   %7 = load i32, i32* %arrayidx.7, align 4 (It is a splitting reduction)
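+; In both cases the remark cost is vector cost minus scalar cost
+; (5 - 7 = -2 with AVX, 6 - 7 = -1 with SSE); a negative value means the
+; vector form is cheaper. The %mul.* value names below are copy-paste
+; leftovers; the operations are adds. The and/or/xor reductions later in this
+; file vectorize with the same shuffle/op shape.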
+define i32 @test_add(i32* nocapture readonly %p) {
+; CHECK-LABEL: @test_add(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1
+; CHECK-NEXT:    [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 2
+; CHECK-NEXT:    [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 3
+; CHECK-NEXT:    [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 4
+; CHECK-NEXT:    [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 5
+; CHECK-NEXT:    [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 6
+; CHECK-NEXT:    [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[P]] to <8 x i32>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4
+; CHECK-NEXT:    [[MUL_18:%.*]] = add i32 undef, undef
+; CHECK-NEXT:    [[MUL_29:%.*]] = add i32 undef, [[MUL_18]]
+; CHECK-NEXT:    [[MUL_310:%.*]] = add i32 undef, [[MUL_29]]
+; CHECK-NEXT:    [[MUL_411:%.*]] = add i32 undef, [[MUL_310]]
+; CHECK-NEXT:    [[MUL_512:%.*]] = add i32 undef, [[MUL_411]]
+; CHECK-NEXT:    [[MUL_613:%.*]] = add i32 undef, [[MUL_512]]
+; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = add <8 x i32> [[TMP1]], [[RDX_SHUF]]
+; CHECK-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX2:%.*]] = add <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
+; CHECK-NEXT:    [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX4:%.*]] = add <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]]
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0
+; CHECK-NEXT:    [[MUL_714:%.*]] = add i32 undef, [[MUL_613]]
+; CHECK-NEXT:    ret i32 [[TMP2]]
+;
+entry:
+  %0 = load i32, i32* %p, align 4
+  %arrayidx.1 = getelementptr inbounds i32, i32* %p, i64 1
+  %1 = load i32, i32* %arrayidx.1, align 4
+  %mul.18 = add i32 %1, %0
+  %arrayidx.2 = getelementptr inbounds i32, i32* %p, i64 2
+  %2 = load i32, i32* %arrayidx.2, align 4
+  %mul.29 = add i32 %2, %mul.18
+  %arrayidx.3 = getelementptr inbounds i32, i32* %p, i64 3
+  %3 = load i32, i32* %arrayidx.3, align 4
+  %mul.310 = add i32 %3, %mul.29
+  %arrayidx.4 = getelementptr inbounds i32, i32* %p, i64 4
+  %4 = load i32, i32* %arrayidx.4, align 4
+  %mul.411 = add i32 %4, %mul.310
+  %arrayidx.5 = getelementptr inbounds i32, i32* %p, i64 5
+  %5 = load i32, i32* %arrayidx.5, align 4
+  %mul.512 = add i32 %5, %mul.411
+  %arrayidx.6 = getelementptr inbounds i32, i32* %p, i64 6
+  %6 = load i32, i32* %arrayidx.6, align 4
+  %mul.613 = add i32 %6, %mul.512
+  %arrayidx.7 = getelementptr inbounds i32, i32* %p, i64 7
+  %7 = load i32, i32* %arrayidx.7, align 4
+  %mul.714 = add i32 %7, %mul.613
+  ret i32 %mul.714
+}
+
+; int test_mul(unsigned int *p) {
+;   int result = 0;
+;   for (int i = 0; i < 8; i++)
+;     result *= p[i];
+;   return result;
+; }
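+; Unlike the other reductions in this file, the CHECK lines show this one is
+; left scalar: presumably the cost of an <8 x i32> multiply on these targets
+; makes the vector form unprofitable.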
+
+define i32 @test_mul(i32* nocapture readonly %p) {
+; CHECK-LABEL: @test_mul(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[P:%.*]], align 4
+; CHECK-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 1
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4
+; CHECK-NEXT:    [[MUL_18:%.*]] = mul i32 [[TMP1]], [[TMP0]]
+; CHECK-NEXT:    [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 2
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4
+; CHECK-NEXT:    [[MUL_29:%.*]] = mul i32 [[TMP2]], [[MUL_18]]
+; CHECK-NEXT:    [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 3
+; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4
+; CHECK-NEXT:    [[MUL_310:%.*]] = mul i32 [[TMP3]], [[MUL_29]]
+; CHECK-NEXT:    [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 4
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX_4]], align 4
+; CHECK-NEXT:    [[MUL_411:%.*]] = mul i32 [[TMP4]], [[MUL_310]]
+; CHECK-NEXT:    [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 5
+; CHECK-NEXT:    [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX_5]], align 4
+; CHECK-NEXT:    [[MUL_512:%.*]] = mul i32 [[TMP5]], [[MUL_411]]
+; CHECK-NEXT:    [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 6
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, i32* [[ARRAYIDX_6]], align 4
+; CHECK-NEXT:    [[MUL_613:%.*]] = mul i32 [[TMP6]], [[MUL_512]]
+; CHECK-NEXT:    [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7
+; CHECK-NEXT:    [[TMP7:%.*]] = load i32, i32* [[ARRAYIDX_7]], align 4
+; CHECK-NEXT:    [[MUL_714:%.*]] = mul i32 [[TMP7]], [[MUL_613]]
+; CHECK-NEXT:    ret i32 [[MUL_714]]
+;
+entry:
+  %0 = load i32, i32* %p, align 4
+  %arrayidx.1 = getelementptr inbounds i32, i32* %p, i64 1
+  %1 = load i32, i32* %arrayidx.1, align 4
+  %mul.18 = mul i32 %1, %0
+  %arrayidx.2 = getelementptr inbounds i32, i32* %p, i64 2
+  %2 = load i32, i32* %arrayidx.2, align 4
+  %mul.29 = mul i32 %2, %mul.18
+  %arrayidx.3 = getelementptr inbounds i32, i32* %p, i64 3
+  %3 = load i32, i32* %arrayidx.3, align 4
+  %mul.310 = mul i32 %3, %mul.29
+  %arrayidx.4 = getelementptr inbounds i32, i32* %p, i64 4
+  %4 = load i32, i32* %arrayidx.4, align 4
+  %mul.411 = mul i32 %4, %mul.310
+  %arrayidx.5 = getelementptr inbounds i32, i32* %p, i64 5
+  %5 = load i32, i32* %arrayidx.5, align 4
+  %mul.512 = mul i32 %5, %mul.411
+  %arrayidx.6 = getelementptr inbounds i32, i32* %p, i64 6
+  %6 = load i32, i32* %arrayidx.6, align 4
+  %mul.613 = mul i32 %6, %mul.512
+  %arrayidx.7 = getelementptr inbounds i32, i32* %p, i64 7
+  %7 = load i32, i32* %arrayidx.7, align 4
+  %mul.714 = mul i32 %7, %mul.613
+  ret i32 %mul.714
+}
+
+; int test_and(unsigned int *p) {
+;   int result = 0;
+;   for (int i = 0; i < 8; i++)
+;     result &= p[i];
+;   return result;
+; }
+
+define i32 @test_and(i32* nocapture readonly %p) {
+; CHECK-LABEL: @test_and(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1
+; CHECK-NEXT:    [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 2
+; CHECK-NEXT:    [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 3
+; CHECK-NEXT:    [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 4
+; CHECK-NEXT:    [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 5
+; CHECK-NEXT:    [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 6
+; CHECK-NEXT:    [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[P]] to <8 x i32>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4
+; CHECK-NEXT:    [[MUL_18:%.*]] = and i32 undef, undef
+; CHECK-NEXT:    [[MUL_29:%.*]] = and i32 undef, [[MUL_18]]
+; CHECK-NEXT:    [[MUL_310:%.*]] = and i32 undef, [[MUL_29]]
+; CHECK-NEXT:    [[MUL_411:%.*]] = and i32 undef, [[MUL_310]]
+; CHECK-NEXT:    [[MUL_512:%.*]] = and i32 undef, [[MUL_411]]
+; CHECK-NEXT:    [[MUL_613:%.*]] = and i32 undef, [[MUL_512]]
+; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = and <8 x i32> [[TMP1]], [[RDX_SHUF]]
+; CHECK-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX2:%.*]] = and <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
+; CHECK-NEXT:    [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX4:%.*]] = and <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]]
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0
+; CHECK-NEXT:    [[MUL_714:%.*]] = and i32 undef, [[MUL_613]]
+; CHECK-NEXT:    ret i32 [[TMP2]]
+;
+entry:
+  %0 = load i32, i32* %p, align 4
+  %arrayidx.1 = getelementptr inbounds i32, i32* %p, i64 1
+  %1 = load i32, i32* %arrayidx.1, align 4
+  %mul.18 = and i32 %1, %0
+  %arrayidx.2 = getelementptr inbounds i32, i32* %p, i64 2
+  %2 = load i32, i32* %arrayidx.2, align 4
+  %mul.29 = and i32 %2, %mul.18
+  %arrayidx.3 = getelementptr inbounds i32, i32* %p, i64 3
+  %3 = load i32, i32* %arrayidx.3, align 4
+  %mul.310 = and i32 %3, %mul.29
+  %arrayidx.4 = getelementptr inbounds i32, i32* %p, i64 4
+  %4 = load i32, i32* %arrayidx.4, align 4
+  %mul.411 = and i32 %4, %mul.310
+  %arrayidx.5 = getelementptr inbounds i32, i32* %p, i64 5
+  %5 = load i32, i32* %arrayidx.5, align 4
+  %mul.512 = and i32 %5, %mul.411
+  %arrayidx.6 = getelementptr inbounds i32, i32* %p, i64 6
+  %6 = load i32, i32* %arrayidx.6, align 4
+  %mul.613 = and i32 %6, %mul.512
+  %arrayidx.7 = getelementptr inbounds i32, i32* %p, i64 7
+  %7 = load i32, i32* %arrayidx.7, align 4
+  %mul.714 = and i32 %7, %mul.613
+  ret i32 %mul.714
+}
+
+; int test_or(unsigned int *p) {
+;   int result = 0;
+;   for (int i = 0; i < 8; i++)
+;     result |= p[i];
+;   return result;
+; }
+
+define i32 @test_or(i32* nocapture readonly %p) {
+; CHECK-LABEL: @test_or(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1
+; CHECK-NEXT:    [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 2
+; CHECK-NEXT:    [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 3
+; CHECK-NEXT:    [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 4
+; CHECK-NEXT:    [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 5
+; CHECK-NEXT:    [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 6
+; CHECK-NEXT:    [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[P]] to <8 x i32>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4
+; CHECK-NEXT:    [[MUL_18:%.*]] = or i32 undef, undef
+; CHECK-NEXT:    [[MUL_29:%.*]] = or i32 undef, [[MUL_18]]
+; CHECK-NEXT:    [[MUL_310:%.*]] = or i32 undef, [[MUL_29]]
+; CHECK-NEXT:    [[MUL_411:%.*]] = or i32 undef, [[MUL_310]]
+; CHECK-NEXT:    [[MUL_512:%.*]] = or i32 undef, [[MUL_411]]
+; CHECK-NEXT:    [[MUL_613:%.*]] = or i32 undef, [[MUL_512]]
+; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = or <8 x i32> [[TMP1]], [[RDX_SHUF]]
+; CHECK-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX2:%.*]] = or <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
+; CHECK-NEXT:    [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX4:%.*]] = or <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]]
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0
+; CHECK-NEXT:    [[MUL_714:%.*]] = or i32 undef, [[MUL_613]]
+; CHECK-NEXT:    ret i32 [[TMP2]]
+;
+entry:
+  %0 = load i32, i32* %p, align 4
+  %arrayidx.1 = getelementptr inbounds i32, i32* %p, i64 1
+  %1 = load i32, i32* %arrayidx.1, align 4
+  %mul.18 = or i32 %1, %0
+  %arrayidx.2 = getelementptr inbounds i32, i32* %p, i64 2
+  %2 = load i32, i32* %arrayidx.2, align 4
+  %mul.29 = or i32 %2, %mul.18
+  %arrayidx.3 = getelementptr inbounds i32, i32* %p, i64 3
+  %3 = load i32, i32* %arrayidx.3, align 4
+  %mul.310 = or i32 %3, %mul.29
+  %arrayidx.4 = getelementptr inbounds i32, i32* %p, i64 4
+  %4 = load i32, i32* %arrayidx.4, align 4
+  %mul.411 = or i32 %4, %mul.310
+  %arrayidx.5 = getelementptr inbounds i32, i32* %p, i64 5
+  %5 = load i32, i32* %arrayidx.5, align 4
+  %mul.512 = or i32 %5, %mul.411
+  %arrayidx.6 = getelementptr inbounds i32, i32* %p, i64 6
+  %6 = load i32, i32* %arrayidx.6, align 4
+  %mul.613 = or i32 %6, %mul.512
+  %arrayidx.7 = getelementptr inbounds i32, i32* %p, i64 7
+  %7 = load i32, i32* %arrayidx.7, align 4
+  %mul.714 = or i32 %7, %mul.613
+  ret i32 %mul.714
+}
+
+; int test_xor(unsigned int *p) {
+;   int result = 0;
+;   for (int i = 0; i < 8; i++)
+;     result ^= p[i];
+;   return result;
+; }
+
+define i32 @test_xor(i32* nocapture readonly %p) {
+; CHECK-LABEL: @test_xor(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1
+; CHECK-NEXT:    [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 2
+; CHECK-NEXT:    [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 3
+; CHECK-NEXT:    [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 4
+; CHECK-NEXT:    [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 5
+; CHECK-NEXT:    [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 6
+; CHECK-NEXT:    [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[P]] to <8 x i32>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4
+; CHECK-NEXT:    [[MUL_18:%.*]] = xor i32 undef, undef
+; CHECK-NEXT:    [[MUL_29:%.*]] = xor i32 undef, [[MUL_18]]
+; CHECK-NEXT:    [[MUL_310:%.*]] = xor i32 undef, [[MUL_29]]
+; CHECK-NEXT:    [[MUL_411:%.*]] = xor i32 undef, [[MUL_310]]
+; CHECK-NEXT:    [[MUL_512:%.*]] = xor i32 undef, [[MUL_411]]
+; CHECK-NEXT:    [[MUL_613:%.*]] = xor i32 undef, [[MUL_512]]
+; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = xor <8 x i32> [[TMP1]], [[RDX_SHUF]]
+; CHECK-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX2:%.*]] = xor <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
+; CHECK-NEXT:    [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX4:%.*]] = xor <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]]
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0
+; CHECK-NEXT:    [[MUL_714:%.*]] = xor i32 undef, [[MUL_613]]
+; CHECK-NEXT:    ret i32 [[TMP2]]
+;
+entry:
+  %0 = load i32, i32* %p, align 4
+  %arrayidx.1 = getelementptr inbounds i32, i32* %p, i64 1
+  %1 = load i32, i32* %arrayidx.1, align 4
+  %mul.18 = xor i32 %1, %0
+  %arrayidx.2 = getelementptr inbounds i32, i32* %p, i64 2
+  %2 = load i32, i32* %arrayidx.2, align 4
+  %mul.29 = xor i32 %2, %mul.18
+  %arrayidx.3 = getelementptr inbounds i32, i32* %p, i64 3
+  %3 = load i32, i32* %arrayidx.3, align 4
+  %mul.310 = xor i32 %3, %mul.29
+  %arrayidx.4 = getelementptr inbounds i32, i32* %p, i64 4
+  %4 = load i32, i32* %arrayidx.4, align 4
+  %mul.411 = xor i32 %4, %mul.310
+  %arrayidx.5 = getelementptr inbounds i32, i32* %p, i64 5
+  %5 = load i32, i32* %arrayidx.5, align 4
+  %mul.512 = xor i32 %5, %mul.411
+  %arrayidx.6 = getelementptr inbounds i32, i32* %p, i64 6
+  %6 = load i32, i32* %arrayidx.6, align 4
+  %mul.613 = xor i32 %6, %mul.512
+  %arrayidx.7 = getelementptr inbounds i32, i32* %p, i64 7
+  %7 = load i32, i32* %arrayidx.7, align 4
+  %mul.714 = xor i32 %7, %mul.613
+  ret i32 %mul.714
+}
+
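+; The shift/xor mixing below resembles an xorshift-style PRNG step. The
+; update is already in vector form; the test checks that the trailing
+; extractelement/xor chain is turned into a horizontal xor reduction over the
+; stored <4 x i32> value.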
+define i32 @PR37731(<4 x i32>* noalias nocapture dereferenceable(16) %self) unnamed_addr #0 {
+; CHECK-LABEL: @PR37731(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i32>, <4 x i32>* [[SELF:%.*]], align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = shl <4 x i32> [[TMP0]], <i32 6, i32 2, i32 13, i32 3>
+; CHECK-NEXT:    [[TMP2:%.*]] = xor <4 x i32> [[TMP1]], [[TMP0]]
+; CHECK-NEXT:    [[TMP3:%.*]] = lshr <4 x i32> [[TMP2]], <i32 13, i32 27, i32 21, i32 12>
+; CHECK-NEXT:    [[TMP4:%.*]] = and <4 x i32> [[TMP0]], <i32 -2, i32 -8, i32 -16, i32 -128>
+; CHECK-NEXT:    [[TMP5:%.*]] = shl <4 x i32> [[TMP4]], <i32 18, i32 2, i32 7, i32 13>
+; CHECK-NEXT:    [[TMP6:%.*]] = xor <4 x i32> [[TMP3]], [[TMP5]]
+; CHECK-NEXT:    store <4 x i32> [[TMP6]], <4 x i32>* [[SELF]], align 16
+; CHECK-NEXT:    [[TMP7:%.*]] = xor i32 undef, undef
+; CHECK-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP7]], undef
+; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = xor <4 x i32> [[TMP6]], [[RDX_SHUF]]
+; CHECK-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[BIN_RDX]], <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX2:%.*]] = xor <4 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <4 x i32> [[BIN_RDX2]], i32 0
+; CHECK-NEXT:    [[TMP10:%.*]] = xor i32 [[TMP8]], undef
+; CHECK-NEXT:    ret i32 [[TMP9]]
+;
+entry:
+  %0 = load <4 x i32>, <4 x i32>* %self, align 16
+  %1 = shl <4 x i32> %0, <i32 6, i32 2, i32 13, i32 3>
+  %2 = xor <4 x i32> %1, %0
+  %3 = lshr <4 x i32> %2, <i32 13, i32 27, i32 21, i32 12>
+  %4 = and <4 x i32> %0, <i32 -2, i32 -8, i32 -16, i32 -128>
+  %5 = shl <4 x i32> %4, <i32 18, i32 2, i32 7, i32 13>
+  %6 = xor <4 x i32> %3, %5
+  store <4 x i32> %6, <4 x i32>* %self, align 16
+  %7 = extractelement <4 x i32> %6, i32 0
+  %8 = extractelement <4 x i32> %6, i32 1
+  %9 = xor i32 %7, %8
+  %10 = extractelement <4 x i32> %6, i32 2
+  %11 = xor i32 %9, %10
+  %12 = extractelement <4 x i32> %6, i32 3
+  %13 = xor i32 %11, %12
+  ret i32 %13
+}

Added: llvm/trunk/test/Transforms/SLPVectorizer/X86/remark_horcost.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/remark_horcost.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/remark_horcost.ll (added)
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/remark_horcost.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,137 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -mtriple=x86_64-pc-linux-gnu -mcpu=generic -slp-vectorizer -pass-remarks-output=%t < %s | FileCheck %s
+; RUN: FileCheck --input-file=%t --check-prefix=YAML %s
+
+define i32 @foo(i32* %diff) #0 {
+; CHECK-LABEL: @foo(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[M2:%.*]] = alloca [8 x [8 x i32]], align 16
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast [8 x [8 x i32]]* [[M2]] to i8*
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[A_088:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[OP_EXTRA:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[INDVARS_IV]], 3
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[DIFF:%.*]], i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = or i64 [[TMP1]], 4
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[DIFF]], i64 [[TMP2]]
+; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* [[M2]], i64 0, i64 [[INDVARS_IV]], i64 0
+; CHECK-NEXT:    [[TMP3:%.*]] = or i64 [[TMP1]], 1
+; CHECK-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, i32* [[DIFF]], i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP4:%.*]] = or i64 [[TMP1]], 5
+; CHECK-NEXT:    [[ARRAYIDX16:%.*]] = getelementptr inbounds i32, i32* [[DIFF]], i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP5:%.*]] = or i64 [[TMP1]], 2
+; CHECK-NEXT:    [[ARRAYIDX27:%.*]] = getelementptr inbounds i32, i32* [[DIFF]], i64 [[TMP5]]
+; CHECK-NEXT:    [[TMP6:%.*]] = or i64 [[TMP1]], 6
+; CHECK-NEXT:    [[ARRAYIDX30:%.*]] = getelementptr inbounds i32, i32* [[DIFF]], i64 [[TMP6]]
+; CHECK-NEXT:    [[TMP7:%.*]] = or i64 [[TMP1]], 3
+; CHECK-NEXT:    [[ARRAYIDX41:%.*]] = getelementptr inbounds i32, i32* [[DIFF]], i64 [[TMP7]]
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i32* [[ARRAYIDX]] to <4 x i32>*
+; CHECK-NEXT:    [[TMP9:%.*]] = load <4 x i32>, <4 x i32>* [[TMP8]], align 4
+; CHECK-NEXT:    [[TMP10:%.*]] = or i64 [[TMP1]], 7
+; CHECK-NEXT:    [[ARRAYIDX44:%.*]] = getelementptr inbounds i32, i32* [[DIFF]], i64 [[TMP10]]
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i32* [[ARRAYIDX2]] to <4 x i32>*
+; CHECK-NEXT:    [[TMP12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP11]], align 4
+; CHECK-NEXT:    [[TMP13:%.*]] = add nsw <4 x i32> [[TMP12]], [[TMP9]]
+; CHECK-NEXT:    [[ADD10:%.*]] = add nsw i32 undef, [[A_088]]
+; CHECK-NEXT:    [[ARRAYIDX20:%.*]] = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* [[M2]], i64 0, i64 [[INDVARS_IV]], i64 1
+; CHECK-NEXT:    [[ADD24:%.*]] = add nsw i32 [[ADD10]], undef
+; CHECK-NEXT:    [[ARRAYIDX34:%.*]] = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* [[M2]], i64 0, i64 [[INDVARS_IV]], i64 2
+; CHECK-NEXT:    [[ADD38:%.*]] = add nsw i32 [[ADD24]], undef
+; CHECK-NEXT:    [[ARRAYIDX48:%.*]] = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* [[M2]], i64 0, i64 [[INDVARS_IV]], i64 3
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast i32* [[ARRAYIDX6]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[TMP13]], <4 x i32>* [[TMP14]], align 16
+; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP13]], <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = add nsw <4 x i32> [[TMP13]], [[RDX_SHUF]]
+; CHECK-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[BIN_RDX]], <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX2:%.*]] = add nsw <4 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
+; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <4 x i32> [[BIN_RDX2]], i32 0
+; CHECK-NEXT:    [[OP_EXTRA]] = add nsw i32 [[TMP15]], [[A_088]]
+; CHECK-NEXT:    [[ADD52:%.*]] = add nsw i32 [[ADD38]], undef
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 8
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]]
+; CHECK:       for.end:
+; CHECK-NEXT:    ret i32 [[OP_EXTRA]]
+;
+entry:
+  %m2 = alloca [8 x [8 x i32]], align 16
+  %0 = bitcast [8 x [8 x i32]]* %m2 to i8*
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %a.088 = phi i32 [ 0, %entry ], [ %add52, %for.body ]
+  %1 = shl i64 %indvars.iv, 3
+  %arrayidx = getelementptr inbounds i32, i32* %diff, i64 %1
+  %2 = load i32, i32* %arrayidx, align 4
+  %3 = or i64 %1, 4
+  %arrayidx2 = getelementptr inbounds i32, i32* %diff, i64 %3
+  %4 = load i32, i32* %arrayidx2, align 4
+  %add3 = add nsw i32 %4, %2
+  %arrayidx6 = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* %m2, i64 0, i64 %indvars.iv, i64 0
+  store i32 %add3, i32* %arrayidx6, align 16
+
+  %add10 = add nsw i32 %add3, %a.088
+  %5 = or i64 %1, 1
+  %arrayidx13 = getelementptr inbounds i32, i32* %diff, i64 %5
+  %6 = load i32, i32* %arrayidx13, align 4
+  %7 = or i64 %1, 5
+  %arrayidx16 = getelementptr inbounds i32, i32* %diff, i64 %7
+  %8 = load i32, i32* %arrayidx16, align 4
+  %add17 = add nsw i32 %8, %6
+  %arrayidx20 = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* %m2, i64 0, i64 %indvars.iv, i64 1
+  store i32 %add17, i32* %arrayidx20, align 4
+
+  %add24 = add nsw i32 %add10, %add17
+  %9 = or i64 %1, 2
+  %arrayidx27 = getelementptr inbounds i32, i32* %diff, i64 %9
+  %10 = load i32, i32* %arrayidx27, align 4
+  %11 = or i64 %1, 6
+  %arrayidx30 = getelementptr inbounds i32, i32* %diff, i64 %11
+  %12 = load i32, i32* %arrayidx30, align 4
+  %add31 = add nsw i32 %12, %10
+  %arrayidx34 = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* %m2, i64 0, i64 %indvars.iv, i64 2
+  store i32 %add31, i32* %arrayidx34, align 8
+
+  %add38 = add nsw i32 %add24, %add31
+  %13 = or i64 %1, 3
+  %arrayidx41 = getelementptr inbounds i32, i32* %diff, i64 %13
+  %14 = load i32, i32* %arrayidx41, align 4
+  %15 = or i64 %1, 7
+  %arrayidx44 = getelementptr inbounds i32, i32* %diff, i64 %15
+  %16 = load i32, i32* %arrayidx44, align 4
+
+  %add45 = add nsw i32 %16, %14
+  %arrayidx48 = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* %m2, i64 0, i64 %indvars.iv, i64 3
+  store i32 %add45, i32* %arrayidx48, align 4
+
+  %add52 = add nsw i32 %add38, %add45
+
+  ; YAML:      --- !Passed
+  ; YAML-NEXT: Pass:            slp-vectorizer
+  ; YAML-NEXT: Name:            StoresVectorized
+  ; YAML-NEXT: Function:        foo
+  ; YAML-NEXT: Args:
+  ; YAML-NEXT:   - String:          'Stores SLP vectorized with cost '
+  ; YAML-NEXT:   - Cost:            '-8'
+  ; YAML-NEXT:   - String:          ' and with tree size '
+  ; YAML-NEXT:   - TreeSize:        '4'
+
+  ; YAML:      --- !Passed
+  ; YAML-NEXT: Pass:            slp-vectorizer
+  ; YAML-NEXT: Name:            VectorizedHorizontalReduction
+  ; YAML-NEXT: Function:        foo
+  ; YAML-NEXT: Args:
+  ; YAML-NEXT:   - String:          'Vectorized horizontal reduction with cost '
+  ; YAML-NEXT:   - Cost:            '-2'
+  ; YAML-NEXT:   - String:          ' and with tree size '
+  ; YAML-NEXT:   - TreeSize:        '1'
+
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 8
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret i32 %add52
+}
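
For reference, the scalar source of @foo above corresponds roughly to the
following C (a sketch reconstructed from the IR; the original source is not
part of this commit and names are illustrative):

  int foo(const int *diff) {
    int m2[8][8];
    int a = 0;
    for (int i = 0; i < 8; i++) {
      /* Four partial sums per row: diff[8*i+j] + diff[8*i+j+4]. */
      for (int j = 0; j < 4; j++) {
        m2[i][j] = diff[8 * i + j] + diff[8 * i + j + 4];
        a += m2[i][j];
      }
    }
    return a;
  }

The four row stores become the <4 x i32> store checked above, and the running
sum becomes the shuffle-based horizontal reduction reported in the YAML
remarks.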

Added: llvm/trunk/test/Transforms/SLPVectorizer/X86/remark_listcost.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/remark_listcost.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/remark_listcost.ll (added)
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/remark_listcost.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,61 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -mtriple=x86_64-pc-linux-gnu -mcpu=generic -slp-vectorizer -pass-remarks-output=%t < %s | FileCheck %s
+; RUN: FileCheck --input-file=%t --check-prefix=YAML %s
+
+define void @vsub2_test(i32* %pin1, i32* %pin2, i32* %pout) #0 {
+; CHECK-LABEL: @vsub2_test(
+; CHECK-NEXT:    br label [[TMP1:%.*]]
+; CHECK:         [[IDX_04:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[TMP1]] ]
+; CHECK-NEXT:    [[PO_03:%.*]] = phi i32* [ [[POUT:%.*]], [[TMP0]] ], [ [[TMP7:%.*]], [[TMP1]] ]
+; CHECK-NEXT:    [[PTMPI2_02:%.*]] = phi i32* [ [[PIN2:%.*]], [[TMP0]] ], [ [[TMP4:%.*]], [[TMP1]] ]
+; CHECK-NEXT:    [[PTMPI1_01:%.*]] = phi i32* [ [[PIN1:%.*]], [[TMP0]] ], [ [[TMP2:%.*]], [[TMP1]] ]
+; CHECK-NEXT:    [[TMP2]] = getelementptr inbounds i32, i32* [[PTMPI1_01]], i64 1
+; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* [[PTMPI1_01]], align 4, !tbaa !1
+; CHECK-NEXT:    [[TMP4]] = getelementptr inbounds i32, i32* [[PTMPI2_02]], i64 1
+; CHECK-NEXT:    [[TMP5:%.*]] = load i32, i32* [[PTMPI2_02]], align 4, !tbaa !1
+; CHECK-NEXT:    [[TMP6:%.*]] = sub nsw i32 [[TMP3]], [[TMP5]]
+; CHECK-NEXT:    [[TMP7]] = getelementptr inbounds i32, i32* [[PO_03]], i64 1
+; CHECK-NEXT:    store i32 [[TMP6]], i32* [[PO_03]], align 4, !tbaa !1
+; CHECK-NEXT:    [[TMP8]] = add nuw nsw i32 [[IDX_04]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[TMP8]], 64
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[TMP9:%.*]], label [[TMP1]], !llvm.loop !5
+; CHECK:         ret void
+;
+  br label %1
+
+  %idx.04 = phi i32 [ 0, %0 ], [ %8, %1 ]
+  %po.03 = phi i32* [ %pout, %0 ], [ %7, %1 ]
+  %ptmpi2.02 = phi i32* [ %pin2, %0 ], [ %4, %1 ]
+  %ptmpi1.01 = phi i32* [ %pin1, %0 ], [ %2, %1 ]
+  %2 = getelementptr inbounds i32, i32* %ptmpi1.01, i64 1
+  %3 = load i32, i32* %ptmpi1.01, align 4, !tbaa !1
+  %4 = getelementptr inbounds i32, i32* %ptmpi2.02, i64 1
+  %5 = load i32, i32* %ptmpi2.02, align 4, !tbaa !1
+  %6 = sub nsw i32 %3, %5
+  %7 = getelementptr inbounds i32, i32* %po.03, i64 1
+  ; YAML:      Pass:            slp-vectorizer
+  ; YAML-NEXT: Name:            NotBeneficial
+  ; YAML-NEXT: Function:        vsub2_test
+  ; YAML-NEXT: Args:
+  ; YAML-NEXT:   - String:          'List vectorization was possible but not beneficial with cost '
+  ; YAML-NEXT:   - Cost:            '0'
+  ; YAML-NEXT:   - String:          ' >= '
+  ; YAML-NEXT:   - Treshold:        '0'
+  store i32 %6, i32* %po.03, align 4, !tbaa !1
+  %8 = add nuw nsw i32 %idx.04, 1
+  %exitcond = icmp eq i32 %8, 64
+  br i1 %exitcond, label %9, label %1, !llvm.loop !5
+
+  ret void
+}
+
+!llvm.ident = !{!0}
+
+!0 = !{!"clang version 3.8.0-2ubuntu4 (tags/RELEASE_380/final)"}
+!1 = !{!2, !2, i64 0}
+!2 = !{!"int", !3, i64 0}
+!3 = !{!"omnipotent char", !4, i64 0}
+!4 = !{!"Simple C/C++ TBAA"}
+!5 = distinct !{!5, !6, !7}
+!6 = !{!"llvm.loop.vectorize.width", i32 1}
+!7 = !{!"llvm.loop.interleave.count", i32 1}

Added: llvm/trunk/test/Transforms/SLPVectorizer/X86/remark_not_all_parts.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/remark_not_all_parts.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/remark_not_all_parts.ll (added)
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/remark_not_all_parts.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,85 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -mtriple=x86_64-pc-linux-gnu -mcpu=generic -slp-vectorizer -pass-remarks-output=%t < %s | FileCheck %s
+; RUN: FileCheck --input-file=%t --check-prefix=YAML %s
+
+define i32 @foo(i32* nocapture readonly %diff) #0 {
+; CHECK-LABEL: @foo(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[M2:%.*]] = alloca [8 x [8 x i32]], align 16
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast [8 x [8 x i32]]* [[M2]] to i8*
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[A_088:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[ADD24:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[INDVARS_IV]], 3
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[DIFF:%.*]], i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = or i64 [[TMP1]], 4
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[DIFF]], i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4
+; CHECK-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP4]], [[TMP2]]
+; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* [[M2]], i64 0, i64 [[INDVARS_IV]], i64 0
+; CHECK-NEXT:    store i32 [[ADD3]], i32* [[ARRAYIDX6]], align 16
+; CHECK-NEXT:    [[ADD10:%.*]] = add nsw i32 [[ADD3]], [[A_088]]
+; CHECK-NEXT:    [[TMP5:%.*]] = or i64 [[TMP1]], 1
+; CHECK-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, i32* [[DIFF]], i64 [[TMP5]]
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, i32* [[ARRAYIDX13]], align 4
+; CHECK-NEXT:    [[TMP7:%.*]] = or i64 [[TMP1]], 5
+; CHECK-NEXT:    [[ARRAYIDX16:%.*]] = getelementptr inbounds i32, i32* [[DIFF]], i64 [[TMP7]]
+; CHECK-NEXT:    [[TMP8:%.*]] = load i32, i32* [[ARRAYIDX16]], align 4
+; CHECK-NEXT:    [[ADD17:%.*]] = add nsw i32 [[TMP8]], [[TMP6]]
+; CHECK-NEXT:    [[ARRAYIDX20:%.*]] = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* [[M2]], i64 0, i64 [[INDVARS_IV]], i64 1
+; CHECK-NEXT:    store i32 [[ADD17]], i32* [[ARRAYIDX20]], align 4
+; CHECK-NEXT:    [[ADD24]] = add nsw i32 [[ADD10]], [[ADD17]]
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 8
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]]
+; CHECK:       for.end:
+; CHECK-NEXT:    [[ARRAYDECAY:%.*]] = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* [[M2]], i64 0, i64 0
+; CHECK-NEXT:    ret i32 [[ADD24]]
+;
+entry:
+  %m2 = alloca [8 x [8 x i32]], align 16
+  %0 = bitcast [8 x [8 x i32]]* %m2 to i8*
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %a.088 = phi i32 [ 0, %entry ], [ %add24, %for.body ]
+  %1 = shl i64 %indvars.iv, 3
+  %arrayidx = getelementptr inbounds i32, i32* %diff, i64 %1
+  %2 = load i32, i32* %arrayidx, align 4
+  %3 = or i64 %1, 4
+  %arrayidx2 = getelementptr inbounds i32, i32* %diff, i64 %3
+  %4 = load i32, i32* %arrayidx2, align 4
+  %add3 = add nsw i32 %4, %2
+  %arrayidx6 = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* %m2, i64 0, i64 %indvars.iv, i64 0
+  store i32 %add3, i32* %arrayidx6, align 16
+  %add10 = add nsw i32 %add3, %a.088
+  %5 = or i64 %1, 1
+  %arrayidx13 = getelementptr inbounds i32, i32* %diff, i64 %5
+  %6 = load i32, i32* %arrayidx13, align 4
+  %7 = or i64 %1, 5
+  %arrayidx16 = getelementptr inbounds i32, i32* %diff, i64 %7
+  %8 = load i32, i32* %arrayidx16, align 4
+  %add17 = add nsw i32 %8, %6
+  %arrayidx20 = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* %m2, i64 0, i64 %indvars.iv, i64 1
+  store i32 %add17, i32* %arrayidx20, align 4
+  %add24 = add nsw i32 %add10, %add17
+
+  ; YAML:      Pass:            slp-vectorizer
+  ; YAML-NEXT: Name:            NotPossible
+  ; YAML-NEXT: Function:        foo
+  ; YAML-NEXT: Args:
+  ; YAML-NEXT:   - String:          'Cannot SLP vectorize list: vectorization was impossible'
+  ; YAML-NEXT:   - String:          ' with available vectorization factors'
+
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 8
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  %arraydecay = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* %m2, i64 0, i64 0
+  ret i32 %add24
+}
+
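
For reference, this variant keeps only the first two of the four column sums
from the larger horizontal-reduction test, roughly (a reconstruction from the
IR; illustrative only):

  int foo(const int *diff) {
    int m2[8][8];
    int a = 0;
    for (int i = 0; i < 8; i++) {
      m2[i][0] = diff[8 * i] + diff[8 * i + 4];
      m2[i][1] = diff[8 * i + 1] + diff[8 * i + 5];
      a += m2[i][0] + m2[i][1];
    }
    return a;
  }

With only part of the expression tree present, no available vectorization
factor fits, which is what the NotPossible remark above reports.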

Added: llvm/trunk/test/Transforms/SLPVectorizer/X86/remark_unsupported.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/remark_unsupported.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/remark_unsupported.ll (added)
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/remark_unsupported.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,53 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -mtriple=x86_64-pc-linux-gnu -mcpu=generic -slp-vectorizer -pass-remarks-output=%t < %s | FileCheck %s
+; RUN: FileCheck --input-file=%t --check-prefix=YAML %s
+
+; The x86_fp80 type is not supported by the SLP vectorizer.
+define void @test(x86_fp80* %i1, x86_fp80* %i2, x86_fp80* %o) {
+; CHECK-LABEL: @test(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[I1_0:%.*]] = load x86_fp80, x86_fp80* [[I1:%.*]], align 16
+; CHECK-NEXT:    [[I1_GEP1:%.*]] = getelementptr x86_fp80, x86_fp80* [[I1]], i64 1
+; CHECK-NEXT:    [[I1_1:%.*]] = load x86_fp80, x86_fp80* [[I1_GEP1]], align 16
+; CHECK-NEXT:    br i1 undef, label [[THEN:%.*]], label [[END:%.*]]
+; CHECK:       then:
+; CHECK-NEXT:    [[I2_GEP0:%.*]] = getelementptr inbounds x86_fp80, x86_fp80* [[I2:%.*]], i64 0
+; CHECK-NEXT:    [[I2_0:%.*]] = load x86_fp80, x86_fp80* [[I2_GEP0]], align 16
+; CHECK-NEXT:    [[I2_GEP1:%.*]] = getelementptr inbounds x86_fp80, x86_fp80* [[I2]], i64 1
+; CHECK-NEXT:    [[I2_1:%.*]] = load x86_fp80, x86_fp80* [[I2_GEP1]], align 16
+; CHECK-NEXT:    br label [[END]]
+; CHECK:       end:
+; CHECK-NEXT:    [[PHI0:%.*]] = phi x86_fp80 [ [[I1_0]], [[ENTRY:%.*]] ], [ [[I2_0]], [[THEN]] ]
+; CHECK-NEXT:    [[PHI1:%.*]] = phi x86_fp80 [ [[I1_1]], [[ENTRY]] ], [ [[I2_1]], [[THEN]] ]
+; CHECK-NEXT:    store x86_fp80 [[PHI0]], x86_fp80* [[O:%.*]], align 16
+; CHECK-NEXT:    [[O_GEP1:%.*]] = getelementptr inbounds x86_fp80, x86_fp80* [[O]], i64 1
+; CHECK-NEXT:    store x86_fp80 [[PHI1]], x86_fp80* [[O_GEP1]], align 16
+; CHECK-NEXT:    ret void
+;
+entry:
+  %i1.0 = load x86_fp80, x86_fp80* %i1, align 16
+  %i1.gep1 = getelementptr x86_fp80, x86_fp80* %i1, i64 1
+  %i1.1 = load x86_fp80, x86_fp80* %i1.gep1, align 16
+  br i1 undef, label %then, label %end
+then:
+  %i2.gep0 = getelementptr inbounds x86_fp80, x86_fp80* %i2, i64 0
+  %i2.0 = load x86_fp80, x86_fp80* %i2.gep0, align 16
+  %i2.gep1 = getelementptr inbounds x86_fp80, x86_fp80* %i2, i64 1
+  %i2.1 = load x86_fp80, x86_fp80* %i2.gep1, align 16
+  br label %end
+end:
+  %phi0 = phi x86_fp80 [ %i1.0, %entry ], [ %i2.0, %then ]
+
+  %phi1 = phi x86_fp80 [ %i1.1, %entry ], [ %i2.1, %then ]
+  store x86_fp80 %phi0, x86_fp80* %o, align 16
+  %o.gep1 = getelementptr inbounds x86_fp80, x86_fp80* %o, i64 1
+  store x86_fp80 %phi1, x86_fp80* %o.gep1, align 16
+  ; YAML:      Pass:            slp-vectorizer
+  ; YAML-NEXT: Name:            UnsupportedType
+  ; YAML-NEXT: Function:        test
+  ; YAML-NEXT: Args:
+  ; YAML-NEXT:   - String:          'Cannot SLP vectorize list: type '
+  ; YAML-NEXT:   - String:          x86_fp80 is unsupported by vectorizer
+
+  ret void
+}
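
For reference, x86_fp80 is the x86 long double type, so the test is roughly
this C (a sketch reconstructed from the IR; the 'cond' parameter stands in
for the IR's branch on undef):

  void test(long double *i1, long double *i2, long double *o, int cond) {
    long double a = i1[0], b = i1[1];
    if (cond) {     /* the IR branches on undef here */
      a = i2[0];
      b = i2[1];
    }
    o[0] = a;       /* the paired stores would otherwise seed SLP */
    o[1] = b;
  }

The loads, phis, and stores form vectorizable pairs, but the element type is
rejected, producing the UnsupportedType remark.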

Added: llvm/trunk/test/Transforms/SLPVectorizer/X86/reorder_phi.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/reorder_phi.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/reorder_phi.ll (added)
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/reorder_phi.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,76 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -basicaa -slp-vectorizer  -S -mtriple=x86_64-unknown -mcpu=corei7-avx | FileCheck %s
+
+%struct.complex = type { float, float }
+
+define void @foo(%struct.complex* %A, %struct.complex* %B, %struct.complex* %Result) {
+; CHECK-LABEL: @foo(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = add i64 256, 0
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[TMP1:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[TMP20:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP19:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[TMP3:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP18:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_COMPLEX:%.*]], %struct.complex* [[A:%.*]], i64 [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = load float, float* [[TMP4]], align 4
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_COMPLEX]], %struct.complex* [[A]], i64 [[TMP1]], i32 1
+; CHECK-NEXT:    [[TMP7:%.*]] = load float, float* [[TMP6]], align 4
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT_COMPLEX]], %struct.complex* [[B:%.*]], i64 [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP9:%.*]] = load float, float* [[TMP8]], align 4
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [[STRUCT_COMPLEX]], %struct.complex* [[B]], i64 [[TMP1]], i32 1
+; CHECK-NEXT:    [[TMP11:%.*]] = load float, float* [[TMP10]], align 4
+; CHECK-NEXT:    [[TMP12:%.*]] = fmul float [[TMP5]], [[TMP9]]
+; CHECK-NEXT:    [[TMP13:%.*]] = fmul float [[TMP7]], [[TMP11]]
+; CHECK-NEXT:    [[TMP14:%.*]] = fsub float [[TMP12]], [[TMP13]]
+; CHECK-NEXT:    [[TMP15:%.*]] = fmul float [[TMP7]], [[TMP9]]
+; CHECK-NEXT:    [[TMP16:%.*]] = fmul float [[TMP5]], [[TMP11]]
+; CHECK-NEXT:    [[TMP17:%.*]] = fadd float [[TMP15]], [[TMP16]]
+; CHECK-NEXT:    [[TMP18]] = fadd float [[TMP3]], [[TMP14]]
+; CHECK-NEXT:    [[TMP19]] = fadd float [[TMP2]], [[TMP17]]
+; CHECK-NEXT:    [[TMP20]] = add nuw nsw i64 [[TMP1]], 1
+; CHECK-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP20]], [[TMP0]]
+; CHECK-NEXT:    br i1 [[TMP21]], label [[EXIT:%.*]], label [[LOOP]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT_COMPLEX]], %struct.complex* [[RESULT:%.*]], i32 0, i32 0
+; CHECK-NEXT:    store float [[TMP18]], float* [[TMP22]], align 4
+; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT_COMPLEX]], %struct.complex* [[RESULT]], i32 0, i32 1
+; CHECK-NEXT:    store float [[TMP19]], float* [[TMP23]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %0 = add i64 256, 0
+  br label %loop
+
+loop:
+  %1 = phi i64 [ 0, %entry ], [ %20, %loop ]
+  %2 = phi float [ 0.000000e+00, %entry ], [ %19, %loop ]
+  %3 = phi float [ 0.000000e+00, %entry ], [ %18, %loop ]
+  %4 = getelementptr inbounds %"struct.complex", %"struct.complex"* %A, i64 %1, i32 0
+  %5 = load float, float* %4, align 4
+  %6 = getelementptr inbounds %"struct.complex", %"struct.complex"* %A, i64 %1, i32 1
+  %7 = load float, float* %6, align 4
+  %8 = getelementptr inbounds %"struct.complex", %"struct.complex"* %B, i64 %1, i32 0
+  %9 = load float, float* %8, align 4
+  %10 = getelementptr inbounds %"struct.complex", %"struct.complex"* %B, i64 %1, i32 1
+  %11 = load float, float* %10, align 4
+  %12 = fmul float %5, %9
+  %13 = fmul float %7, %11
+  %14 = fsub float %12, %13
+  %15 = fmul float %7, %9
+  %16 = fmul float %5, %11
+  %17 = fadd float %15, %16
+  %18 = fadd float %3, %14
+  %19 = fadd float %2, %17
+  %20 = add nuw nsw i64 %1, 1
+  %21 = icmp eq i64 %20, %0
+  br i1 %21, label %exit, label %loop
+
+exit:
+  %22 = getelementptr inbounds %"struct.complex", %"struct.complex"* %Result,  i32 0, i32 0
+  store float %18, float* %22, align 4
+  %23 = getelementptr inbounds %"struct.complex", %"struct.complex"* %Result,  i32 0, i32 1
+  store float %19, float* %23, align 4
+  ret void
+}
+
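
For reference, @foo above is roughly a complex multiply-accumulate loop (a
sketch reconstructed from the IR; names are illustrative):

  struct complex { float re, im; };

  void foo(struct complex *A, struct complex *B, struct complex *Result) {
    float re = 0.0f, im = 0.0f;
    for (long i = 0; i < 256; i++) {
      /* (a+bi)(c+di): real = ac - bd, imag = bc + ad. */
      re += A[i].re * B[i].re - A[i].im * B[i].im;
      im += A[i].im * B[i].re + A[i].re * B[i].im;
    }
    Result->re = re;
    Result->im = im;
  }

The two accumulators are the loop-carried phis whose operand order the test
exercises.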

Added: llvm/trunk/test/Transforms/SLPVectorizer/X86/reorder_repeated_ops.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/reorder_repeated_ops.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/reorder_repeated_ops.ll (added)
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/reorder_repeated_ops.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,130 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @hoge() {
+; CHECK-LABEL: @hoge(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    br i1 undef, label [[BB1:%.*]], label [[BB2:%.*]]
+; CHECK:       bb1:
+; CHECK-NEXT:    ret void
+; CHECK:       bb2:
+; CHECK-NEXT:    [[TMP:%.*]] = select i1 undef, i16 undef, i16 15
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x i16> undef, i16 [[TMP]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i16> [[TMP0]], i16 undef, i32 1
+; CHECK-NEXT:    [[TMP2:%.*]] = sext <2 x i16> [[TMP1]] to <2 x i32>
+; CHECK-NEXT:    [[REORDER_SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT:    [[TMP3:%.*]] = sub nsw <2 x i32> <i32 63, i32 undef>, [[REORDER_SHUFFLE]]
+; CHECK-NEXT:    [[TMP4:%.*]] = sub <2 x i32> [[TMP3]], undef
+; CHECK-NEXT:    [[SHUFFLE8:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 1, i32 1>
+; CHECK-NEXT:    [[TMP5:%.*]] = add <4 x i32> [[SHUFFLE8]], <i32 undef, i32 15, i32 31, i32 47>
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp sgt i32 undef, undef
+; CHECK-NEXT:    [[TMP12:%.*]] = select i1 [[TMP11]], i32 undef, i32 undef
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp sgt i32 [[TMP12]], undef
+; CHECK-NEXT:    [[TMP15:%.*]] = select i1 [[TMP14]], i32 [[TMP12]], i32 undef
+; CHECK-NEXT:    [[TMP17:%.*]] = icmp sgt i32 [[TMP15]], undef
+; CHECK-NEXT:    [[RDX_SHUF9:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; CHECK-NEXT:    [[RDX_MINMAX_CMP10:%.*]] = icmp sgt <4 x i32> [[TMP5]], [[RDX_SHUF9]]
+; CHECK-NEXT:    [[RDX_MINMAX_SELECT11:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP10]], <4 x i32> [[TMP5]], <4 x i32> [[RDX_SHUF9]]
+; CHECK-NEXT:    [[RDX_SHUF12:%.*]] = shufflevector <4 x i32> [[RDX_MINMAX_SELECT11]], <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[RDX_MINMAX_CMP13:%.*]] = icmp sgt <4 x i32> [[RDX_MINMAX_SELECT11]], [[RDX_SHUF12]]
+; CHECK-NEXT:    [[RDX_MINMAX_SELECT14:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP13]], <4 x i32> [[RDX_MINMAX_SELECT11]], <4 x i32> [[RDX_SHUF12]]
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i32> [[RDX_MINMAX_SELECT14]], i32 0
+; CHECK-NEXT:    [[TMP18:%.*]] = select i1 [[TMP17]], i32 [[TMP15]], i32 undef
+; CHECK-NEXT:    [[TMP19:%.*]] = select i1 undef, i32 [[TMP6]], i32 undef
+; CHECK-NEXT:    [[TMP20:%.*]] = icmp sgt i32 [[TMP19]], 63
+; CHECK-NEXT:    [[TMP7:%.*]] = sub nsw <2 x i32> undef, [[TMP2]]
+; CHECK-NEXT:    [[TMP8:%.*]] = sub <2 x i32> [[TMP7]], undef
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+; CHECK-NEXT:    [[TMP9:%.*]] = add nsw <4 x i32> [[SHUFFLE]], <i32 -49, i32 -33, i32 -33, i32 -17>
+; CHECK-NEXT:    [[TMP26:%.*]] = icmp sgt i32 undef, undef
+; CHECK-NEXT:    [[TMP27:%.*]] = select i1 [[TMP26]], i32 undef, i32 undef
+; CHECK-NEXT:    [[TMP28:%.*]] = icmp sgt i32 [[TMP27]], undef
+; CHECK-NEXT:    [[TMP29:%.*]] = select i1 [[TMP28]], i32 undef, i32 [[TMP27]]
+; CHECK-NEXT:    [[TMP31:%.*]] = icmp sgt i32 undef, undef
+; CHECK-NEXT:    [[TMP32:%.*]] = select i1 [[TMP31]], i32 undef, i32 undef
+; CHECK-NEXT:    [[TMP33:%.*]] = icmp sgt i32 [[TMP32]], [[TMP29]]
+; CHECK-NEXT:    [[TMP34:%.*]] = select i1 [[TMP33]], i32 [[TMP29]], i32 [[TMP32]]
+; CHECK-NEXT:    [[TMP36:%.*]] = icmp sgt i32 undef, undef
+; CHECK-NEXT:    [[TMP37:%.*]] = select i1 [[TMP36]], i32 undef, i32 undef
+; CHECK-NEXT:    [[TMP38:%.*]] = icmp sgt i32 [[TMP37]], [[TMP34]]
+; CHECK-NEXT:    [[TMP39:%.*]] = select i1 [[TMP38]], i32 [[TMP34]], i32 [[TMP37]]
+; CHECK-NEXT:    [[TMP41:%.*]] = icmp sgt i32 undef, undef
+; CHECK-NEXT:    [[TMP42:%.*]] = select i1 [[TMP41]], i32 undef, i32 undef
+; CHECK-NEXT:    [[TMP43:%.*]] = icmp sgt i32 [[TMP42]], [[TMP39]]
+; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP9]], <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; CHECK-NEXT:    [[RDX_MINMAX_CMP:%.*]] = icmp slt <4 x i32> [[TMP9]], [[RDX_SHUF]]
+; CHECK-NEXT:    [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x i32> [[TMP9]], <4 x i32> [[RDX_SHUF]]
+; CHECK-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[RDX_MINMAX_SELECT]], <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[RDX_MINMAX_CMP2:%.*]] = icmp slt <4 x i32> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
+; CHECK-NEXT:    [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x i32> [[RDX_MINMAX_SELECT]], <4 x i32> [[RDX_SHUF1]]
+; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <4 x i32> [[RDX_MINMAX_SELECT3]], i32 0
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp slt i32 [[TMP10]], undef
+; CHECK-NEXT:    [[OP_EXTRA:%.*]] = select i1 [[TMP11]], i32 [[TMP10]], i32 undef
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp slt i32 [[OP_EXTRA]], undef
+; CHECK-NEXT:    [[OP_EXTRA4:%.*]] = select i1 [[TMP12]], i32 [[OP_EXTRA]], i32 undef
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp slt i32 [[OP_EXTRA4]], undef
+; CHECK-NEXT:    [[OP_EXTRA5:%.*]] = select i1 [[TMP13]], i32 [[OP_EXTRA4]], i32 undef
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp slt i32 [[OP_EXTRA5]], undef
+; CHECK-NEXT:    [[OP_EXTRA6:%.*]] = select i1 [[TMP14]], i32 [[OP_EXTRA5]], i32 undef
+; CHECK-NEXT:    [[TMP15:%.*]] = icmp slt i32 [[OP_EXTRA6]], undef
+; CHECK-NEXT:    [[OP_EXTRA7:%.*]] = select i1 [[TMP15]], i32 [[OP_EXTRA6]], i32 undef
+; CHECK-NEXT:    [[TMP44:%.*]] = select i1 [[TMP43]], i32 [[TMP39]], i32 [[TMP42]]
+; CHECK-NEXT:    [[TMP45:%.*]] = icmp sgt i32 undef, [[OP_EXTRA7]]
+; CHECK-NEXT:    unreachable
+;
+bb:
+  br i1 undef, label %bb1, label %bb2
+
+bb1:                                              ; preds = %bb
+  ret void
+
+bb2:                                              ; preds = %bb
+  %tmp = select i1 undef, i16 undef, i16 15
+  %tmp3 = sext i16 undef to i32
+  %tmp4 = sext i16 %tmp to i32
+  %tmp5 = sub nsw i32 undef, %tmp4
+  %tmp6 = sub i32 %tmp5, undef
+  %tmp7 = sub nsw i32 63, %tmp3
+  %tmp8 = sub i32 %tmp7, undef
+  %tmp9 = add i32 %tmp8, undef
+  %tmp10 = add nsw i32 %tmp6, 15
+  %tmp11 = icmp sgt i32 %tmp9, %tmp10
+  %tmp12 = select i1 %tmp11, i32 %tmp9, i32 %tmp10
+  %tmp13 = add nsw i32 %tmp6, 31
+  %tmp14 = icmp sgt i32 %tmp12, %tmp13
+  %tmp15 = select i1 %tmp14, i32 %tmp12, i32 %tmp13
+  %tmp16 = add nsw i32 %tmp6, 47
+  %tmp17 = icmp sgt i32 %tmp15, %tmp16
+  %tmp18 = select i1 %tmp17, i32 %tmp15, i32 %tmp16
+  %tmp19 = select i1 undef, i32 %tmp18, i32 undef
+  %tmp20 = icmp sgt i32 %tmp19, 63
+  %tmp21 = sub nsw i32 undef, %tmp3
+  %tmp22 = sub i32 %tmp21, undef
+  %tmp23 = sub nsw i32 undef, %tmp4
+  %tmp24 = sub i32 %tmp23, undef
+  %tmp25 = add nsw i32 %tmp24, -49
+  %tmp26 = icmp sgt i32 %tmp25, undef
+  %tmp27 = select i1 %tmp26, i32 undef, i32 %tmp25
+  %tmp28 = icmp sgt i32 %tmp27, undef
+  %tmp29 = select i1 %tmp28, i32 undef, i32 %tmp27
+  %tmp30 = add nsw i32 %tmp22, -33
+  %tmp31 = icmp sgt i32 %tmp30, undef
+  %tmp32 = select i1 %tmp31, i32 undef, i32 %tmp30
+  %tmp33 = icmp sgt i32 %tmp32, %tmp29
+  %tmp34 = select i1 %tmp33, i32 %tmp29, i32 %tmp32
+  %tmp35 = add nsw i32 %tmp24, -33
+  %tmp36 = icmp sgt i32 %tmp35, undef
+  %tmp37 = select i1 %tmp36, i32 undef, i32 %tmp35
+  %tmp38 = icmp sgt i32 %tmp37, %tmp34
+  %tmp39 = select i1 %tmp38, i32 %tmp34, i32 %tmp37
+  %tmp40 = add nsw i32 %tmp22, -17
+  %tmp41 = icmp sgt i32 %tmp40, undef
+  %tmp42 = select i1 %tmp41, i32 undef, i32 %tmp40
+  %tmp43 = icmp sgt i32 %tmp42, %tmp39
+  %tmp44 = select i1 %tmp43, i32 %tmp39, i32 %tmp42
+  %tmp45 = icmp sgt i32 undef, %tmp44
+  unreachable
+}
+
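
The interesting pattern in the CHECK lines above is the shuffle-based min/max
reduction idiom (icmp sgt/slt feeding selects). In scalar C it is roughly as
follows (an illustrative sketch; the test itself is a reduced reproducer
built around undef values rather than meaningful arithmetic):

  int smax4(const int v[4]) {
    int m = v[0];
    for (int i = 1; i < 4; i++)
      if (v[i] > m) m = v[i];   /* icmp sgt + select */
    return m;
  }

  int smin4(const int v[4]) {
    int m = v[0];
    for (int i = 1; i < 4; i++)
      if (v[i] < m) m = v[i];   /* icmp slt + select */
    return m;
  }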

Added: llvm/trunk/test/Transforms/SLPVectorizer/X86/resched.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/resched.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/resched.ll (added)
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/resched.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,172 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 < %s | FileCheck %s
+
+%"struct.std::array" = type { [32 x i8] }
+
+; Function Attrs: nounwind uwtable
+define fastcc void @_ZN12_GLOBAL__N_127PolynomialMultiplyRecognize9recognizeEv() unnamed_addr #0 align 2 {
+; CHECK-LABEL: @_ZN12_GLOBAL__N_127PolynomialMultiplyRecognize9recognizeEv(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 undef, label [[IF_END50_I:%.*]], label [[IF_THEN22_I:%.*]]
+; CHECK:       if.then22.i:
+; CHECK-NEXT:    [[SUB_I:%.*]] = add nsw i32 undef, -1
+; CHECK-NEXT:    [[CONV31_I:%.*]] = and i32 undef, [[SUB_I]]
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 0
+; CHECK-NEXT:    [[ARRAYIDX_I_I7_1_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 1
+; CHECK-NEXT:    [[ARRAYIDX_I_I7_2_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 2
+; CHECK-NEXT:    [[ARRAYIDX_I_I7_3_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 3
+; CHECK-NEXT:    [[ARRAYIDX_I_I7_4_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 4
+; CHECK-NEXT:    [[ARRAYIDX_I_I7_5_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 5
+; CHECK-NEXT:    [[ARRAYIDX_I_I7_6_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 6
+; CHECK-NEXT:    [[ARRAYIDX_I_I7_7_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 7
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <8 x i32> undef, i32 [[CONV31_I]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[CONV31_I]], i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[CONV31_I]], i32 2
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[CONV31_I]], i32 3
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[CONV31_I]], i32 4
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[CONV31_I]], i32 5
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[CONV31_I]], i32 6
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[CONV31_I]], i32 7
+; CHECK-NEXT:    [[TMP9:%.*]] = lshr <8 x i32> [[TMP8]], <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+; CHECK-NEXT:    [[ARRAYIDX_I_I7_8_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 8
+; CHECK-NEXT:    [[ARRAYIDX_I_I7_9_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 9
+; CHECK-NEXT:    [[ARRAYIDX_I_I7_10_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 10
+; CHECK-NEXT:    [[ARRAYIDX_I_I7_11_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 11
+; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <4 x i32> undef, i32 [[CONV31_I]], i32 0
+; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[CONV31_I]], i32 1
+; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[CONV31_I]], i32 2
+; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[CONV31_I]], i32 3
+; CHECK-NEXT:    [[TMP14:%.*]] = lshr <4 x i32> [[TMP13]], <i32 9, i32 10, i32 11, i32 12>
+; CHECK-NEXT:    [[ARRAYIDX_I_I7_12_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 12
+; CHECK-NEXT:    [[SHR_12_I_I:%.*]] = lshr i32 [[CONV31_I]], 13
+; CHECK-NEXT:    [[ARRAYIDX_I_I7_13_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 13
+; CHECK-NEXT:    [[SHR_13_I_I:%.*]] = lshr i32 [[CONV31_I]], 14
+; CHECK-NEXT:    [[ARRAYIDX_I_I7_14_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 14
+; CHECK-NEXT:    [[SHR_14_I_I:%.*]] = lshr i32 [[CONV31_I]], 15
+; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <16 x i32> undef, i32 [[SUB_I]], i32 0
+; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <8 x i32> [[TMP9]], i32 0
+; CHECK-NEXT:    [[TMP17:%.*]] = insertelement <16 x i32> [[TMP15]], i32 [[TMP16]], i32 1
+; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <8 x i32> [[TMP9]], i32 1
+; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <16 x i32> [[TMP17]], i32 [[TMP18]], i32 2
+; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <8 x i32> [[TMP9]], i32 2
+; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <16 x i32> [[TMP19]], i32 [[TMP20]], i32 3
+; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <8 x i32> [[TMP9]], i32 3
+; CHECK-NEXT:    [[TMP23:%.*]] = insertelement <16 x i32> [[TMP21]], i32 [[TMP22]], i32 4
+; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <8 x i32> [[TMP9]], i32 4
+; CHECK-NEXT:    [[TMP25:%.*]] = insertelement <16 x i32> [[TMP23]], i32 [[TMP24]], i32 5
+; CHECK-NEXT:    [[TMP26:%.*]] = extractelement <8 x i32> [[TMP9]], i32 5
+; CHECK-NEXT:    [[TMP27:%.*]] = insertelement <16 x i32> [[TMP25]], i32 [[TMP26]], i32 6
+; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <8 x i32> [[TMP9]], i32 6
+; CHECK-NEXT:    [[TMP29:%.*]] = insertelement <16 x i32> [[TMP27]], i32 [[TMP28]], i32 7
+; CHECK-NEXT:    [[TMP30:%.*]] = extractelement <8 x i32> [[TMP9]], i32 7
+; CHECK-NEXT:    [[TMP31:%.*]] = insertelement <16 x i32> [[TMP29]], i32 [[TMP30]], i32 8
+; CHECK-NEXT:    [[TMP32:%.*]] = extractelement <4 x i32> [[TMP14]], i32 0
+; CHECK-NEXT:    [[TMP33:%.*]] = insertelement <16 x i32> [[TMP31]], i32 [[TMP32]], i32 9
+; CHECK-NEXT:    [[TMP34:%.*]] = extractelement <4 x i32> [[TMP14]], i32 1
+; CHECK-NEXT:    [[TMP35:%.*]] = insertelement <16 x i32> [[TMP33]], i32 [[TMP34]], i32 10
+; CHECK-NEXT:    [[TMP36:%.*]] = extractelement <4 x i32> [[TMP14]], i32 2
+; CHECK-NEXT:    [[TMP37:%.*]] = insertelement <16 x i32> [[TMP35]], i32 [[TMP36]], i32 11
+; CHECK-NEXT:    [[TMP38:%.*]] = extractelement <4 x i32> [[TMP14]], i32 3
+; CHECK-NEXT:    [[TMP39:%.*]] = insertelement <16 x i32> [[TMP37]], i32 [[TMP38]], i32 12
+; CHECK-NEXT:    [[TMP40:%.*]] = insertelement <16 x i32> [[TMP39]], i32 [[SHR_12_I_I]], i32 13
+; CHECK-NEXT:    [[TMP41:%.*]] = insertelement <16 x i32> [[TMP40]], i32 [[SHR_13_I_I]], i32 14
+; CHECK-NEXT:    [[TMP42:%.*]] = insertelement <16 x i32> [[TMP41]], i32 [[SHR_14_I_I]], i32 15
+; CHECK-NEXT:    [[TMP43:%.*]] = trunc <16 x i32> [[TMP42]] to <16 x i8>
+; CHECK-NEXT:    [[TMP44:%.*]] = and <16 x i8> [[TMP43]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+; CHECK-NEXT:    [[ARRAYIDX_I_I7_15_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 15
+; CHECK-NEXT:    [[TMP45:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
+; CHECK-NEXT:    store <16 x i8> [[TMP44]], <16 x i8>* [[TMP45]], align 1
+; CHECK-NEXT:    unreachable
+; CHECK:       if.end50.i:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br i1 undef, label %if.end50.i, label %if.then22.i
+
+if.then22.i:                                      ; preds = %entry
+  %sub.i = add nsw i32 undef, -1
+  %conv31.i = and i32 undef, %sub.i
+  %0 = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 0
+  %1 = trunc i32 %sub.i to i8
+  %conv.i.i1199 = and i8 %1, 1
+  store i8 %conv.i.i1199, i8* %0, align 1
+  %shr.i.i = lshr i32 %conv31.i, 1
+  %2 = trunc i32 %shr.i.i to i8
+  %conv.1.i.i = and i8 %2, 1
+  %arrayidx.i.i7.1.i.i = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 1
+  store i8 %conv.1.i.i, i8* %arrayidx.i.i7.1.i.i, align 1
+  %shr.1.i.i = lshr i32 %conv31.i, 2
+  %3 = trunc i32 %shr.1.i.i to i8
+  %conv.2.i.i = and i8 %3, 1
+  %arrayidx.i.i7.2.i.i = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 2
+  store i8 %conv.2.i.i, i8* %arrayidx.i.i7.2.i.i, align 1
+  %shr.2.i.i = lshr i32 %conv31.i, 3
+  %4 = trunc i32 %shr.2.i.i to i8
+  %conv.3.i.i = and i8 %4, 1
+  %arrayidx.i.i7.3.i.i = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 3
+  store i8 %conv.3.i.i, i8* %arrayidx.i.i7.3.i.i, align 1
+  %shr.3.i.i = lshr i32 %conv31.i, 4
+  %5 = trunc i32 %shr.3.i.i to i8
+  %conv.4.i.i = and i8 %5, 1
+  %arrayidx.i.i7.4.i.i = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 4
+  store i8 %conv.4.i.i, i8* %arrayidx.i.i7.4.i.i, align 1
+  %shr.4.i.i = lshr i32 %conv31.i, 5
+  %6 = trunc i32 %shr.4.i.i to i8
+  %conv.5.i.i = and i8 %6, 1
+  %arrayidx.i.i7.5.i.i = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 5
+  store i8 %conv.5.i.i, i8* %arrayidx.i.i7.5.i.i, align 1
+  %shr.5.i.i = lshr i32 %conv31.i, 6
+  %7 = trunc i32 %shr.5.i.i to i8
+  %conv.6.i.i = and i8 %7, 1
+  %arrayidx.i.i7.6.i.i = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 6
+  store i8 %conv.6.i.i, i8* %arrayidx.i.i7.6.i.i, align 1
+  %shr.6.i.i = lshr i32 %conv31.i, 7
+  %8 = trunc i32 %shr.6.i.i to i8
+  %conv.7.i.i = and i8 %8, 1
+  %arrayidx.i.i7.7.i.i = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 7
+  store i8 %conv.7.i.i, i8* %arrayidx.i.i7.7.i.i, align 1
+  %shr.7.i.i = lshr i32 %conv31.i, 8
+  %9 = trunc i32 %shr.7.i.i to i8
+  %conv.8.i.i = and i8 %9, 1
+  %arrayidx.i.i7.8.i.i = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 8
+  store i8 %conv.8.i.i, i8* %arrayidx.i.i7.8.i.i, align 1
+  %shr.8.i.i = lshr i32 %conv31.i, 9
+  %10 = trunc i32 %shr.8.i.i to i8
+  %conv.9.i.i = and i8 %10, 1
+  %arrayidx.i.i7.9.i.i = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 9
+  store i8 %conv.9.i.i, i8* %arrayidx.i.i7.9.i.i, align 1
+  %shr.9.i.i = lshr i32 %conv31.i, 10
+  %11 = trunc i32 %shr.9.i.i to i8
+  %conv.10.i.i = and i8 %11, 1
+  %arrayidx.i.i7.10.i.i = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 10
+  store i8 %conv.10.i.i, i8* %arrayidx.i.i7.10.i.i, align 1
+  %shr.10.i.i = lshr i32 %conv31.i, 11
+  %12 = trunc i32 %shr.10.i.i to i8
+  %conv.11.i.i = and i8 %12, 1
+  %arrayidx.i.i7.11.i.i = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 11
+  store i8 %conv.11.i.i, i8* %arrayidx.i.i7.11.i.i, align 1
+  %shr.11.i.i = lshr i32 %conv31.i, 12
+  %13 = trunc i32 %shr.11.i.i to i8
+  %conv.12.i.i = and i8 %13, 1
+  %arrayidx.i.i7.12.i.i = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 12
+  store i8 %conv.12.i.i, i8* %arrayidx.i.i7.12.i.i, align 1
+  %shr.12.i.i = lshr i32 %conv31.i, 13
+  %14 = trunc i32 %shr.12.i.i to i8
+  %conv.13.i.i = and i8 %14, 1
+  %arrayidx.i.i7.13.i.i = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 13
+  store i8 %conv.13.i.i, i8* %arrayidx.i.i7.13.i.i, align 1
+  %shr.13.i.i = lshr i32 %conv31.i, 14
+  %15 = trunc i32 %shr.13.i.i to i8
+  %conv.14.i.i = and i8 %15, 1
+  %arrayidx.i.i7.14.i.i = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 14
+  store i8 %conv.14.i.i, i8* %arrayidx.i.i7.14.i.i, align 1
+  %shr.14.i.i = lshr i32 %conv31.i, 15
+  %16 = trunc i32 %shr.14.i.i to i8
+  %conv.15.i.i = and i8 %16, 1
+  %arrayidx.i.i7.15.i.i = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 15
+  store i8 %conv.15.i.i, i8* %arrayidx.i.i7.15.i.i, align 1
+  unreachable
+
+if.end50.i:                                       ; preds = %entry
+  ret void
+}
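
For reference, the if.then22.i block expands the low bits of a masked value
into an array of 0/1 bytes, roughly as below (a sketch reconstructed from the
IR; the 'mask' parameter stands in for the IR's undef operand):

  void expand_bits(unsigned n, unsigned mask, unsigned char out[16]) {
    unsigned sub = n - 1;
    unsigned conv = mask & sub;
    out[0] = sub & 1;                /* element 0 comes from sub itself */
    for (int k = 1; k < 16; k++)
      out[k] = (conv >> k) & 1;      /* one shifted bit per element */
  }

The sixteen scalar byte stores are what the pass reschedules and merges into
the single <16 x i8> store checked above.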

Added: llvm/trunk/test/Transforms/SLPVectorizer/X86/return.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/return.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/return.ll (added)
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/return.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,70 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -basicaa -slp-vectorizer -S | FileCheck %s
+target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
+target triple = "x86_64--linux-gnu"
+
+@a = common global [4 x double] zeroinitializer, align 8
+@b = common global [4 x double] zeroinitializer, align 8
+
+; double a[4], b[4];
+; double foo() {
+;   double sum = 0;
+;   sum = (a[0]+b[0]) + (a[1]+b[1]);
+;   return sum;
+; }
+
+define double @return1() {
+; CHECK-LABEL: @return1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x double>, <2 x double>* bitcast ([4 x double]* @a to <2 x double>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([4 x double]* @b to <2 x double>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = fadd <2 x double> [[TMP0]], [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x double> [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x double> [[TMP2]], i32 1
+; CHECK-NEXT:    [[ADD2:%.*]] = fadd double [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    ret double [[ADD2]]
+;
+entry:
+  %a0 = load double, double* getelementptr inbounds ([4 x double], [4 x double]* @a, i32 0, i32 0), align 8
+  %b0 = load double, double* getelementptr inbounds ([4 x double], [4 x double]* @b, i32 0, i32 0), align 8
+  %add0 = fadd double %a0, %b0
+  %a1 = load double, double* getelementptr inbounds ([4 x double], [4 x double]* @a, i32 0, i32 1), align 8
+  %b1 = load double, double* getelementptr inbounds ([4 x double], [4 x double]* @b, i32 0, i32 1), align 8
+  %add1 = fadd double %a1, %b1
+  %add2 = fadd double %add0, %add1
+  ret double %add2
+}
+
+; double hadd(double *x) {
+;   return ((x[0] + x[2]) + (x[1] + x[3]));
+; }
+
+define double @return2(double* nocapture readonly %x) {
+; CHECK-LABEL: @return2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds double, double* [[X:%.*]], i32 2
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds double, double* [[X]], i32 1
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast double* [[X]] to <2 x double>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 4
+; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[X]], i32 3
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast double* [[ARRAYIDX1]] to <2 x double>*
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x double> [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x double> [[TMP4]], i32 1
+; CHECK-NEXT:    [[ADD5:%.*]] = fadd double [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    ret double [[ADD5]]
+;
+entry:
+  %x0 = load double, double* %x, align 4
+  %arrayidx1 = getelementptr inbounds double, double* %x, i32 2
+  %x2 = load double, double* %arrayidx1, align 4
+  %add3 = fadd double %x0, %x2
+  %arrayidx2 = getelementptr inbounds double, double* %x, i32 1
+  %x1 = load double, double* %arrayidx2, align 4
+  %arrayidx3 = getelementptr inbounds double, double* %x, i32 3
+  %x3 = load double, double* %arrayidx3, align 4
+  %add4 = fadd double %x1, %x3
+  %add5 = fadd double %add3, %add4
+  ret double %add5
+}

Added: llvm/trunk/test/Transforms/SLPVectorizer/X86/reverse_extract_elements.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/reverse_extract_elements.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/reverse_extract_elements.ll (added)
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/reverse_extract_elements.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,134 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 | FileCheck %s
+
+define float @dotf(<4 x float> %x, <4 x float> %y) {
+; CHECK-LABEL: @dotf(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = fmul fast <4 x float> [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = fadd fast <4 x float> [[TMP0]], [[RDX_SHUF]]
+; CHECK-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <4 x float> [[BIN_RDX]], <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX2:%.*]] = fadd fast <4 x float> [[BIN_RDX]], [[RDX_SHUF1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[BIN_RDX2]], i32 0
+; CHECK-NEXT:    ret float [[TMP1]]
+;
+entry:
+  %vecext = extractelement <4 x float> %x, i32 0
+  %vecext1 = extractelement <4 x float> %y, i32 0
+  %mul = fmul fast float %vecext, %vecext1
+  %vecext.1 = extractelement <4 x float> %x, i32 1
+  %vecext1.1 = extractelement <4 x float> %y, i32 1
+  %mul.1 = fmul fast float %vecext.1, %vecext1.1
+  %add.1 = fadd fast float %mul.1, %mul
+  %vecext.2 = extractelement <4 x float> %x, i32 2
+  %vecext1.2 = extractelement <4 x float> %y, i32 2
+  %mul.2 = fmul fast float %vecext.2, %vecext1.2
+  %add.2 = fadd fast float %mul.2, %add.1
+  %vecext.3 = extractelement <4 x float> %x, i32 3
+  %vecext1.3 = extractelement <4 x float> %y, i32 3
+  %mul.3 = fmul fast float %vecext.3, %vecext1.3
+  %add.3 = fadd fast float %mul.3, %add.2
+  ret float %add.3
+}
+
+define double @dotd(<4 x double>* byval nocapture readonly align 32, <4 x double>* byval nocapture readonly align 32) {
+; CHECK-LABEL: @dotd(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[X:%.*]] = load <4 x double>, <4 x double>* [[TMP0:%.*]], align 32
+; CHECK-NEXT:    [[Y:%.*]] = load <4 x double>, <4 x double>* [[TMP1:%.*]], align 32
+; CHECK-NEXT:    [[TMP2:%.*]] = fmul fast <4 x double> [[X]], [[Y]]
+; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = fadd fast <4 x double> [[TMP2]], [[RDX_SHUF]]
+; CHECK-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <4 x double> [[BIN_RDX]], <4 x double> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX2:%.*]] = fadd fast <4 x double> [[BIN_RDX]], [[RDX_SHUF1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x double> [[BIN_RDX2]], i32 0
+; CHECK-NEXT:    ret double [[TMP3]]
+;
+entry:
+  %x = load <4 x double>, <4 x double>* %0, align 32
+  %y = load <4 x double>, <4 x double>* %1, align 32
+  %vecext = extractelement <4 x double> %x, i32 0
+  %vecext1 = extractelement <4 x double> %y, i32 0
+  %mul = fmul fast double %vecext, %vecext1
+  %vecext.1 = extractelement <4 x double> %x, i32 1
+  %vecext1.1 = extractelement <4 x double> %y, i32 1
+  %mul.1 = fmul fast double %vecext.1, %vecext1.1
+  %add.1 = fadd fast double %mul.1, %mul
+  %vecext.2 = extractelement <4 x double> %x, i32 2
+  %vecext1.2 = extractelement <4 x double> %y, i32 2
+  %mul.2 = fmul fast double %vecext.2, %vecext1.2
+  %add.2 = fadd fast double %mul.2, %add.1
+  %vecext.3 = extractelement <4 x double> %x, i32 3
+  %vecext1.3 = extractelement <4 x double> %y, i32 3
+  %mul.3 = fmul fast double %vecext.3, %vecext1.3
+  %add.3 = fadd fast double %mul.3, %add.2
+  ret double %add.3
+}
+
+define float @dotfq(<4 x float>* nocapture readonly %x, <4 x float>* nocapture readonly %y) {
+; CHECK-LABEL: @dotfq(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[X:%.*]], align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[Y:%.*]], align 16
+; CHECK-NEXT:    [[TMP2:%.*]] = fmul fast <4 x float> [[TMP1]], [[TMP0]]
+; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = fadd fast <4 x float> [[TMP2]], [[RDX_SHUF]]
+; CHECK-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <4 x float> [[BIN_RDX]], <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX2:%.*]] = fadd fast <4 x float> [[BIN_RDX]], [[RDX_SHUF1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[BIN_RDX2]], i32 0
+; CHECK-NEXT:    ret float [[TMP3]]
+;
+entry:
+  %0 = load <4 x float>, <4 x float>* %x, align 16
+  %1 = load <4 x float>, <4 x float>* %y, align 16
+  %vecext = extractelement <4 x float> %0, i32 0
+  %vecext1 = extractelement <4 x float> %1, i32 0
+  %mul = fmul fast float %vecext1, %vecext
+  %vecext.1 = extractelement <4 x float> %0, i32 1
+  %vecext1.1 = extractelement <4 x float> %1, i32 1
+  %mul.1 = fmul fast float %vecext1.1, %vecext.1
+  %add.1 = fadd fast float %mul.1, %mul
+  %vecext.2 = extractelement <4 x float> %0, i32 2
+  %vecext1.2 = extractelement <4 x float> %1, i32 2
+  %mul.2 = fmul fast float %vecext1.2, %vecext.2
+  %add.2 = fadd fast float %mul.2, %add.1
+  %vecext.3 = extractelement <4 x float> %0, i32 3
+  %vecext1.3 = extractelement <4 x float> %1, i32 3
+  %mul.3 = fmul fast float %vecext1.3, %vecext.3
+  %add.3 = fadd fast float %mul.3, %add.2
+  ret float %add.3
+}
+
+define double @dotdq(<4 x double>* nocapture readonly %x, <4 x double>* nocapture readonly %y) {
+; CHECK-LABEL: @dotdq(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x double>, <4 x double>* [[X:%.*]], align 32
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x double>, <4 x double>* [[Y:%.*]], align 32
+; CHECK-NEXT:    [[TMP2:%.*]] = fmul fast <4 x double> [[TMP1]], [[TMP0]]
+; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = fadd fast <4 x double> [[TMP2]], [[RDX_SHUF]]
+; CHECK-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <4 x double> [[BIN_RDX]], <4 x double> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX2:%.*]] = fadd fast <4 x double> [[BIN_RDX]], [[RDX_SHUF1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x double> [[BIN_RDX2]], i32 0
+; CHECK-NEXT:    ret double [[TMP3]]
+;
+entry:
+  %0 = load <4 x double>, <4 x double>* %x, align 32
+  %1 = load <4 x double>, <4 x double>* %y, align 32
+  %vecext = extractelement <4 x double> %0, i32 0
+  %vecext1 = extractelement <4 x double> %1, i32 0
+  %mul = fmul fast double %vecext1, %vecext
+  %vecext.1 = extractelement <4 x double> %0, i32 1
+  %vecext1.1 = extractelement <4 x double> %1, i32 1
+  %mul.1 = fmul fast double %vecext1.1, %vecext.1
+  %add.1 = fadd fast double %mul.1, %mul
+  %vecext.2 = extractelement <4 x double> %0, i32 2
+  %vecext1.2 = extractelement <4 x double> %1, i32 2
+  %mul.2 = fmul fast double %vecext1.2, %vecext.2
+  %add.2 = fadd fast double %mul.2, %add.1
+  %vecext.3 = extractelement <4 x double> %0, i32 3
+  %vecext1.3 = extractelement <4 x double> %1, i32 3
+  %mul.3 = fmul fast double %vecext1.3, %vecext.3
+  %add.3 = fadd fast double %mul.3, %add.2
+  ret double %add.3
+}
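
For reference, all four functions above compute the same 4-element dot
product; they differ only in element type and in whether the operands arrive
by value or through pointers. Roughly (a scalar sketch; illustrative):

  float dotf(const float x[4], const float y[4]) {
    float sum = 0.0f;
    for (int i = 0; i < 4; i++)
      sum += x[i] * y[i];   /* fmul fast + fadd fast per lane */
    return sum;
  }

The chains of extractelements in the input are recognized and turned back
into a vector multiply followed by the shuffle-based horizontal add checked
above.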

Added: llvm/trunk/test/Transforms/SLPVectorizer/X86/rgb_phi.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/rgb_phi.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/rgb_phi.ll (added)
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/rgb_phi.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,104 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=i386-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32-S128"
+target triple = "i386-apple-macosx10.9.0"
+
+; We disable the vectorization of <3 x float> for now
+
+; float foo(float *A) {
+;
+;   float R = A[0];
+;   float G = A[1];
+;   float B = A[2];
+;   for (int i=0; i < 121; i+=3) {
+;     R+=A[i+0]*7;
+;     G+=A[i+1]*8;
+;     B+=A[i+2]*9;
+;   }
+;
+;   return R+G+B;
+; }
+
+define float @foo(float* nocapture readonly %A) {
+; CHECK-LABEL: @foo(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load float, float* [[A:%.*]], align 4
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds float, float* [[A]], i64 1
+; CHECK-NEXT:    [[TMP1:%.*]] = load float, float* [[ARRAYIDX1]], align 4
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A]], i64 2
+; CHECK-NEXT:    [[TMP2:%.*]] = load float, float* [[ARRAYIDX2]], align 4
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[TMP3:%.*]] = phi float [ [[TMP0]], [[ENTRY:%.*]] ], [ [[DOTPRE:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE:%.*]] ]
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE]] ]
+; CHECK-NEXT:    [[B_032:%.*]] = phi float [ [[TMP2]], [[ENTRY]] ], [ [[ADD14:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE]] ]
+; CHECK-NEXT:    [[G_031:%.*]] = phi float [ [[TMP1]], [[ENTRY]] ], [ [[ADD9:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE]] ]
+; CHECK-NEXT:    [[R_030:%.*]] = phi float [ [[TMP0]], [[ENTRY]] ], [ [[ADD4:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE]] ]
+; CHECK-NEXT:    [[MUL:%.*]] = fmul float [[TMP3]], 7.000000e+00
+; CHECK-NEXT:    [[ADD4]] = fadd float [[R_030]], [[MUL]]
+; CHECK-NEXT:    [[TMP4:%.*]] = add nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP5:%.*]] = load float, float* [[ARRAYIDX7]], align 4
+; CHECK-NEXT:    [[MUL8:%.*]] = fmul float [[TMP5]], 8.000000e+00
+; CHECK-NEXT:    [[ADD9]] = fadd float [[G_031]], [[MUL8]]
+; CHECK-NEXT:    [[TMP6:%.*]] = add nsw i64 [[INDVARS_IV]], 2
+; CHECK-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP6]]
+; CHECK-NEXT:    [[TMP7:%.*]] = load float, float* [[ARRAYIDX12]], align 4
+; CHECK-NEXT:    [[MUL13:%.*]] = fmul float [[TMP7]], 9.000000e+00
+; CHECK-NEXT:    [[ADD14]] = fadd float [[B_032]], [[MUL13]]
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 3
+; CHECK-NEXT:    [[TMP8:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[TMP8]], 121
+; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY_FOR_BODY_CRIT_EDGE]], label [[FOR_END:%.*]]
+; CHECK:       for.body.for.body_crit_edge:
+; CHECK-NEXT:    [[ARRAYIDX3_PHI_TRANS_INSERT:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV_NEXT]]
+; CHECK-NEXT:    [[DOTPRE]] = load float, float* [[ARRAYIDX3_PHI_TRANS_INSERT]], align 4
+; CHECK-NEXT:    br label [[FOR_BODY]]
+; CHECK:       for.end:
+; CHECK-NEXT:    [[ADD16:%.*]] = fadd float [[ADD4]], [[ADD9]]
+; CHECK-NEXT:    [[ADD17:%.*]] = fadd float [[ADD16]], [[ADD14]]
+; CHECK-NEXT:    ret float [[ADD17]]
+;
+entry:
+  %0 = load float, float* %A, align 4
+  %arrayidx1 = getelementptr inbounds float, float* %A, i64 1
+  %1 = load float, float* %arrayidx1, align 4
+  %arrayidx2 = getelementptr inbounds float, float* %A, i64 2
+  %2 = load float, float* %arrayidx2, align 4
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.for.body_crit_edge, %entry
+  %3 = phi float [ %0, %entry ], [ %.pre, %for.body.for.body_crit_edge ]
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body.for.body_crit_edge ]
+  %B.032 = phi float [ %2, %entry ], [ %add14, %for.body.for.body_crit_edge ]
+  %G.031 = phi float [ %1, %entry ], [ %add9, %for.body.for.body_crit_edge ]
+  %R.030 = phi float [ %0, %entry ], [ %add4, %for.body.for.body_crit_edge ]
+  %mul = fmul float %3, 7.000000e+00
+  %add4 = fadd float %R.030, %mul
+  %4 = add nsw i64 %indvars.iv, 1
+  %arrayidx7 = getelementptr inbounds float, float* %A, i64 %4
+  %5 = load float, float* %arrayidx7, align 4
+  %mul8 = fmul float %5, 8.000000e+00
+  %add9 = fadd float %G.031, %mul8
+  %6 = add nsw i64 %indvars.iv, 2
+  %arrayidx12 = getelementptr inbounds float, float* %A, i64 %6
+  %7 = load float, float* %arrayidx12, align 4
+  %mul13 = fmul float %7, 9.000000e+00
+  %add14 = fadd float %B.032, %mul13
+  %indvars.iv.next = add i64 %indvars.iv, 3
+  %8 = trunc i64 %indvars.iv.next to i32
+  %cmp = icmp slt i32 %8, 121
+  br i1 %cmp, label %for.body.for.body_crit_edge, label %for.end
+
+for.body.for.body_crit_edge:                      ; preds = %for.body
+  %arrayidx3.phi.trans.insert = getelementptr inbounds float, float* %A, i64 %indvars.iv.next
+  %.pre = load float, float* %arrayidx3.phi.trans.insert, align 4
+  br label %for.body
+
+for.end:                                          ; preds = %for.body
+  %add16 = fadd float %add4, %add9
+  %add17 = fadd float %add16, %add14
+  ret float %add17
+}
+

Added: llvm/trunk/test/Transforms/SLPVectorizer/X86/saxpy.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/saxpy.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/saxpy.ll (added)
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/saxpy.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,90 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+; SLP vectorization example from http://cs.stanford.edu/people/eschkufz/research/asplos291-schkufza.pdf
+define void @SAXPY(i32* noalias nocapture %x, i32* noalias nocapture %y, i32 %a, i64 %i) {
+; CHECK-LABEL: @SAXPY(
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[X:%.*]], i64 [[I:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[Y:%.*]], i64 [[I]]
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP1]] to <4 x i32>*
+; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x i32> undef, i32 [[A:%.*]], i32 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[A]], i32 1
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[A]], i32 2
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[A]], i32 3
+; CHECK-NEXT:    [[TMP9:%.*]] = mul nsw <4 x i32> [[TMP4]], [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
+; CHECK-NEXT:    [[TMP11:%.*]] = load <4 x i32>, <4 x i32>* [[TMP10]], align 4
+; CHECK-NEXT:    [[TMP12:%.*]] = add nsw <4 x i32> [[TMP9]], [[TMP11]]
+; CHECK-NEXT:    [[TMP13:%.*]] = bitcast i32* [[TMP1]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[TMP12]], <4 x i32>* [[TMP13]], align 4
+; CHECK-NEXT:    ret void
+;
+  %1 = getelementptr inbounds i32, i32* %x, i64 %i
+  %2 = load i32, i32* %1, align 4
+  %3 = mul nsw i32 %2, %a
+  %4 = getelementptr inbounds i32, i32* %y, i64 %i
+  %5 = load i32, i32* %4, align 4
+  %6 = add nsw i32 %3, %5
+  store i32 %6, i32* %1, align 4
+  %7 = add i64 %i, 1
+  %8 = getelementptr inbounds i32, i32* %x, i64 %7
+  %9 = load i32, i32* %8, align 4
+  %10 = mul nsw i32 %9, %a
+  %11 = getelementptr inbounds i32, i32* %y, i64 %7
+  %12 = load i32, i32* %11, align 4
+  %13 = add nsw i32 %10, %12
+  store i32 %13, i32* %8, align 4
+  %14 = add i64 %i, 2
+  %15 = getelementptr inbounds i32, i32* %x, i64 %14
+  %16 = load i32, i32* %15, align 4
+  %17 = mul nsw i32 %16, %a
+  %18 = getelementptr inbounds i32, i32* %y, i64 %14
+  %19 = load i32, i32* %18, align 4
+  %20 = add nsw i32 %17, %19
+  store i32 %20, i32* %15, align 4
+  %21 = add i64 %i, 3
+  %22 = getelementptr inbounds i32, i32* %x, i64 %21
+  %23 = load i32, i32* %22, align 4
+  %24 = mul nsw i32 %23, %a
+  %25 = getelementptr inbounds i32, i32* %y, i64 %21
+  %26 = load i32, i32* %25, align 4
+  %27 = add nsw i32 %24, %26
+  store i32 %27, i32* %22, align 4
+  ret void
+}
+
+; Make sure we don't crash on this one.
+define void @SAXPY_crash(i32* noalias nocapture %x, i32* noalias nocapture %y, i64 %i) {
+; CHECK-LABEL: @SAXPY_crash(
+; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[I:%.*]], 1
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[X:%.*]], i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[Y:%.*]], i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = add nsw i32 undef, [[TMP4]]
+; CHECK-NEXT:    store i32 [[TMP5]], i32* [[TMP2]], align 4
+; CHECK-NEXT:    [[TMP6:%.*]] = add i64 [[I]], 2
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[Y]], i64 [[TMP6]]
+; CHECK-NEXT:    [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4
+; CHECK-NEXT:    [[TMP10:%.*]] = add nsw i32 undef, [[TMP9]]
+; CHECK-NEXT:    store i32 [[TMP10]], i32* [[TMP7]], align 4
+; CHECK-NEXT:    ret void
+;
+  %1 = add i64 %i, 1
+  %2 = getelementptr inbounds i32, i32* %x, i64 %1
+  %3 = getelementptr inbounds i32, i32* %y, i64 %1
+  %4 = load i32, i32* %3, align 4
+  %5 = add nsw i32 undef, %4
+  store i32 %5, i32* %2, align 4
+  %6 = add i64 %i, 2
+  %7 = getelementptr inbounds i32, i32* %x, i64 %6
+  %8 = getelementptr inbounds i32, i32* %y, i64 %6
+  %9 = load i32, i32* %8, align 4
+  %10 = add nsw i32 undef, %9
+  store i32 %10, i32* %7, align 4
+  ret void
+}

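The @SAXPY test above pins the fully vectorized form: four consecutive x[i+k] = x[i+k] * a + y[i+k] groups collapse into one <4 x i32> load of x, a broadcast of %a built from four insertelements, a mul, a <4 x i32> load of y, an add, and a single store back to x, while @SAXPY_crash is only a no-crash regression input with undef operands. A minimal sketch of the same scalar shape reduced to two lanes (the function name is hypothetical, and whether a 2-wide tree is profitable under this pipeline's cost model is an assumption, not something the test checks):

define void @saxpy2(i32* noalias %x, i32* noalias %y, i32 %a, i64 %i) {
  ; two adjacent x[i+k] = x[i+k] * a + y[i+k] groups, the shape SLP matches
  %px0 = getelementptr inbounds i32, i32* %x, i64 %i
  %lx0 = load i32, i32* %px0, align 4
  %m0 = mul nsw i32 %lx0, %a
  %py0 = getelementptr inbounds i32, i32* %y, i64 %i
  %ly0 = load i32, i32* %py0, align 4
  %s0 = add nsw i32 %m0, %ly0
  store i32 %s0, i32* %px0, align 4
  %i1 = add i64 %i, 1
  %px1 = getelementptr inbounds i32, i32* %x, i64 %i1
  %lx1 = load i32, i32* %px1, align 4
  %m1 = mul nsw i32 %lx1, %a
  %py1 = getelementptr inbounds i32, i32* %y, i64 %i1
  %ly1 = load i32, i32* %py1, align 4
  %s1 = add nsw i32 %m1, %ly1
  store i32 %s1, i32* %px1, align 4
  ret void
}
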
Added: llvm/trunk/test/Transforms/SLPVectorizer/X86/schedule-bundle.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/schedule-bundle.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/schedule-bundle.ll (added)
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/schedule-bundle.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,53 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -slp-vectorizer -slp-vectorizer -mcpu=bdver1 < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@a = common local_unnamed_addr global [1 x i32] zeroinitializer, align 4
+@b = common local_unnamed_addr global [1 x i32] zeroinitializer, align 4
+
+define i32 @slp_schedule_bundle() local_unnamed_addr #0 {
+; CHECK-LABEL: @slp_schedule_bundle(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([1 x i32]* @b to <4 x i32>*), align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = lshr <4 x i32> [[TMP0]], <i32 31, i32 31, i32 31, i32 31>
+; CHECK-NEXT:    [[TMP2:%.*]] = xor <4 x i32> [[TMP1]], <i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT:    store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([1 x i32]* @a to <4 x i32>*), align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* getelementptr ([1 x i32], [1 x i32]* @b, i64 4, i64 0), align 4
+; CHECK-NEXT:    [[DOTLOBIT_4:%.*]] = lshr i32 [[TMP3]], 31
+; CHECK-NEXT:    [[DOTLOBIT_NOT_4:%.*]] = xor i32 [[DOTLOBIT_4]], 1
+; CHECK-NEXT:    store i32 [[DOTLOBIT_NOT_4]], i32* getelementptr ([1 x i32], [1 x i32]* @a, i64 4, i64 0), align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, i32* getelementptr ([1 x i32], [1 x i32]* @b, i64 5, i64 0), align 4
+; CHECK-NEXT:    [[DOTLOBIT_5:%.*]] = lshr i32 [[TMP4]], 31
+; CHECK-NEXT:    [[DOTLOBIT_NOT_5:%.*]] = xor i32 [[DOTLOBIT_5]], 1
+; CHECK-NEXT:    store i32 [[DOTLOBIT_NOT_5]], i32* getelementptr ([1 x i32], [1 x i32]* @a, i64 5, i64 0), align 4
+; CHECK-NEXT:    ret i32 undef
+;
+entry:
+  %0 = load i32, i32* getelementptr inbounds ([1 x i32], [1 x i32]* @b, i64 0, i64 0), align 4
+  %.lobit = lshr i32 %0, 31
+  %.lobit.not = xor i32 %.lobit, 1
+  store i32 %.lobit.not, i32* getelementptr inbounds ([1 x i32], [1 x i32]* @a, i64 0, i64 0), align 4
+  %1 = load i32, i32* getelementptr inbounds ([1 x i32], [1 x i32]* @b, i64 1, i64 0), align 4
+  %.lobit.1 = lshr i32 %1, 31
+  %.lobit.not.1 = xor i32 %.lobit.1, 1
+  store i32 %.lobit.not.1, i32* getelementptr inbounds ([1 x i32], [1 x i32]* @a, i64 1, i64 0), align 4
+  %2 = load i32, i32* getelementptr ([1 x i32], [1 x i32]* @b, i64 2, i64 0), align 4
+  %.lobit.2 = lshr i32 %2, 31
+  %.lobit.not.2 = xor i32 %.lobit.2, 1
+  store i32 %.lobit.not.2, i32* getelementptr ([1 x i32], [1 x i32]* @a, i64 2, i64 0), align 4
+  %3 = load i32, i32* getelementptr ([1 x i32], [1 x i32]* @b, i64 3, i64 0), align 4
+  %.lobit.3 = lshr i32 %3, 31
+  %.lobit.not.3 = xor i32 %.lobit.3, 1
+  store i32 %.lobit.not.3, i32* getelementptr ([1 x i32], [1 x i32]* @a, i64 3, i64 0), align 4
+  %4 = load i32, i32* getelementptr ([1 x i32], [1 x i32]* @b, i64 4, i64 0), align 4
+  %.lobit.4 = lshr i32 %4, 31
+  %.lobit.not.4 = xor i32 %.lobit.4, 1
+  store i32 %.lobit.not.4, i32* getelementptr ([1 x i32], [1 x i32]* @a, i64 4, i64 0), align 4
+  %5 = load i32, i32* getelementptr ([1 x i32], [1 x i32]* @b, i64 5, i64 0), align 4
+  %.lobit.5 = lshr i32 %5, 31
+  %.lobit.not.5 = xor i32 %.lobit.5, 1
+  store i32 %.lobit.not.5, i32* getelementptr ([1 x i32], [1 x i32]* @a, i64 5, i64 0), align 4
+  ret i32 undef
+}

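The scalar body of @slp_schedule_bundle above computes (b[k] >> 31) ^ 1, i.e. the complement of the sign bit, for six array elements; the CHECK lines show elements 0-3 folding into <4 x i32> operations while elements 4 and 5 stay scalar. A standalone sketch of the vector core the test expects (illustrative only; the function name is made up):

define <4 x i32> @not_sign_bit(<4 x i32> %v) {
  ; lane-wise (v >> 31) ^ 1: 0 for negative lanes, 1 otherwise
  %lobit = lshr <4 x i32> %v, <i32 31, i32 31, i32 31, i32 31>
  %not = xor <4 x i32> %lobit, <i32 1, i32 1, i32 1, i32 1>
  ret <4 x i32> %not
}
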
Added: llvm/trunk/test/Transforms/SLPVectorizer/X86/schedule_budget.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/schedule_budget.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/schedule_budget.ll (added)
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/schedule_budget.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,138 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -basicaa -slp-vectorizer -S -slp-schedule-budget=16 -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.9.0"
+
+; Test if the budget for the scheduling region size works.
+; We test with a reduced budget of 16, which should prevent vectorizing the loads.
+
+declare void @unknown()
+
+define void @test(float * %a, float * %b, float * %c, float * %d) {
+; CHECK-LABEL: @test(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[L0:%.*]] = load float, float* [[A:%.*]]
+; CHECK-NEXT:    [[A1:%.*]] = getelementptr inbounds float, float* [[A]], i64 1
+; CHECK-NEXT:    [[L1:%.*]] = load float, float* [[A1]]
+; CHECK-NEXT:    [[A2:%.*]] = getelementptr inbounds float, float* [[A]], i64 2
+; CHECK-NEXT:    [[L2:%.*]] = load float, float* [[A2]]
+; CHECK-NEXT:    [[A3:%.*]] = getelementptr inbounds float, float* [[A]], i64 3
+; CHECK-NEXT:    [[L3:%.*]] = load float, float* [[A3]]
+; CHECK-NEXT:    call void @unknown()
+; CHECK-NEXT:    call void @unknown()
+; CHECK-NEXT:    call void @unknown()
+; CHECK-NEXT:    call void @unknown()
+; CHECK-NEXT:    call void @unknown()
+; CHECK-NEXT:    call void @unknown()
+; CHECK-NEXT:    call void @unknown()
+; CHECK-NEXT:    call void @unknown()
+; CHECK-NEXT:    call void @unknown()
+; CHECK-NEXT:    call void @unknown()
+; CHECK-NEXT:    call void @unknown()
+; CHECK-NEXT:    call void @unknown()
+; CHECK-NEXT:    call void @unknown()
+; CHECK-NEXT:    call void @unknown()
+; CHECK-NEXT:    call void @unknown()
+; CHECK-NEXT:    call void @unknown()
+; CHECK-NEXT:    call void @unknown()
+; CHECK-NEXT:    call void @unknown()
+; CHECK-NEXT:    call void @unknown()
+; CHECK-NEXT:    call void @unknown()
+; CHECK-NEXT:    call void @unknown()
+; CHECK-NEXT:    call void @unknown()
+; CHECK-NEXT:    call void @unknown()
+; CHECK-NEXT:    call void @unknown()
+; CHECK-NEXT:    call void @unknown()
+; CHECK-NEXT:    call void @unknown()
+; CHECK-NEXT:    call void @unknown()
+; CHECK-NEXT:    call void @unknown()
+; CHECK-NEXT:    store float [[L0]], float* [[B:%.*]]
+; CHECK-NEXT:    [[B1:%.*]] = getelementptr inbounds float, float* [[B]], i64 1
+; CHECK-NEXT:    store float [[L1]], float* [[B1]]
+; CHECK-NEXT:    [[B2:%.*]] = getelementptr inbounds float, float* [[B]], i64 2
+; CHECK-NEXT:    store float [[L2]], float* [[B2]]
+; CHECK-NEXT:    [[B3:%.*]] = getelementptr inbounds float, float* [[B]], i64 3
+; CHECK-NEXT:    store float [[L3]], float* [[B3]]
+; CHECK-NEXT:    [[C1:%.*]] = getelementptr inbounds float, float* [[C:%.*]], i64 1
+; CHECK-NEXT:    [[C2:%.*]] = getelementptr inbounds float, float* [[C]], i64 2
+; CHECK-NEXT:    [[C3:%.*]] = getelementptr inbounds float, float* [[C]], i64 3
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast float* [[C]] to <4 x float>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
+; CHECK-NEXT:    [[D1:%.*]] = getelementptr inbounds float, float* [[D:%.*]], i64 1
+; CHECK-NEXT:    [[D2:%.*]] = getelementptr inbounds float, float* [[D]], i64 2
+; CHECK-NEXT:    [[D3:%.*]] = getelementptr inbounds float, float* [[D]], i64 3
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast float* [[D]] to <4 x float>*
+; CHECK-NEXT:    store <4 x float> [[TMP1]], <4 x float>* [[TMP2]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  ; Don't vectorize these loads.
+  %l0 = load float, float* %a
+  %a1 = getelementptr inbounds float, float* %a, i64 1
+  %l1 = load float, float* %a1
+  %a2 = getelementptr inbounds float, float* %a, i64 2
+  %l2 = load float, float* %a2
+  %a3 = getelementptr inbounds float, float* %a, i64 3
+  %l3 = load float, float* %a3
+
+  ; Some unrelated instructions in between to enlarge the scheduling region.
+  call void @unknown()
+  call void @unknown()
+  call void @unknown()
+  call void @unknown()
+  call void @unknown()
+  call void @unknown()
+  call void @unknown()
+  call void @unknown()
+  call void @unknown()
+  call void @unknown()
+  call void @unknown()
+  call void @unknown()
+  call void @unknown()
+  call void @unknown()
+  call void @unknown()
+  call void @unknown()
+  call void @unknown()
+  call void @unknown()
+  call void @unknown()
+  call void @unknown()
+  call void @unknown()
+  call void @unknown()
+  call void @unknown()
+  call void @unknown()
+  call void @unknown()
+  call void @unknown()
+  call void @unknown()
+  call void @unknown()
+
+  ; Don't vectorize these stores because their operands are too far away.
+  store float %l0, float* %b
+  %b1 = getelementptr inbounds float, float* %b, i64 1
+  store float %l1, float* %b1
+  %b2 = getelementptr inbounds float, float* %b, i64 2
+  store float %l2, float* %b2
+  %b3 = getelementptr inbounds float, float* %b, i64 3
+  store float %l3, float* %b3
+
+  ; But still vectorize the following instructions, because even if the budget
+  ; is exceeded, there is a minimum region size (see the sketch below).
+  %l4 = load float, float* %c
+  %c1 = getelementptr inbounds float, float* %c, i64 1
+  %l5 = load float, float* %c1
+  %c2 = getelementptr inbounds float, float* %c, i64 2
+  %l6 = load float, float* %c2
+  %c3 = getelementptr inbounds float, float* %c, i64 3
+  %l7 = load float, float* %c3
+
+  store float %l4, float* %d
+  %d1 = getelementptr inbounds float, float* %d, i64 1
+  store float %l5, float* %d1
+  %d2 = getelementptr inbounds float, float* %d, i64 2
+  store float %l6, float* %d2
+  %d3 = getelementptr inbounds float, float* %d, i64 3
+  store float %l7, float* %d3
+
+  ret void
+}
+

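The schedule_budget test above exercises two scheduler behaviors at once: with -slp-schedule-budget=16, the 28 calls to @unknown blow the budget for the a->b group, so its loads and stores stay scalar, while the adjacent c->d group still becomes a <4 x float> load and store because a minimum region size is always granted. A minimal sketch of the kind of input that should fit within any budget, since nothing separates its loads from its stores (hypothetical function, not part of the test):

define void @copy4(float* noalias %src, float* noalias %dst) {
  ; a 4-wide load/store pair with no interleaved calls: the scheduling
  ; region is tiny, so the budget cannot be exhausted here
  %s1 = getelementptr inbounds float, float* %src, i64 1
  %s2 = getelementptr inbounds float, float* %src, i64 2
  %s3 = getelementptr inbounds float, float* %src, i64 3
  %l0 = load float, float* %src, align 4
  %l1 = load float, float* %s1, align 4
  %l2 = load float, float* %s2, align 4
  %l3 = load float, float* %s3, align 4
  %d1 = getelementptr inbounds float, float* %dst, i64 1
  %d2 = getelementptr inbounds float, float* %dst, i64 2
  %d3 = getelementptr inbounds float, float* %dst, i64 3
  store float %l0, float* %dst, align 4
  store float %l1, float* %d1, align 4
  store float %l2, float* %d2, align 4
  store float %l3, float* %d3, align 4
  ret void
}
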
Added: llvm/trunk/test/Transforms/SLPVectorizer/X86/scheduling.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/scheduling.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/scheduling.ll (added)
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/scheduling.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,78 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -basicaa -slp-vectorizer -S -mtriple=i386-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+
+define i32 @foo(i32* nocapture readonly %diff) #0 {
+; CHECK-LABEL: @foo(
+; CHECK: load <4 x i32>
+; CHECK: load <4 x i32>
+; CHECK: [[S1:%.+]] = add nsw <4 x i32>
+; CHECK: store <4 x i32> [[S1]],
+; CHECK:         [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[S1]], <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = add nsw <4 x i32> [[S1]], [[RDX_SHUF]]
+; CHECK-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[BIN_RDX]], <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX2:%.*]] = add nsw <4 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
+; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <4 x i32> [[BIN_RDX2]], i32 0
+; CHECK:         [[ADD52:%.*]] = add nsw i32 [[TMP15]],
+; CHECK:         ret i32 [[ADD52]]
+;
+entry:
+  %m2 = alloca [8 x [8 x i32]], align 16
+  %0 = bitcast [8 x [8 x i32]]* %m2 to i8*
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %a.088 = phi i32 [ 0, %entry ], [ %add52, %for.body ]
+  %1 = shl i64 %indvars.iv, 3
+  %arrayidx = getelementptr inbounds i32, i32* %diff, i64 %1
+  %2 = load i32, i32* %arrayidx, align 4
+  %3 = or i64 %1, 4
+  %arrayidx2 = getelementptr inbounds i32, i32* %diff, i64 %3
+  %4 = load i32, i32* %arrayidx2, align 4
+  %add3 = add nsw i32 %4, %2
+  %arrayidx6 = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* %m2, i64 0, i64 %indvars.iv, i64 0
+  store i32 %add3, i32* %arrayidx6, align 16
+  %add10 = add nsw i32 %add3, %a.088
+  %5 = or i64 %1, 1
+  %arrayidx13 = getelementptr inbounds i32, i32* %diff, i64 %5
+  %6 = load i32, i32* %arrayidx13, align 4
+  %7 = or i64 %1, 5
+  %arrayidx16 = getelementptr inbounds i32, i32* %diff, i64 %7
+  %8 = load i32, i32* %arrayidx16, align 4
+  %add17 = add nsw i32 %8, %6
+  %arrayidx20 = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* %m2, i64 0, i64 %indvars.iv, i64 1
+  store i32 %add17, i32* %arrayidx20, align 4
+  %add24 = add nsw i32 %add10, %add17
+  %9 = or i64 %1, 2
+  %arrayidx27 = getelementptr inbounds i32, i32* %diff, i64 %9
+  %10 = load i32, i32* %arrayidx27, align 4
+  %11 = or i64 %1, 6
+  %arrayidx30 = getelementptr inbounds i32, i32* %diff, i64 %11
+  %12 = load i32, i32* %arrayidx30, align 4
+  %add31 = add nsw i32 %12, %10
+  %arrayidx34 = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* %m2, i64 0, i64 %indvars.iv, i64 2
+  store i32 %add31, i32* %arrayidx34, align 8
+  %add38 = add nsw i32 %add24, %add31
+  %13 = or i64 %1, 3
+  %arrayidx41 = getelementptr inbounds i32, i32* %diff, i64 %13
+  %14 = load i32, i32* %arrayidx41, align 4
+  %15 = or i64 %1, 7
+  %arrayidx44 = getelementptr inbounds i32, i32* %diff, i64 %15
+  %16 = load i32, i32* %arrayidx44, align 4
+  %add45 = add nsw i32 %16, %14
+  %arrayidx48 = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* %m2, i64 0, i64 %indvars.iv, i64 3
+  store i32 %add45, i32* %arrayidx48, align 4
+  %add52 = add nsw i32 %add38, %add45
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 8
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  %arraydecay = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* %m2, i64 0, i64 0
+  call void @ff([8 x i32]* %arraydecay) #1
+  ret i32 %add52
+}
+
+declare void @ff([8 x i32]*) #2
+
+

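The tail of @foo's CHECK lines above is the interesting part: after the vectorized adds are stored, the four lane sums are combined with a shufflevector/add ladder, a log2-step horizontal reduction, before being folded into the scalar accumulator. A self-contained sketch of that ladder for a single <4 x i32> (the function name is hypothetical):

define i32 @hadd4(<4 x i32> %v) {
  ; step 1: fold the high lane pair onto the low lane pair
  %s1 = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  %r1 = add nsw <4 x i32> %v, %s1
  ; step 2: fold lane 1 onto lane 0; lane 0 then holds v0+v1+v2+v3
  %s2 = shufflevector <4 x i32> %r1, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %r2 = add nsw <4 x i32> %r1, %s2
  %sum = extractelement <4 x i32> %r2, i32 0
  ret i32 %sum
}
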
Added: llvm/trunk/test/Transforms/SLPVectorizer/X86/sext.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/sext.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/sext.ll (added)
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/sext.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,911 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -mtriple=x86_64-unknown -basicaa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -basicaa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE,SLM
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basicaa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -basicaa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX512,AVX512F
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=+avx512bw -basicaa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX512,AVX512BW
+
+;
+; vXi8
+;
+
+define <2 x i64> @loadext_2i8_to_2i64(i8* %p0) {
+; SSE2-LABEL: @loadext_2i8_to_2i64(
+; SSE2-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; SSE2-NEXT:    [[I0:%.*]] = load i8, i8* [[P0]], align 1
+; SSE2-NEXT:    [[I1:%.*]] = load i8, i8* [[P1]], align 1
+; SSE2-NEXT:    [[X0:%.*]] = sext i8 [[I0]] to i64
+; SSE2-NEXT:    [[X1:%.*]] = sext i8 [[I1]] to i64
+; SSE2-NEXT:    [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[X0]], i32 0
+; SSE2-NEXT:    [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[X1]], i32 1
+; SSE2-NEXT:    ret <2 x i64> [[V1]]
+;
+; SLM-LABEL: @loadext_2i8_to_2i64(
+; SLM-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; SLM-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <2 x i8>*
+; SLM-NEXT:    [[TMP2:%.*]] = load <2 x i8>, <2 x i8>* [[TMP1]], align 1
+; SLM-NEXT:    [[TMP3:%.*]] = sext <2 x i8> [[TMP2]] to <2 x i64>
+; SLM-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
+; SLM-NEXT:    [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[TMP4]], i32 0
+; SLM-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
+; SLM-NEXT:    [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1
+; SLM-NEXT:    ret <2 x i64> [[V1]]
+;
+; AVX-LABEL: @loadext_2i8_to_2i64(
+; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; AVX-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <2 x i8>*
+; AVX-NEXT:    [[TMP2:%.*]] = load <2 x i8>, <2 x i8>* [[TMP1]], align 1
+; AVX-NEXT:    [[TMP3:%.*]] = sext <2 x i8> [[TMP2]] to <2 x i64>
+; AVX-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
+; AVX-NEXT:    [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[TMP4]], i32 0
+; AVX-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
+; AVX-NEXT:    [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1
+; AVX-NEXT:    ret <2 x i64> [[V1]]
+;
+  %p1 = getelementptr inbounds i8, i8* %p0, i64 1
+  %i0 = load i8, i8* %p0, align 1
+  %i1 = load i8, i8* %p1, align 1
+  %x0 = sext i8 %i0 to i64
+  %x1 = sext i8 %i1 to i64
+  %v0 = insertelement <2 x i64> undef, i64 %x0, i32 0
+  %v1 = insertelement <2 x i64>   %v0, i64 %x1, i32 1
+  ret <2 x i64> %v1
+}
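+
+; An illustrative aside, not checked by any prefix above (the function name is
+; hypothetical): the vector form that the SLM and AVX prefixes pin is simply a
+; narrow vector load widened by one sext, instead of per-lane loads and sexts;
+; the SSE2 prefix documents where the cost model still prefers the scalar form.
+define <2 x i64> @loadext_2i8_to_2i64_sketch(<2 x i8>* %pv) {
+  %w = load <2 x i8>, <2 x i8>* %pv, align 1
+  %e = sext <2 x i8> %w to <2 x i64>
+  ret <2 x i64> %e
+}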
+
+define <4 x i32> @loadext_4i8_to_4i32(i8* %p0) {
+; SSE2-LABEL: @loadext_4i8_to_4i32(
+; SSE2-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; SSE2-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
+; SSE2-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
+; SSE2-NEXT:    [[I0:%.*]] = load i8, i8* [[P0]], align 1
+; SSE2-NEXT:    [[I1:%.*]] = load i8, i8* [[P1]], align 1
+; SSE2-NEXT:    [[I2:%.*]] = load i8, i8* [[P2]], align 1
+; SSE2-NEXT:    [[I3:%.*]] = load i8, i8* [[P3]], align 1
+; SSE2-NEXT:    [[X0:%.*]] = sext i8 [[I0]] to i32
+; SSE2-NEXT:    [[X1:%.*]] = sext i8 [[I1]] to i32
+; SSE2-NEXT:    [[X2:%.*]] = sext i8 [[I2]] to i32
+; SSE2-NEXT:    [[X3:%.*]] = sext i8 [[I3]] to i32
+; SSE2-NEXT:    [[V0:%.*]] = insertelement <4 x i32> undef, i32 [[X0]], i32 0
+; SSE2-NEXT:    [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[X1]], i32 1
+; SSE2-NEXT:    [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[X2]], i32 2
+; SSE2-NEXT:    [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[X3]], i32 3
+; SSE2-NEXT:    ret <4 x i32> [[V3]]
+;
+; SLM-LABEL: @loadext_4i8_to_4i32(
+; SLM-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; SLM-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
+; SLM-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
+; SLM-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>*
+; SLM-NEXT:    [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
+; SLM-NEXT:    [[TMP3:%.*]] = sext <4 x i8> [[TMP2]] to <4 x i32>
+; SLM-NEXT:    [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0
+; SLM-NEXT:    [[V0:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0
+; SLM-NEXT:    [[TMP5:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1
+; SLM-NEXT:    [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[TMP5]], i32 1
+; SLM-NEXT:    [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2
+; SLM-NEXT:    [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[TMP6]], i32 2
+; SLM-NEXT:    [[TMP7:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3
+; SLM-NEXT:    [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[TMP7]], i32 3
+; SLM-NEXT:    ret <4 x i32> [[V3]]
+;
+; AVX-LABEL: @loadext_4i8_to_4i32(
+; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; AVX-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
+; AVX-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
+; AVX-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>*
+; AVX-NEXT:    [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
+; AVX-NEXT:    [[TMP3:%.*]] = sext <4 x i8> [[TMP2]] to <4 x i32>
+; AVX-NEXT:    [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0
+; AVX-NEXT:    [[V0:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0
+; AVX-NEXT:    [[TMP5:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1
+; AVX-NEXT:    [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[TMP5]], i32 1
+; AVX-NEXT:    [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2
+; AVX-NEXT:    [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[TMP6]], i32 2
+; AVX-NEXT:    [[TMP7:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3
+; AVX-NEXT:    [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[TMP7]], i32 3
+; AVX-NEXT:    ret <4 x i32> [[V3]]
+;
+  %p1 = getelementptr inbounds i8, i8* %p0, i64 1
+  %p2 = getelementptr inbounds i8, i8* %p0, i64 2
+  %p3 = getelementptr inbounds i8, i8* %p0, i64 3
+  %i0 = load i8, i8* %p0, align 1
+  %i1 = load i8, i8* %p1, align 1
+  %i2 = load i8, i8* %p2, align 1
+  %i3 = load i8, i8* %p3, align 1
+  %x0 = sext i8 %i0 to i32
+  %x1 = sext i8 %i1 to i32
+  %x2 = sext i8 %i2 to i32
+  %x3 = sext i8 %i3 to i32
+  %v0 = insertelement <4 x i32> undef, i32 %x0, i32 0
+  %v1 = insertelement <4 x i32>   %v0, i32 %x1, i32 1
+  %v2 = insertelement <4 x i32>   %v1, i32 %x2, i32 2
+  %v3 = insertelement <4 x i32>   %v2, i32 %x3, i32 3
+  ret <4 x i32> %v3
+}
+
+define <4 x i64> @loadext_4i8_to_4i64(i8* %p0) {
+; SSE2-LABEL: @loadext_4i8_to_4i64(
+; SSE2-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; SSE2-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
+; SSE2-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
+; SSE2-NEXT:    [[I0:%.*]] = load i8, i8* [[P0]], align 1
+; SSE2-NEXT:    [[I1:%.*]] = load i8, i8* [[P1]], align 1
+; SSE2-NEXT:    [[I2:%.*]] = load i8, i8* [[P2]], align 1
+; SSE2-NEXT:    [[I3:%.*]] = load i8, i8* [[P3]], align 1
+; SSE2-NEXT:    [[X0:%.*]] = sext i8 [[I0]] to i64
+; SSE2-NEXT:    [[X1:%.*]] = sext i8 [[I1]] to i64
+; SSE2-NEXT:    [[X2:%.*]] = sext i8 [[I2]] to i64
+; SSE2-NEXT:    [[X3:%.*]] = sext i8 [[I3]] to i64
+; SSE2-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[X0]], i32 0
+; SSE2-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[X1]], i32 1
+; SSE2-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2
+; SSE2-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
+; SSE2-NEXT:    ret <4 x i64> [[V3]]
+;
+; SLM-LABEL: @loadext_4i8_to_4i64(
+; SLM-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; SLM-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
+; SLM-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
+; SLM-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>*
+; SLM-NEXT:    [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
+; SLM-NEXT:    [[TMP3:%.*]] = sext <4 x i8> [[TMP2]] to <4 x i64>
+; SLM-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
+; SLM-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
+; SLM-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
+; SLM-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
+; SLM-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
+; SLM-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
+; SLM-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
+; SLM-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
+; SLM-NEXT:    ret <4 x i64> [[V3]]
+;
+; AVX-LABEL: @loadext_4i8_to_4i64(
+; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; AVX-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
+; AVX-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
+; AVX-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <2 x i8>*
+; AVX-NEXT:    [[TMP2:%.*]] = load <2 x i8>, <2 x i8>* [[TMP1]], align 1
+; AVX-NEXT:    [[I2:%.*]] = load i8, i8* [[P2]], align 1
+; AVX-NEXT:    [[I3:%.*]] = load i8, i8* [[P3]], align 1
+; AVX-NEXT:    [[TMP3:%.*]] = sext <2 x i8> [[TMP2]] to <2 x i64>
+; AVX-NEXT:    [[X2:%.*]] = sext i8 [[I2]] to i64
+; AVX-NEXT:    [[X3:%.*]] = sext i8 [[I3]] to i64
+; AVX-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
+; AVX-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
+; AVX-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
+; AVX-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
+; AVX-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2
+; AVX-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
+; AVX-NEXT:    ret <4 x i64> [[V3]]
+;
+  %p1 = getelementptr inbounds i8, i8* %p0, i64 1
+  %p2 = getelementptr inbounds i8, i8* %p0, i64 2
+  %p3 = getelementptr inbounds i8, i8* %p0, i64 3
+  %i0 = load i8, i8* %p0, align 1
+  %i1 = load i8, i8* %p1, align 1
+  %i2 = load i8, i8* %p2, align 1
+  %i3 = load i8, i8* %p3, align 1
+  %x0 = sext i8 %i0 to i64
+  %x1 = sext i8 %i1 to i64
+  %x2 = sext i8 %i2 to i64
+  %x3 = sext i8 %i3 to i64
+  %v0 = insertelement <4 x i64> undef, i64 %x0, i32 0
+  %v1 = insertelement <4 x i64>   %v0, i64 %x1, i32 1
+  %v2 = insertelement <4 x i64>   %v1, i64 %x2, i32 2
+  %v3 = insertelement <4 x i64>   %v2, i64 %x3, i32 3
+  ret <4 x i64> %v3
+}
+
+define <8 x i16> @loadext_8i8_to_8i16(i8* %p0) {
+; CHECK-LABEL: @loadext_8i8_to_8i16(
+; CHECK-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; CHECK-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
+; CHECK-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
+; CHECK-NEXT:    [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4
+; CHECK-NEXT:    [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5
+; CHECK-NEXT:    [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6
+; CHECK-NEXT:    [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <8 x i8>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = sext <8 x i8> [[TMP2]] to <8 x i16>
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0
+; CHECK-NEXT:    [[V0:%.*]] = insertelement <8 x i16> undef, i16 [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <8 x i16> [[TMP3]], i32 1
+; CHECK-NEXT:    [[V1:%.*]] = insertelement <8 x i16> [[V0]], i16 [[TMP5]], i32 1
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <8 x i16> [[TMP3]], i32 2
+; CHECK-NEXT:    [[V2:%.*]] = insertelement <8 x i16> [[V1]], i16 [[TMP6]], i32 2
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <8 x i16> [[TMP3]], i32 3
+; CHECK-NEXT:    [[V3:%.*]] = insertelement <8 x i16> [[V2]], i16 [[TMP7]], i32 3
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <8 x i16> [[TMP3]], i32 4
+; CHECK-NEXT:    [[V4:%.*]] = insertelement <8 x i16> [[V3]], i16 [[TMP8]], i32 4
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <8 x i16> [[TMP3]], i32 5
+; CHECK-NEXT:    [[V5:%.*]] = insertelement <8 x i16> [[V4]], i16 [[TMP9]], i32 5
+; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <8 x i16> [[TMP3]], i32 6
+; CHECK-NEXT:    [[V6:%.*]] = insertelement <8 x i16> [[V5]], i16 [[TMP10]], i32 6
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <8 x i16> [[TMP3]], i32 7
+; CHECK-NEXT:    [[V7:%.*]] = insertelement <8 x i16> [[V6]], i16 [[TMP11]], i32 7
+; CHECK-NEXT:    ret <8 x i16> [[V7]]
+;
+  %p1 = getelementptr inbounds i8, i8* %p0, i64 1
+  %p2 = getelementptr inbounds i8, i8* %p0, i64 2
+  %p3 = getelementptr inbounds i8, i8* %p0, i64 3
+  %p4 = getelementptr inbounds i8, i8* %p0, i64 4
+  %p5 = getelementptr inbounds i8, i8* %p0, i64 5
+  %p6 = getelementptr inbounds i8, i8* %p0, i64 6
+  %p7 = getelementptr inbounds i8, i8* %p0, i64 7
+  %i0 = load i8, i8* %p0, align 1
+  %i1 = load i8, i8* %p1, align 1
+  %i2 = load i8, i8* %p2, align 1
+  %i3 = load i8, i8* %p3, align 1
+  %i4 = load i8, i8* %p4, align 1
+  %i5 = load i8, i8* %p5, align 1
+  %i6 = load i8, i8* %p6, align 1
+  %i7 = load i8, i8* %p7, align 1
+  %x0 = sext i8 %i0 to i16
+  %x1 = sext i8 %i1 to i16
+  %x2 = sext i8 %i2 to i16
+  %x3 = sext i8 %i3 to i16
+  %x4 = sext i8 %i4 to i16
+  %x5 = sext i8 %i5 to i16
+  %x6 = sext i8 %i6 to i16
+  %x7 = sext i8 %i7 to i16
+  %v0 = insertelement <8 x i16> undef, i16 %x0, i32 0
+  %v1 = insertelement <8 x i16>   %v0, i16 %x1, i32 1
+  %v2 = insertelement <8 x i16>   %v1, i16 %x2, i32 2
+  %v3 = insertelement <8 x i16>   %v2, i16 %x3, i32 3
+  %v4 = insertelement <8 x i16>   %v3, i16 %x4, i32 4
+  %v5 = insertelement <8 x i16>   %v4, i16 %x5, i32 5
+  %v6 = insertelement <8 x i16>   %v5, i16 %x6, i32 6
+  %v7 = insertelement <8 x i16>   %v6, i16 %x7, i32 7
+  ret <8 x i16> %v7
+}
+
+define <8 x i32> @loadext_8i8_to_8i32(i8* %p0) {
+; SSE-LABEL: @loadext_8i8_to_8i32(
+; SSE-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; SSE-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
+; SSE-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
+; SSE-NEXT:    [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4
+; SSE-NEXT:    [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5
+; SSE-NEXT:    [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6
+; SSE-NEXT:    [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7
+; SSE-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <8 x i8>*
+; SSE-NEXT:    [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1
+; SSE-NEXT:    [[TMP3:%.*]] = sext <8 x i8> [[TMP2]] to <8 x i32>
+; SSE-NEXT:    [[TMP4:%.*]] = extractelement <8 x i32> [[TMP3]], i32 0
+; SSE-NEXT:    [[V0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP4]], i32 0
+; SSE-NEXT:    [[TMP5:%.*]] = extractelement <8 x i32> [[TMP3]], i32 1
+; SSE-NEXT:    [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[TMP5]], i32 1
+; SSE-NEXT:    [[TMP6:%.*]] = extractelement <8 x i32> [[TMP3]], i32 2
+; SSE-NEXT:    [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[TMP6]], i32 2
+; SSE-NEXT:    [[TMP7:%.*]] = extractelement <8 x i32> [[TMP3]], i32 3
+; SSE-NEXT:    [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[TMP7]], i32 3
+; SSE-NEXT:    [[TMP8:%.*]] = extractelement <8 x i32> [[TMP3]], i32 4
+; SSE-NEXT:    [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[TMP8]], i32 4
+; SSE-NEXT:    [[TMP9:%.*]] = extractelement <8 x i32> [[TMP3]], i32 5
+; SSE-NEXT:    [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[TMP9]], i32 5
+; SSE-NEXT:    [[TMP10:%.*]] = extractelement <8 x i32> [[TMP3]], i32 6
+; SSE-NEXT:    [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[TMP10]], i32 6
+; SSE-NEXT:    [[TMP11:%.*]] = extractelement <8 x i32> [[TMP3]], i32 7
+; SSE-NEXT:    [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[TMP11]], i32 7
+; SSE-NEXT:    ret <8 x i32> [[V7]]
+;
+; AVX1-LABEL: @loadext_8i8_to_8i32(
+; AVX1-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; AVX1-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
+; AVX1-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
+; AVX1-NEXT:    [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4
+; AVX1-NEXT:    [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5
+; AVX1-NEXT:    [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6
+; AVX1-NEXT:    [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7
+; AVX1-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>*
+; AVX1-NEXT:    [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
+; AVX1-NEXT:    [[I4:%.*]] = load i8, i8* [[P4]], align 1
+; AVX1-NEXT:    [[I5:%.*]] = load i8, i8* [[P5]], align 1
+; AVX1-NEXT:    [[I6:%.*]] = load i8, i8* [[P6]], align 1
+; AVX1-NEXT:    [[I7:%.*]] = load i8, i8* [[P7]], align 1
+; AVX1-NEXT:    [[TMP3:%.*]] = sext <4 x i8> [[TMP2]] to <4 x i32>
+; AVX1-NEXT:    [[X4:%.*]] = sext i8 [[I4]] to i32
+; AVX1-NEXT:    [[X5:%.*]] = sext i8 [[I5]] to i32
+; AVX1-NEXT:    [[X6:%.*]] = sext i8 [[I6]] to i32
+; AVX1-NEXT:    [[X7:%.*]] = sext i8 [[I7]] to i32
+; AVX1-NEXT:    [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0
+; AVX1-NEXT:    [[V0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP4]], i32 0
+; AVX1-NEXT:    [[TMP5:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1
+; AVX1-NEXT:    [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[TMP5]], i32 1
+; AVX1-NEXT:    [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2
+; AVX1-NEXT:    [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[TMP6]], i32 2
+; AVX1-NEXT:    [[TMP7:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3
+; AVX1-NEXT:    [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[TMP7]], i32 3
+; AVX1-NEXT:    [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[X4]], i32 4
+; AVX1-NEXT:    [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[X5]], i32 5
+; AVX1-NEXT:    [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[X6]], i32 6
+; AVX1-NEXT:    [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[X7]], i32 7
+; AVX1-NEXT:    ret <8 x i32> [[V7]]
+;
+; AVX2-LABEL: @loadext_8i8_to_8i32(
+; AVX2-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; AVX2-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
+; AVX2-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
+; AVX2-NEXT:    [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4
+; AVX2-NEXT:    [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5
+; AVX2-NEXT:    [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6
+; AVX2-NEXT:    [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7
+; AVX2-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <8 x i8>*
+; AVX2-NEXT:    [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1
+; AVX2-NEXT:    [[TMP3:%.*]] = sext <8 x i8> [[TMP2]] to <8 x i32>
+; AVX2-NEXT:    [[TMP4:%.*]] = extractelement <8 x i32> [[TMP3]], i32 0
+; AVX2-NEXT:    [[V0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP4]], i32 0
+; AVX2-NEXT:    [[TMP5:%.*]] = extractelement <8 x i32> [[TMP3]], i32 1
+; AVX2-NEXT:    [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[TMP5]], i32 1
+; AVX2-NEXT:    [[TMP6:%.*]] = extractelement <8 x i32> [[TMP3]], i32 2
+; AVX2-NEXT:    [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[TMP6]], i32 2
+; AVX2-NEXT:    [[TMP7:%.*]] = extractelement <8 x i32> [[TMP3]], i32 3
+; AVX2-NEXT:    [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[TMP7]], i32 3
+; AVX2-NEXT:    [[TMP8:%.*]] = extractelement <8 x i32> [[TMP3]], i32 4
+; AVX2-NEXT:    [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[TMP8]], i32 4
+; AVX2-NEXT:    [[TMP9:%.*]] = extractelement <8 x i32> [[TMP3]], i32 5
+; AVX2-NEXT:    [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[TMP9]], i32 5
+; AVX2-NEXT:    [[TMP10:%.*]] = extractelement <8 x i32> [[TMP3]], i32 6
+; AVX2-NEXT:    [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[TMP10]], i32 6
+; AVX2-NEXT:    [[TMP11:%.*]] = extractelement <8 x i32> [[TMP3]], i32 7
+; AVX2-NEXT:    [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[TMP11]], i32 7
+; AVX2-NEXT:    ret <8 x i32> [[V7]]
+;
+; AVX512-LABEL: @loadext_8i8_to_8i32(
+; AVX512-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; AVX512-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
+; AVX512-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
+; AVX512-NEXT:    [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4
+; AVX512-NEXT:    [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5
+; AVX512-NEXT:    [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6
+; AVX512-NEXT:    [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7
+; AVX512-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <8 x i8>*
+; AVX512-NEXT:    [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1
+; AVX512-NEXT:    [[TMP3:%.*]] = sext <8 x i8> [[TMP2]] to <8 x i32>
+; AVX512-NEXT:    [[TMP4:%.*]] = extractelement <8 x i32> [[TMP3]], i32 0
+; AVX512-NEXT:    [[V0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP4]], i32 0
+; AVX512-NEXT:    [[TMP5:%.*]] = extractelement <8 x i32> [[TMP3]], i32 1
+; AVX512-NEXT:    [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[TMP5]], i32 1
+; AVX512-NEXT:    [[TMP6:%.*]] = extractelement <8 x i32> [[TMP3]], i32 2
+; AVX512-NEXT:    [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[TMP6]], i32 2
+; AVX512-NEXT:    [[TMP7:%.*]] = extractelement <8 x i32> [[TMP3]], i32 3
+; AVX512-NEXT:    [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[TMP7]], i32 3
+; AVX512-NEXT:    [[TMP8:%.*]] = extractelement <8 x i32> [[TMP3]], i32 4
+; AVX512-NEXT:    [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[TMP8]], i32 4
+; AVX512-NEXT:    [[TMP9:%.*]] = extractelement <8 x i32> [[TMP3]], i32 5
+; AVX512-NEXT:    [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[TMP9]], i32 5
+; AVX512-NEXT:    [[TMP10:%.*]] = extractelement <8 x i32> [[TMP3]], i32 6
+; AVX512-NEXT:    [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[TMP10]], i32 6
+; AVX512-NEXT:    [[TMP11:%.*]] = extractelement <8 x i32> [[TMP3]], i32 7
+; AVX512-NEXT:    [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[TMP11]], i32 7
+; AVX512-NEXT:    ret <8 x i32> [[V7]]
+;
+  %p1 = getelementptr inbounds i8, i8* %p0, i64 1
+  %p2 = getelementptr inbounds i8, i8* %p0, i64 2
+  %p3 = getelementptr inbounds i8, i8* %p0, i64 3
+  %p4 = getelementptr inbounds i8, i8* %p0, i64 4
+  %p5 = getelementptr inbounds i8, i8* %p0, i64 5
+  %p6 = getelementptr inbounds i8, i8* %p0, i64 6
+  %p7 = getelementptr inbounds i8, i8* %p0, i64 7
+  %i0 = load i8, i8* %p0, align 1
+  %i1 = load i8, i8* %p1, align 1
+  %i2 = load i8, i8* %p2, align 1
+  %i3 = load i8, i8* %p3, align 1
+  %i4 = load i8, i8* %p4, align 1
+  %i5 = load i8, i8* %p5, align 1
+  %i6 = load i8, i8* %p6, align 1
+  %i7 = load i8, i8* %p7, align 1
+  %x0 = sext i8 %i0 to i32
+  %x1 = sext i8 %i1 to i32
+  %x2 = sext i8 %i2 to i32
+  %x3 = sext i8 %i3 to i32
+  %x4 = sext i8 %i4 to i32
+  %x5 = sext i8 %i5 to i32
+  %x6 = sext i8 %i6 to i32
+  %x7 = sext i8 %i7 to i32
+  %v0 = insertelement <8 x i32> undef, i32 %x0, i32 0
+  %v1 = insertelement <8 x i32>   %v0, i32 %x1, i32 1
+  %v2 = insertelement <8 x i32>   %v1, i32 %x2, i32 2
+  %v3 = insertelement <8 x i32>   %v2, i32 %x3, i32 3
+  %v4 = insertelement <8 x i32>   %v3, i32 %x4, i32 4
+  %v5 = insertelement <8 x i32>   %v4, i32 %x5, i32 5
+  %v6 = insertelement <8 x i32>   %v5, i32 %x6, i32 6
+  %v7 = insertelement <8 x i32>   %v6, i32 %x7, i32 7
+  ret <8 x i32> %v7
+}
+
+define <16 x i16> @loadext_16i8_to_16i16(i8* %p0) {
+; CHECK-LABEL: @loadext_16i8_to_16i16(
+; CHECK-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; CHECK-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
+; CHECK-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
+; CHECK-NEXT:    [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4
+; CHECK-NEXT:    [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5
+; CHECK-NEXT:    [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6
+; CHECK-NEXT:    [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7
+; CHECK-NEXT:    [[P8:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 8
+; CHECK-NEXT:    [[P9:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 9
+; CHECK-NEXT:    [[P10:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 10
+; CHECK-NEXT:    [[P11:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 11
+; CHECK-NEXT:    [[P12:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 12
+; CHECK-NEXT:    [[P13:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 13
+; CHECK-NEXT:    [[P14:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 14
+; CHECK-NEXT:    [[P15:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 15
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <16 x i8>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[TMP1]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = sext <16 x i8> [[TMP2]] to <16 x i16>
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <16 x i16> [[TMP3]], i32 0
+; CHECK-NEXT:    [[V0:%.*]] = insertelement <16 x i16> undef, i16 [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <16 x i16> [[TMP3]], i32 1
+; CHECK-NEXT:    [[V1:%.*]] = insertelement <16 x i16> [[V0]], i16 [[TMP5]], i32 1
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <16 x i16> [[TMP3]], i32 2
+; CHECK-NEXT:    [[V2:%.*]] = insertelement <16 x i16> [[V1]], i16 [[TMP6]], i32 2
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <16 x i16> [[TMP3]], i32 3
+; CHECK-NEXT:    [[V3:%.*]] = insertelement <16 x i16> [[V2]], i16 [[TMP7]], i32 3
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <16 x i16> [[TMP3]], i32 4
+; CHECK-NEXT:    [[V4:%.*]] = insertelement <16 x i16> [[V3]], i16 [[TMP8]], i32 4
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <16 x i16> [[TMP3]], i32 5
+; CHECK-NEXT:    [[V5:%.*]] = insertelement <16 x i16> [[V4]], i16 [[TMP9]], i32 5
+; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <16 x i16> [[TMP3]], i32 6
+; CHECK-NEXT:    [[V6:%.*]] = insertelement <16 x i16> [[V5]], i16 [[TMP10]], i32 6
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <16 x i16> [[TMP3]], i32 7
+; CHECK-NEXT:    [[V7:%.*]] = insertelement <16 x i16> [[V6]], i16 [[TMP11]], i32 7
+; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <16 x i16> [[TMP3]], i32 8
+; CHECK-NEXT:    [[V8:%.*]] = insertelement <16 x i16> [[V7]], i16 [[TMP12]], i32 8
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <16 x i16> [[TMP3]], i32 9
+; CHECK-NEXT:    [[V9:%.*]] = insertelement <16 x i16> [[V8]], i16 [[TMP13]], i32 9
+; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <16 x i16> [[TMP3]], i32 10
+; CHECK-NEXT:    [[V10:%.*]] = insertelement <16 x i16> [[V9]], i16 [[TMP14]], i32 10
+; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <16 x i16> [[TMP3]], i32 11
+; CHECK-NEXT:    [[V11:%.*]] = insertelement <16 x i16> [[V10]], i16 [[TMP15]], i32 11
+; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <16 x i16> [[TMP3]], i32 12
+; CHECK-NEXT:    [[V12:%.*]] = insertelement <16 x i16> [[V11]], i16 [[TMP16]], i32 12
+; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <16 x i16> [[TMP3]], i32 13
+; CHECK-NEXT:    [[V13:%.*]] = insertelement <16 x i16> [[V12]], i16 [[TMP17]], i32 13
+; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <16 x i16> [[TMP3]], i32 14
+; CHECK-NEXT:    [[V14:%.*]] = insertelement <16 x i16> [[V13]], i16 [[TMP18]], i32 14
+; CHECK-NEXT:    [[TMP19:%.*]] = extractelement <16 x i16> [[TMP3]], i32 15
+; CHECK-NEXT:    [[V15:%.*]] = insertelement <16 x i16> [[V14]], i16 [[TMP19]], i32 15
+; CHECK-NEXT:    ret <16 x i16> [[V15]]
+;
+  %p1  = getelementptr inbounds i8, i8* %p0, i64 1
+  %p2  = getelementptr inbounds i8, i8* %p0, i64 2
+  %p3  = getelementptr inbounds i8, i8* %p0, i64 3
+  %p4  = getelementptr inbounds i8, i8* %p0, i64 4
+  %p5  = getelementptr inbounds i8, i8* %p0, i64 5
+  %p6  = getelementptr inbounds i8, i8* %p0, i64 6
+  %p7  = getelementptr inbounds i8, i8* %p0, i64 7
+  %p8  = getelementptr inbounds i8, i8* %p0, i64 8
+  %p9  = getelementptr inbounds i8, i8* %p0, i64 9
+  %p10 = getelementptr inbounds i8, i8* %p0, i64 10
+  %p11 = getelementptr inbounds i8, i8* %p0, i64 11
+  %p12 = getelementptr inbounds i8, i8* %p0, i64 12
+  %p13 = getelementptr inbounds i8, i8* %p0, i64 13
+  %p14 = getelementptr inbounds i8, i8* %p0, i64 14
+  %p15 = getelementptr inbounds i8, i8* %p0, i64 15
+  %i0  = load i8, i8* %p0,  align 1
+  %i1  = load i8, i8* %p1,  align 1
+  %i2  = load i8, i8* %p2,  align 1
+  %i3  = load i8, i8* %p3,  align 1
+  %i4  = load i8, i8* %p4,  align 1
+  %i5  = load i8, i8* %p5,  align 1
+  %i6  = load i8, i8* %p6,  align 1
+  %i7  = load i8, i8* %p7,  align 1
+  %i8  = load i8, i8* %p8,  align 1
+  %i9  = load i8, i8* %p9,  align 1
+  %i10 = load i8, i8* %p10, align 1
+  %i11 = load i8, i8* %p11, align 1
+  %i12 = load i8, i8* %p12, align 1
+  %i13 = load i8, i8* %p13, align 1
+  %i14 = load i8, i8* %p14, align 1
+  %i15 = load i8, i8* %p15, align 1
+  %x0  = sext i8 %i0  to i16
+  %x1  = sext i8 %i1  to i16
+  %x2  = sext i8 %i2  to i16
+  %x3  = sext i8 %i3  to i16
+  %x4  = sext i8 %i4  to i16
+  %x5  = sext i8 %i5  to i16
+  %x6  = sext i8 %i6  to i16
+  %x7  = sext i8 %i7  to i16
+  %x8  = sext i8 %i8  to i16
+  %x9  = sext i8 %i9  to i16
+  %x10 = sext i8 %i10 to i16
+  %x11 = sext i8 %i11 to i16
+  %x12 = sext i8 %i12 to i16
+  %x13 = sext i8 %i13 to i16
+  %x14 = sext i8 %i14 to i16
+  %x15 = sext i8 %i15 to i16
+  %v0  = insertelement <16 x i16> undef, i16 %x0,  i32 0
+  %v1  = insertelement <16 x i16>  %v0,  i16 %x1,  i32 1
+  %v2  = insertelement <16 x i16>  %v1,  i16 %x2,  i32 2
+  %v3  = insertelement <16 x i16>  %v2,  i16 %x3,  i32 3
+  %v4  = insertelement <16 x i16>  %v3,  i16 %x4,  i32 4
+  %v5  = insertelement <16 x i16>  %v4,  i16 %x5,  i32 5
+  %v6  = insertelement <16 x i16>  %v5,  i16 %x6,  i32 6
+  %v7  = insertelement <16 x i16>  %v6,  i16 %x7,  i32 7
+  %v8  = insertelement <16 x i16>  %v7,  i16 %x8,  i32 8
+  %v9  = insertelement <16 x i16>  %v8,  i16 %x9,  i32 9
+  %v10 = insertelement <16 x i16>  %v9,  i16 %x10, i32 10
+  %v11 = insertelement <16 x i16>  %v10, i16 %x11, i32 11
+  %v12 = insertelement <16 x i16>  %v11, i16 %x12, i32 12
+  %v13 = insertelement <16 x i16>  %v12, i16 %x13, i32 13
+  %v14 = insertelement <16 x i16>  %v13, i16 %x14, i32 14
+  %v15 = insertelement <16 x i16>  %v14, i16 %x15, i32 15
+  ret <16 x i16> %v15
+}
+
+;
+; vXi16
+;
+
+define <2 x i64> @loadext_2i16_to_2i64(i16* %p0) {
+; SSE2-LABEL: @loadext_2i16_to_2i64(
+; SSE2-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
+; SSE2-NEXT:    [[I0:%.*]] = load i16, i16* [[P0]], align 1
+; SSE2-NEXT:    [[I1:%.*]] = load i16, i16* [[P1]], align 1
+; SSE2-NEXT:    [[X0:%.*]] = sext i16 [[I0]] to i64
+; SSE2-NEXT:    [[X1:%.*]] = sext i16 [[I1]] to i64
+; SSE2-NEXT:    [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[X0]], i32 0
+; SSE2-NEXT:    [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[X1]], i32 1
+; SSE2-NEXT:    ret <2 x i64> [[V1]]
+;
+; SLM-LABEL: @loadext_2i16_to_2i64(
+; SLM-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
+; SLM-NEXT:    [[TMP1:%.*]] = bitcast i16* [[P0]] to <2 x i16>*
+; SLM-NEXT:    [[TMP2:%.*]] = load <2 x i16>, <2 x i16>* [[TMP1]], align 1
+; SLM-NEXT:    [[TMP3:%.*]] = sext <2 x i16> [[TMP2]] to <2 x i64>
+; SLM-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
+; SLM-NEXT:    [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[TMP4]], i32 0
+; SLM-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
+; SLM-NEXT:    [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1
+; SLM-NEXT:    ret <2 x i64> [[V1]]
+;
+; AVX-LABEL: @loadext_2i16_to_2i64(
+; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
+; AVX-NEXT:    [[TMP1:%.*]] = bitcast i16* [[P0]] to <2 x i16>*
+; AVX-NEXT:    [[TMP2:%.*]] = load <2 x i16>, <2 x i16>* [[TMP1]], align 1
+; AVX-NEXT:    [[TMP3:%.*]] = sext <2 x i16> [[TMP2]] to <2 x i64>
+; AVX-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
+; AVX-NEXT:    [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[TMP4]], i32 0
+; AVX-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
+; AVX-NEXT:    [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1
+; AVX-NEXT:    ret <2 x i64> [[V1]]
+;
+  %p1 = getelementptr inbounds i16, i16* %p0, i64 1
+  %i0 = load i16, i16* %p0, align 1
+  %i1 = load i16, i16* %p1, align 1
+  %x0 = sext i16 %i0 to i64
+  %x1 = sext i16 %i1 to i64
+  %v0 = insertelement <2 x i64> undef, i64 %x0, i32 0
+  %v1 = insertelement <2 x i64>   %v0, i64 %x1, i32 1
+  ret <2 x i64> %v1
+}
+
+define <4 x i32> @loadext_4i16_to_4i32(i16* %p0) {
+; CHECK-LABEL: @loadext_4i16_to_4i32(
+; CHECK-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
+; CHECK-NEXT:    [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
+; CHECK-NEXT:    [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = sext <4 x i16> [[TMP2]] to <4 x i32>
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0
+; CHECK-NEXT:    [[V0:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1
+; CHECK-NEXT:    [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[TMP5]], i32 1
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2
+; CHECK-NEXT:    [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[TMP6]], i32 2
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3
+; CHECK-NEXT:    [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[TMP7]], i32 3
+; CHECK-NEXT:    ret <4 x i32> [[V3]]
+;
+  %p1 = getelementptr inbounds i16, i16* %p0, i64 1
+  %p2 = getelementptr inbounds i16, i16* %p0, i64 2
+  %p3 = getelementptr inbounds i16, i16* %p0, i64 3
+  %i0 = load i16, i16* %p0, align 1
+  %i1 = load i16, i16* %p1, align 1
+  %i2 = load i16, i16* %p2, align 1
+  %i3 = load i16, i16* %p3, align 1
+  %x0 = sext i16 %i0 to i32
+  %x1 = sext i16 %i1 to i32
+  %x2 = sext i16 %i2 to i32
+  %x3 = sext i16 %i3 to i32
+  %v0 = insertelement <4 x i32> undef, i32 %x0, i32 0
+  %v1 = insertelement <4 x i32>   %v0, i32 %x1, i32 1
+  %v2 = insertelement <4 x i32>   %v1, i32 %x2, i32 2
+  %v3 = insertelement <4 x i32>   %v2, i32 %x3, i32 3
+  ret <4 x i32> %v3
+}
+
+define <4 x i64> @loadext_4i16_to_4i64(i16* %p0) {
+; SSE2-LABEL: @loadext_4i16_to_4i64(
+; SSE2-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
+; SSE2-NEXT:    [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
+; SSE2-NEXT:    [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
+; SSE2-NEXT:    [[I0:%.*]] = load i16, i16* [[P0]], align 1
+; SSE2-NEXT:    [[I1:%.*]] = load i16, i16* [[P1]], align 1
+; SSE2-NEXT:    [[I2:%.*]] = load i16, i16* [[P2]], align 1
+; SSE2-NEXT:    [[I3:%.*]] = load i16, i16* [[P3]], align 1
+; SSE2-NEXT:    [[X0:%.*]] = sext i16 [[I0]] to i64
+; SSE2-NEXT:    [[X1:%.*]] = sext i16 [[I1]] to i64
+; SSE2-NEXT:    [[X2:%.*]] = sext i16 [[I2]] to i64
+; SSE2-NEXT:    [[X3:%.*]] = sext i16 [[I3]] to i64
+; SSE2-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[X0]], i32 0
+; SSE2-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[X1]], i32 1
+; SSE2-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2
+; SSE2-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
+; SSE2-NEXT:    ret <4 x i64> [[V3]]
+;
+; SLM-LABEL: @loadext_4i16_to_4i64(
+; SLM-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
+; SLM-NEXT:    [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
+; SLM-NEXT:    [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
+; SLM-NEXT:    [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>*
+; SLM-NEXT:    [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1
+; SLM-NEXT:    [[TMP3:%.*]] = sext <4 x i16> [[TMP2]] to <4 x i64>
+; SLM-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
+; SLM-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
+; SLM-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
+; SLM-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
+; SLM-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
+; SLM-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
+; SLM-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
+; SLM-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
+; SLM-NEXT:    ret <4 x i64> [[V3]]
+;
+; AVX-LABEL: @loadext_4i16_to_4i64(
+; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
+; AVX-NEXT:    [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
+; AVX-NEXT:    [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
+; AVX-NEXT:    [[TMP1:%.*]] = bitcast i16* [[P0]] to <2 x i16>*
+; AVX-NEXT:    [[TMP2:%.*]] = load <2 x i16>, <2 x i16>* [[TMP1]], align 1
+; AVX-NEXT:    [[I2:%.*]] = load i16, i16* [[P2]], align 1
+; AVX-NEXT:    [[I3:%.*]] = load i16, i16* [[P3]], align 1
+; AVX-NEXT:    [[TMP3:%.*]] = sext <2 x i16> [[TMP2]] to <2 x i64>
+; AVX-NEXT:    [[X2:%.*]] = sext i16 [[I2]] to i64
+; AVX-NEXT:    [[X3:%.*]] = sext i16 [[I3]] to i64
+; AVX-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
+; AVX-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
+; AVX-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
+; AVX-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
+; AVX-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2
+; AVX-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
+; AVX-NEXT:    ret <4 x i64> [[V3]]
+;
+  %p1 = getelementptr inbounds i16, i16* %p0, i64 1
+  %p2 = getelementptr inbounds i16, i16* %p0, i64 2
+  %p3 = getelementptr inbounds i16, i16* %p0, i64 3
+  %i0 = load i16, i16* %p0, align 1
+  %i1 = load i16, i16* %p1, align 1
+  %i2 = load i16, i16* %p2, align 1
+  %i3 = load i16, i16* %p3, align 1
+  %x0 = sext i16 %i0 to i64
+  %x1 = sext i16 %i1 to i64
+  %x2 = sext i16 %i2 to i64
+  %x3 = sext i16 %i3 to i64
+  %v0 = insertelement <4 x i64> undef, i64 %x0, i32 0
+  %v1 = insertelement <4 x i64>   %v0, i64 %x1, i32 1
+  %v2 = insertelement <4 x i64>   %v1, i64 %x2, i32 2
+  %v3 = insertelement <4 x i64>   %v2, i64 %x3, i32 3
+  ret <4 x i64> %v3
+}
+
+define <8 x i32> @loadext_8i16_to_8i32(i16* %p0) {
+; CHECK-LABEL: @loadext_8i16_to_8i32(
+; CHECK-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
+; CHECK-NEXT:    [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
+; CHECK-NEXT:    [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
+; CHECK-NEXT:    [[P4:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 4
+; CHECK-NEXT:    [[P5:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 5
+; CHECK-NEXT:    [[P6:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 6
+; CHECK-NEXT:    [[P7:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 7
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16* [[P0]] to <8 x i16>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = sext <8 x i16> [[TMP2]] to <8 x i32>
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <8 x i32> [[TMP3]], i32 0
+; CHECK-NEXT:    [[V0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <8 x i32> [[TMP3]], i32 1
+; CHECK-NEXT:    [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[TMP5]], i32 1
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <8 x i32> [[TMP3]], i32 2
+; CHECK-NEXT:    [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[TMP6]], i32 2
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <8 x i32> [[TMP3]], i32 3
+; CHECK-NEXT:    [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[TMP7]], i32 3
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <8 x i32> [[TMP3]], i32 4
+; CHECK-NEXT:    [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[TMP8]], i32 4
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <8 x i32> [[TMP3]], i32 5
+; CHECK-NEXT:    [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[TMP9]], i32 5
+; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <8 x i32> [[TMP3]], i32 6
+; CHECK-NEXT:    [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[TMP10]], i32 6
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <8 x i32> [[TMP3]], i32 7
+; CHECK-NEXT:    [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[TMP11]], i32 7
+; CHECK-NEXT:    ret <8 x i32> [[V7]]
+;
+  %p1 = getelementptr inbounds i16, i16* %p0, i64 1
+  %p2 = getelementptr inbounds i16, i16* %p0, i64 2
+  %p3 = getelementptr inbounds i16, i16* %p0, i64 3
+  %p4 = getelementptr inbounds i16, i16* %p0, i64 4
+  %p5 = getelementptr inbounds i16, i16* %p0, i64 5
+  %p6 = getelementptr inbounds i16, i16* %p0, i64 6
+  %p7 = getelementptr inbounds i16, i16* %p0, i64 7
+  %i0 = load i16, i16* %p0, align 1
+  %i1 = load i16, i16* %p1, align 1
+  %i2 = load i16, i16* %p2, align 1
+  %i3 = load i16, i16* %p3, align 1
+  %i4 = load i16, i16* %p4, align 1
+  %i5 = load i16, i16* %p5, align 1
+  %i6 = load i16, i16* %p6, align 1
+  %i7 = load i16, i16* %p7, align 1
+  %x0 = sext i16 %i0 to i32
+  %x1 = sext i16 %i1 to i32
+  %x2 = sext i16 %i2 to i32
+  %x3 = sext i16 %i3 to i32
+  %x4 = sext i16 %i4 to i32
+  %x5 = sext i16 %i5 to i32
+  %x6 = sext i16 %i6 to i32
+  %x7 = sext i16 %i7 to i32
+  %v0 = insertelement <8 x i32> undef, i32 %x0, i32 0
+  %v1 = insertelement <8 x i32>   %v0, i32 %x1, i32 1
+  %v2 = insertelement <8 x i32>   %v1, i32 %x2, i32 2
+  %v3 = insertelement <8 x i32>   %v2, i32 %x3, i32 3
+  %v4 = insertelement <8 x i32>   %v3, i32 %x4, i32 4
+  %v5 = insertelement <8 x i32>   %v4, i32 %x5, i32 5
+  %v6 = insertelement <8 x i32>   %v5, i32 %x6, i32 6
+  %v7 = insertelement <8 x i32>   %v6, i32 %x7, i32 7
+  ret <8 x i32> %v7
+}
+
+;
+; vXi32
+;
+
+define <2 x i64> @loadext_2i32_to_2i64(i32* %p0) {
+; SSE2-LABEL: @loadext_2i32_to_2i64(
+; SSE2-NEXT:    [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
+; SSE2-NEXT:    [[I0:%.*]] = load i32, i32* [[P0]], align 1
+; SSE2-NEXT:    [[I1:%.*]] = load i32, i32* [[P1]], align 1
+; SSE2-NEXT:    [[X0:%.*]] = sext i32 [[I0]] to i64
+; SSE2-NEXT:    [[X1:%.*]] = sext i32 [[I1]] to i64
+; SSE2-NEXT:    [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[X0]], i32 0
+; SSE2-NEXT:    [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[X1]], i32 1
+; SSE2-NEXT:    ret <2 x i64> [[V1]]
+;
+; SLM-LABEL: @loadext_2i32_to_2i64(
+; SLM-NEXT:    [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
+; SLM-NEXT:    [[TMP1:%.*]] = bitcast i32* [[P0]] to <2 x i32>*
+; SLM-NEXT:    [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 1
+; SLM-NEXT:    [[TMP3:%.*]] = sext <2 x i32> [[TMP2]] to <2 x i64>
+; SLM-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
+; SLM-NEXT:    [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[TMP4]], i32 0
+; SLM-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
+; SLM-NEXT:    [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1
+; SLM-NEXT:    ret <2 x i64> [[V1]]
+;
+; AVX-LABEL: @loadext_2i32_to_2i64(
+; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
+; AVX-NEXT:    [[TMP1:%.*]] = bitcast i32* [[P0]] to <2 x i32>*
+; AVX-NEXT:    [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 1
+; AVX-NEXT:    [[TMP3:%.*]] = sext <2 x i32> [[TMP2]] to <2 x i64>
+; AVX-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
+; AVX-NEXT:    [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[TMP4]], i32 0
+; AVX-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
+; AVX-NEXT:    [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1
+; AVX-NEXT:    ret <2 x i64> [[V1]]
+;
+  %p1 = getelementptr inbounds i32, i32* %p0, i64 1
+  %i0 = load i32, i32* %p0, align 1
+  %i1 = load i32, i32* %p1, align 1
+  %x0 = sext i32 %i0 to i64
+  %x1 = sext i32 %i1 to i64
+  %v0 = insertelement <2 x i64> undef, i64 %x0, i32 0
+  %v1 = insertelement <2 x i64>   %v0, i64 %x1, i32 1
+  ret <2 x i64> %v1
+}
+
+define <4 x i64> @loadext_4i32_to_4i64(i32* %p0) {
+; SSE2-LABEL: @loadext_4i32_to_4i64(
+; SSE2-NEXT:    [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
+; SSE2-NEXT:    [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2
+; SSE2-NEXT:    [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3
+; SSE2-NEXT:    [[I0:%.*]] = load i32, i32* [[P0]], align 1
+; SSE2-NEXT:    [[I1:%.*]] = load i32, i32* [[P1]], align 1
+; SSE2-NEXT:    [[I2:%.*]] = load i32, i32* [[P2]], align 1
+; SSE2-NEXT:    [[I3:%.*]] = load i32, i32* [[P3]], align 1
+; SSE2-NEXT:    [[X0:%.*]] = sext i32 [[I0]] to i64
+; SSE2-NEXT:    [[X1:%.*]] = sext i32 [[I1]] to i64
+; SSE2-NEXT:    [[X2:%.*]] = sext i32 [[I2]] to i64
+; SSE2-NEXT:    [[X3:%.*]] = sext i32 [[I3]] to i64
+; SSE2-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[X0]], i32 0
+; SSE2-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[X1]], i32 1
+; SSE2-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2
+; SSE2-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
+; SSE2-NEXT:    ret <4 x i64> [[V3]]
+;
+; SLM-LABEL: @loadext_4i32_to_4i64(
+; SLM-NEXT:    [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
+; SLM-NEXT:    [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2
+; SLM-NEXT:    [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3
+; SLM-NEXT:    [[TMP1:%.*]] = bitcast i32* [[P0]] to <4 x i32>*
+; SLM-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 1
+; SLM-NEXT:    [[TMP3:%.*]] = sext <4 x i32> [[TMP2]] to <4 x i64>
+; SLM-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
+; SLM-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
+; SLM-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
+; SLM-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
+; SLM-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
+; SLM-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
+; SLM-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
+; SLM-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
+; SLM-NEXT:    ret <4 x i64> [[V3]]
+;
+; AVX1-LABEL: @loadext_4i32_to_4i64(
+; AVX1-NEXT:    [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
+; AVX1-NEXT:    [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2
+; AVX1-NEXT:    [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3
+; AVX1-NEXT:    [[TMP1:%.*]] = bitcast i32* [[P0]] to <2 x i32>*
+; AVX1-NEXT:    [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 1
+; AVX1-NEXT:    [[I2:%.*]] = load i32, i32* [[P2]], align 1
+; AVX1-NEXT:    [[I3:%.*]] = load i32, i32* [[P3]], align 1
+; AVX1-NEXT:    [[TMP3:%.*]] = sext <2 x i32> [[TMP2]] to <2 x i64>
+; AVX1-NEXT:    [[X2:%.*]] = sext i32 [[I2]] to i64
+; AVX1-NEXT:    [[X3:%.*]] = sext i32 [[I3]] to i64
+; AVX1-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
+; AVX1-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
+; AVX1-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
+; AVX1-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
+; AVX1-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2
+; AVX1-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
+; AVX1-NEXT:    ret <4 x i64> [[V3]]
+;
+; AVX2-LABEL: @loadext_4i32_to_4i64(
+; AVX2-NEXT:    [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
+; AVX2-NEXT:    [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2
+; AVX2-NEXT:    [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3
+; AVX2-NEXT:    [[TMP1:%.*]] = bitcast i32* [[P0]] to <4 x i32>*
+; AVX2-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 1
+; AVX2-NEXT:    [[TMP3:%.*]] = sext <4 x i32> [[TMP2]] to <4 x i64>
+; AVX2-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
+; AVX2-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
+; AVX2-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
+; AVX2-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
+; AVX2-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
+; AVX2-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
+; AVX2-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
+; AVX2-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
+; AVX2-NEXT:    ret <4 x i64> [[V3]]
+;
+; AVX512-LABEL: @loadext_4i32_to_4i64(
+; AVX512-NEXT:    [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
+; AVX512-NEXT:    [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2
+; AVX512-NEXT:    [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3
+; AVX512-NEXT:    [[TMP1:%.*]] = bitcast i32* [[P0]] to <4 x i32>*
+; AVX512-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 1
+; AVX512-NEXT:    [[TMP3:%.*]] = sext <4 x i32> [[TMP2]] to <4 x i64>
+; AVX512-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
+; AVX512-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
+; AVX512-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
+; AVX512-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
+; AVX512-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
+; AVX512-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
+; AVX512-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
+; AVX512-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
+; AVX512-NEXT:    ret <4 x i64> [[V3]]
+;
+  %p1 = getelementptr inbounds i32, i32* %p0, i64 1
+  %p2 = getelementptr inbounds i32, i32* %p0, i64 2
+  %p3 = getelementptr inbounds i32, i32* %p0, i64 3
+  %i0 = load i32, i32* %p0, align 1
+  %i1 = load i32, i32* %p1, align 1
+  %i2 = load i32, i32* %p2, align 1
+  %i3 = load i32, i32* %p3, align 1
+  %x0 = sext i32 %i0 to i64
+  %x1 = sext i32 %i1 to i64
+  %x2 = sext i32 %i2 to i64
+  %x3 = sext i32 %i3 to i64
+  %v0 = insertelement <4 x i64> undef, i64 %x0, i32 0
+  %v1 = insertelement <4 x i64>   %v0, i64 %x1, i32 1
+  %v2 = insertelement <4 x i64>   %v1, i64 %x2, i32 2
+  %v3 = insertelement <4 x i64>   %v2, i64 %x3, i32 3
+  ret <4 x i64> %v3
+}

Added: llvm/trunk/test/Transforms/SLPVectorizer/X86/shift-ashr.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/shift-ashr.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/shift-ashr.ll (added)
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/shift-ashr.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,914 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -mtriple=x86_64-unknown -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX1
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512F
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=-prefer-256-bit -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512BW
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=+prefer-256-bit -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=bdver4 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=XOP
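+;
+; Each RUN line exercises a different target configuration: the shared CHECK
+; prefix matches output common to every target, while the per-target prefixes
+; (SSE, AVX1/AVX2, AVX512F/AVX512BW, XOP) select the expected IR for that CPU.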
+
+@a64 = common global [8 x i64] zeroinitializer, align 64
+@b64 = common global [8 x i64] zeroinitializer, align 64
+@c64 = common global [8 x i64] zeroinitializer, align 64
+@a32 = common global [16 x i32] zeroinitializer, align 64
+@b32 = common global [16 x i32] zeroinitializer, align 64
+@c32 = common global [16 x i32] zeroinitializer, align 64
+@a16 = common global [32 x i16] zeroinitializer, align 64
+@b16 = common global [32 x i16] zeroinitializer, align 64
+@c16 = common global [32 x i16] zeroinitializer, align 64
+@a8  = common global [64 x i8] zeroinitializer, align 64
+@b8  = common global [64 x i8] zeroinitializer, align 64
+@c8  = common global [64 x i8] zeroinitializer, align 64
+
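+; Each test below loads scalar lanes from the @a and @b arrays, shifts them,
+; and stores the results to the matching @c array; the SLP vectorizer is
+; expected to merge the scalar ashr chains into vector shifts of whatever
+; width the target CPU supports, as the per-target CHECK lines verify.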
+define void @ashr_v8i64() {
+; SSE-LABEL: @ashr_v8i64(
+; SSE-NEXT:    [[A0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8
+; SSE-NEXT:    [[A1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8
+; SSE-NEXT:    [[A2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8
+; SSE-NEXT:    [[A3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8
+; SSE-NEXT:    [[A4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8
+; SSE-NEXT:    [[A5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8
+; SSE-NEXT:    [[A6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8
+; SSE-NEXT:    [[A7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8
+; SSE-NEXT:    [[B0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8
+; SSE-NEXT:    [[B1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8
+; SSE-NEXT:    [[B2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8
+; SSE-NEXT:    [[B3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8
+; SSE-NEXT:    [[B4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8
+; SSE-NEXT:    [[B5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8
+; SSE-NEXT:    [[B6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8
+; SSE-NEXT:    [[B7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8
+; SSE-NEXT:    [[R0:%.*]] = ashr i64 [[A0]], [[B0]]
+; SSE-NEXT:    [[R1:%.*]] = ashr i64 [[A1]], [[B1]]
+; SSE-NEXT:    [[R2:%.*]] = ashr i64 [[A2]], [[B2]]
+; SSE-NEXT:    [[R3:%.*]] = ashr i64 [[A3]], [[B3]]
+; SSE-NEXT:    [[R4:%.*]] = ashr i64 [[A4]], [[B4]]
+; SSE-NEXT:    [[R5:%.*]] = ashr i64 [[A5]], [[B5]]
+; SSE-NEXT:    [[R6:%.*]] = ashr i64 [[A6]], [[B6]]
+; SSE-NEXT:    [[R7:%.*]] = ashr i64 [[A7]], [[B7]]
+; SSE-NEXT:    store i64 [[R0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8
+; SSE-NEXT:    store i64 [[R1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8
+; SSE-NEXT:    store i64 [[R2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8
+; SSE-NEXT:    store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8
+; SSE-NEXT:    store i64 [[R4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8
+; SSE-NEXT:    store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8
+; SSE-NEXT:    store i64 [[R6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8
+; SSE-NEXT:    store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8
+; SSE-NEXT:    ret void
+;
+; AVX1-LABEL: @ashr_v8i64(
+; AVX1-NEXT:    [[A0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8
+; AVX1-NEXT:    [[A1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8
+; AVX1-NEXT:    [[A2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8
+; AVX1-NEXT:    [[A3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8
+; AVX1-NEXT:    [[A4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8
+; AVX1-NEXT:    [[A5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8
+; AVX1-NEXT:    [[A6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8
+; AVX1-NEXT:    [[A7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8
+; AVX1-NEXT:    [[B0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8
+; AVX1-NEXT:    [[B1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8
+; AVX1-NEXT:    [[B2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8
+; AVX1-NEXT:    [[B3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8
+; AVX1-NEXT:    [[B4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8
+; AVX1-NEXT:    [[B5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8
+; AVX1-NEXT:    [[B6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8
+; AVX1-NEXT:    [[B7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8
+; AVX1-NEXT:    [[R0:%.*]] = ashr i64 [[A0]], [[B0]]
+; AVX1-NEXT:    [[R1:%.*]] = ashr i64 [[A1]], [[B1]]
+; AVX1-NEXT:    [[R2:%.*]] = ashr i64 [[A2]], [[B2]]
+; AVX1-NEXT:    [[R3:%.*]] = ashr i64 [[A3]], [[B3]]
+; AVX1-NEXT:    [[R4:%.*]] = ashr i64 [[A4]], [[B4]]
+; AVX1-NEXT:    [[R5:%.*]] = ashr i64 [[A5]], [[B5]]
+; AVX1-NEXT:    [[R6:%.*]] = ashr i64 [[A6]], [[B6]]
+; AVX1-NEXT:    [[R7:%.*]] = ashr i64 [[A7]], [[B7]]
+; AVX1-NEXT:    store i64 [[R0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8
+; AVX1-NEXT:    store i64 [[R1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8
+; AVX1-NEXT:    store i64 [[R2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8
+; AVX1-NEXT:    store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8
+; AVX1-NEXT:    store i64 [[R4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8
+; AVX1-NEXT:    store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8
+; AVX1-NEXT:    store i64 [[R6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8
+; AVX1-NEXT:    store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8
+; AVX1-NEXT:    ret void
+;
+; AVX2-LABEL: @ashr_v8i64(
+; AVX2-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8
+; AVX2-NEXT:    [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX2-NEXT:    [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8
+; AVX2-NEXT:    [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX2-NEXT:    [[TMP5:%.*]] = ashr <4 x i64> [[TMP1]], [[TMP3]]
+; AVX2-NEXT:    [[TMP6:%.*]] = ashr <4 x i64> [[TMP2]], [[TMP4]]
+; AVX2-NEXT:    store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8
+; AVX2-NEXT:    store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX2-NEXT:    ret void
+;
+; AVX512-LABEL: @ashr_v8i64(
+; AVX512-NEXT:    [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @a64 to <8 x i64>*), align 8
+; AVX512-NEXT:    [[TMP2:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @b64 to <8 x i64>*), align 8
+; AVX512-NEXT:    [[TMP3:%.*]] = ashr <8 x i64> [[TMP1]], [[TMP2]]
+; AVX512-NEXT:    store <8 x i64> [[TMP3]], <8 x i64>* bitcast ([8 x i64]* @c64 to <8 x i64>*), align 8
+; AVX512-NEXT:    ret void
+;
+; XOP-LABEL: @ashr_v8i64(
+; XOP-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8
+; XOP-NEXT:    [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8
+; XOP-NEXT:    [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8
+; XOP-NEXT:    [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8
+; XOP-NEXT:    [[TMP5:%.*]] = ashr <4 x i64> [[TMP1]], [[TMP3]]
+; XOP-NEXT:    [[TMP6:%.*]] = ashr <4 x i64> [[TMP2]], [[TMP4]]
+; XOP-NEXT:    store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8
+; XOP-NEXT:    store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8
+; XOP-NEXT:    ret void
+;
+  %a0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8
+  %a1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8
+  %a2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8
+  %a3 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8
+  %a4 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8
+  %a5 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8
+  %a6 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8
+  %a7 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8
+  %b0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8
+  %b1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8
+  %b2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8
+  %b3 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8
+  %b4 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8
+  %b5 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8
+  %b6 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8
+  %b7 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8
+  %r0 = ashr i64 %a0, %b0
+  %r1 = ashr i64 %a1, %b1
+  %r2 = ashr i64 %a2, %b2
+  %r3 = ashr i64 %a3, %b3
+  %r4 = ashr i64 %a4, %b4
+  %r5 = ashr i64 %a5, %b5
+  %r6 = ashr i64 %a6, %b6
+  %r7 = ashr i64 %a7, %b7
+  store i64 %r0, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8
+  store i64 %r1, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8
+  store i64 %r2, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8
+  store i64 %r3, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8
+  store i64 %r4, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8
+  store i64 %r5, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8
+  store i64 %r6, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8
+  store i64 %r7, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8
+  ret void
+}
+
+define void @ashr_v16i32() {
+; SSE-LABEL: @ashr_v16i32(
+; SSE-NEXT:    [[A0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 0), align 4
+; SSE-NEXT:    [[A1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 1), align 4
+; SSE-NEXT:    [[A2:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 2), align 4
+; SSE-NEXT:    [[A3:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 3), align 4
+; SSE-NEXT:    [[A4:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4), align 4
+; SSE-NEXT:    [[A5:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 5), align 4
+; SSE-NEXT:    [[A6:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 6), align 4
+; SSE-NEXT:    [[A7:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 7), align 4
+; SSE-NEXT:    [[A8:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8), align 4
+; SSE-NEXT:    [[A9:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 9), align 4
+; SSE-NEXT:    [[A10:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 10), align 4
+; SSE-NEXT:    [[A11:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 11), align 4
+; SSE-NEXT:    [[A12:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12), align 4
+; SSE-NEXT:    [[A13:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 13), align 4
+; SSE-NEXT:    [[A14:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 14), align 4
+; SSE-NEXT:    [[A15:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 15), align 4
+; SSE-NEXT:    [[B0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 0), align 4
+; SSE-NEXT:    [[B1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 1), align 4
+; SSE-NEXT:    [[B2:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 2), align 4
+; SSE-NEXT:    [[B3:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 3), align 4
+; SSE-NEXT:    [[B4:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4), align 4
+; SSE-NEXT:    [[B5:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 5), align 4
+; SSE-NEXT:    [[B6:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 6), align 4
+; SSE-NEXT:    [[B7:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 7), align 4
+; SSE-NEXT:    [[B8:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8), align 4
+; SSE-NEXT:    [[B9:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 9), align 4
+; SSE-NEXT:    [[B10:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 10), align 4
+; SSE-NEXT:    [[B11:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 11), align 4
+; SSE-NEXT:    [[B12:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12), align 4
+; SSE-NEXT:    [[B13:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 13), align 4
+; SSE-NEXT:    [[B14:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 14), align 4
+; SSE-NEXT:    [[B15:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 15), align 4
+; SSE-NEXT:    [[R0:%.*]] = ashr i32 [[A0]], [[B0]]
+; SSE-NEXT:    [[R1:%.*]] = ashr i32 [[A1]], [[B1]]
+; SSE-NEXT:    [[R2:%.*]] = ashr i32 [[A2]], [[B2]]
+; SSE-NEXT:    [[R3:%.*]] = ashr i32 [[A3]], [[B3]]
+; SSE-NEXT:    [[R4:%.*]] = ashr i32 [[A4]], [[B4]]
+; SSE-NEXT:    [[R5:%.*]] = ashr i32 [[A5]], [[B5]]
+; SSE-NEXT:    [[R6:%.*]] = ashr i32 [[A6]], [[B6]]
+; SSE-NEXT:    [[R7:%.*]] = ashr i32 [[A7]], [[B7]]
+; SSE-NEXT:    [[R8:%.*]] = ashr i32 [[A8]], [[B8]]
+; SSE-NEXT:    [[R9:%.*]] = ashr i32 [[A9]], [[B9]]
+; SSE-NEXT:    [[R10:%.*]] = ashr i32 [[A10]], [[B10]]
+; SSE-NEXT:    [[R11:%.*]] = ashr i32 [[A11]], [[B11]]
+; SSE-NEXT:    [[R12:%.*]] = ashr i32 [[A12]], [[B12]]
+; SSE-NEXT:    [[R13:%.*]] = ashr i32 [[A13]], [[B13]]
+; SSE-NEXT:    [[R14:%.*]] = ashr i32 [[A14]], [[B14]]
+; SSE-NEXT:    [[R15:%.*]] = ashr i32 [[A15]], [[B15]]
+; SSE-NEXT:    store i32 [[R0]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 0), align 4
+; SSE-NEXT:    store i32 [[R1]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 1), align 4
+; SSE-NEXT:    store i32 [[R2]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 2), align 4
+; SSE-NEXT:    store i32 [[R3]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 3), align 4
+; SSE-NEXT:    store i32 [[R4]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4), align 4
+; SSE-NEXT:    store i32 [[R5]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 5), align 4
+; SSE-NEXT:    store i32 [[R6]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 6), align 4
+; SSE-NEXT:    store i32 [[R7]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 7), align 4
+; SSE-NEXT:    store i32 [[R8]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8), align 4
+; SSE-NEXT:    store i32 [[R9]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 9), align 4
+; SSE-NEXT:    store i32 [[R10]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 10), align 4
+; SSE-NEXT:    store i32 [[R11]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 11), align 4
+; SSE-NEXT:    store i32 [[R12]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12), align 4
+; SSE-NEXT:    store i32 [[R13]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 13), align 4
+; SSE-NEXT:    store i32 [[R14]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 14), align 4
+; SSE-NEXT:    store i32 [[R15]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 15), align 4
+; SSE-NEXT:    ret void
+;
+; AVX1-LABEL: @ashr_v16i32(
+; AVX1-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4
+; AVX1-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4
+; AVX1-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4
+; AVX1-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4
+; AVX1-NEXT:    [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4
+; AVX1-NEXT:    [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4
+; AVX1-NEXT:    [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4
+; AVX1-NEXT:    [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4
+; AVX1-NEXT:    [[TMP9:%.*]] = ashr <4 x i32> [[TMP1]], [[TMP5]]
+; AVX1-NEXT:    [[TMP10:%.*]] = ashr <4 x i32> [[TMP2]], [[TMP6]]
+; AVX1-NEXT:    [[TMP11:%.*]] = ashr <4 x i32> [[TMP3]], [[TMP7]]
+; AVX1-NEXT:    [[TMP12:%.*]] = ashr <4 x i32> [[TMP4]], [[TMP8]]
+; AVX1-NEXT:    store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4
+; AVX1-NEXT:    store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4
+; AVX1-NEXT:    store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4
+; AVX1-NEXT:    store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4
+; AVX1-NEXT:    ret void
+;
+; AVX2-LABEL: @ashr_v16i32(
+; AVX2-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4
+; AVX2-NEXT:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4
+; AVX2-NEXT:    [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4
+; AVX2-NEXT:    [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4
+; AVX2-NEXT:    [[TMP5:%.*]] = ashr <8 x i32> [[TMP1]], [[TMP3]]
+; AVX2-NEXT:    [[TMP6:%.*]] = ashr <8 x i32> [[TMP2]], [[TMP4]]
+; AVX2-NEXT:    store <8 x i32> [[TMP5]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4
+; AVX2-NEXT:    store <8 x i32> [[TMP6]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4
+; AVX2-NEXT:    ret void
+;
+; AVX512-LABEL: @ashr_v16i32(
+; AVX512-NEXT:    [[TMP1:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([16 x i32]* @a32 to <16 x i32>*), align 4
+; AVX512-NEXT:    [[TMP2:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([16 x i32]* @b32 to <16 x i32>*), align 4
+; AVX512-NEXT:    [[TMP3:%.*]] = ashr <16 x i32> [[TMP1]], [[TMP2]]
+; AVX512-NEXT:    store <16 x i32> [[TMP3]], <16 x i32>* bitcast ([16 x i32]* @c32 to <16 x i32>*), align 4
+; AVX512-NEXT:    ret void
+;
+; XOP-LABEL: @ashr_v16i32(
+; XOP-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4
+; XOP-NEXT:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4
+; XOP-NEXT:    [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4
+; XOP-NEXT:    [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4
+; XOP-NEXT:    [[TMP5:%.*]] = ashr <8 x i32> [[TMP1]], [[TMP3]]
+; XOP-NEXT:    [[TMP6:%.*]] = ashr <8 x i32> [[TMP2]], [[TMP4]]
+; XOP-NEXT:    store <8 x i32> [[TMP5]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4
+; XOP-NEXT:    store <8 x i32> [[TMP6]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4
+; XOP-NEXT:    ret void
+;
+  %a0  = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 0 ), align 4
+  %a1  = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 1 ), align 4
+  %a2  = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 2 ), align 4
+  %a3  = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 3 ), align 4
+  %a4  = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4 ), align 4
+  %a5  = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 5 ), align 4
+  %a6  = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 6 ), align 4
+  %a7  = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 7 ), align 4
+  %a8  = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8 ), align 4
+  %a9  = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 9 ), align 4
+  %a10 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 10), align 4
+  %a11 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 11), align 4
+  %a12 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12), align 4
+  %a13 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 13), align 4
+  %a14 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 14), align 4
+  %a15 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 15), align 4
+  %b0  = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 0 ), align 4
+  %b1  = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 1 ), align 4
+  %b2  = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 2 ), align 4
+  %b3  = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 3 ), align 4
+  %b4  = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4 ), align 4
+  %b5  = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 5 ), align 4
+  %b6  = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 6 ), align 4
+  %b7  = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 7 ), align 4
+  %b8  = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8 ), align 4
+  %b9  = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 9 ), align 4
+  %b10 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 10), align 4
+  %b11 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 11), align 4
+  %b12 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12), align 4
+  %b13 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 13), align 4
+  %b14 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 14), align 4
+  %b15 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 15), align 4
+  %r0  = ashr i32 %a0 , %b0
+  %r1  = ashr i32 %a1 , %b1
+  %r2  = ashr i32 %a2 , %b2
+  %r3  = ashr i32 %a3 , %b3
+  %r4  = ashr i32 %a4 , %b4
+  %r5  = ashr i32 %a5 , %b5
+  %r6  = ashr i32 %a6 , %b6
+  %r7  = ashr i32 %a7 , %b7
+  %r8  = ashr i32 %a8 , %b8
+  %r9  = ashr i32 %a9 , %b9
+  %r10 = ashr i32 %a10, %b10
+  %r11 = ashr i32 %a11, %b11
+  %r12 = ashr i32 %a12, %b12
+  %r13 = ashr i32 %a13, %b13
+  %r14 = ashr i32 %a14, %b14
+  %r15 = ashr i32 %a15, %b15
+  store i32 %r0 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 0 ), align 4
+  store i32 %r1 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 1 ), align 4
+  store i32 %r2 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 2 ), align 4
+  store i32 %r3 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 3 ), align 4
+  store i32 %r4 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4 ), align 4
+  store i32 %r5 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 5 ), align 4
+  store i32 %r6 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 6 ), align 4
+  store i32 %r7 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 7 ), align 4
+  store i32 %r8 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8 ), align 4
+  store i32 %r9 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 9 ), align 4
+  store i32 %r10, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 10), align 4
+  store i32 %r11, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 11), align 4
+  store i32 %r12, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12), align 4
+  store i32 %r13, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 13), align 4
+  store i32 %r14, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 14), align 4
+  store i32 %r15, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 15), align 4
+  ret void
+}
+
+define void @ashr_v32i16() {
+; SSE-LABEL: @ashr_v32i16(
+; SSE-NEXT:    [[A0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 0), align 2
+; SSE-NEXT:    [[A1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 1), align 2
+; SSE-NEXT:    [[A2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 2), align 2
+; SSE-NEXT:    [[A3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 3), align 2
+; SSE-NEXT:    [[A4:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 4), align 2
+; SSE-NEXT:    [[A5:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 5), align 2
+; SSE-NEXT:    [[A6:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 6), align 2
+; SSE-NEXT:    [[A7:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 7), align 2
+; SSE-NEXT:    [[A8:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8), align 2
+; SSE-NEXT:    [[A9:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 9), align 2
+; SSE-NEXT:    [[A10:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 10), align 2
+; SSE-NEXT:    [[A11:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 11), align 2
+; SSE-NEXT:    [[A12:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 12), align 2
+; SSE-NEXT:    [[A13:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 13), align 2
+; SSE-NEXT:    [[A14:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 14), align 2
+; SSE-NEXT:    [[A15:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 15), align 2
+; SSE-NEXT:    [[A16:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16), align 2
+; SSE-NEXT:    [[A17:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 17), align 2
+; SSE-NEXT:    [[A18:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 18), align 2
+; SSE-NEXT:    [[A19:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 19), align 2
+; SSE-NEXT:    [[A20:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 20), align 2
+; SSE-NEXT:    [[A21:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 21), align 2
+; SSE-NEXT:    [[A22:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 22), align 2
+; SSE-NEXT:    [[A23:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 23), align 2
+; SSE-NEXT:    [[A24:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24), align 2
+; SSE-NEXT:    [[A25:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 25), align 2
+; SSE-NEXT:    [[A26:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 26), align 2
+; SSE-NEXT:    [[A27:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 27), align 2
+; SSE-NEXT:    [[A28:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 28), align 2
+; SSE-NEXT:    [[A29:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 29), align 2
+; SSE-NEXT:    [[A30:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 30), align 2
+; SSE-NEXT:    [[A31:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 31), align 2
+; SSE-NEXT:    [[B0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 0), align 2
+; SSE-NEXT:    [[B1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 1), align 2
+; SSE-NEXT:    [[B2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 2), align 2
+; SSE-NEXT:    [[B3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 3), align 2
+; SSE-NEXT:    [[B4:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 4), align 2
+; SSE-NEXT:    [[B5:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 5), align 2
+; SSE-NEXT:    [[B6:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 6), align 2
+; SSE-NEXT:    [[B7:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 7), align 2
+; SSE-NEXT:    [[B8:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8), align 2
+; SSE-NEXT:    [[B9:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 9), align 2
+; SSE-NEXT:    [[B10:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 10), align 2
+; SSE-NEXT:    [[B11:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 11), align 2
+; SSE-NEXT:    [[B12:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 12), align 2
+; SSE-NEXT:    [[B13:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 13), align 2
+; SSE-NEXT:    [[B14:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 14), align 2
+; SSE-NEXT:    [[B15:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 15), align 2
+; SSE-NEXT:    [[B16:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16), align 2
+; SSE-NEXT:    [[B17:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 17), align 2
+; SSE-NEXT:    [[B18:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 18), align 2
+; SSE-NEXT:    [[B19:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 19), align 2
+; SSE-NEXT:    [[B20:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 20), align 2
+; SSE-NEXT:    [[B21:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 21), align 2
+; SSE-NEXT:    [[B22:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 22), align 2
+; SSE-NEXT:    [[B23:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 23), align 2
+; SSE-NEXT:    [[B24:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24), align 2
+; SSE-NEXT:    [[B25:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 25), align 2
+; SSE-NEXT:    [[B26:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 26), align 2
+; SSE-NEXT:    [[B27:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 27), align 2
+; SSE-NEXT:    [[B28:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 28), align 2
+; SSE-NEXT:    [[B29:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 29), align 2
+; SSE-NEXT:    [[B30:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 30), align 2
+; SSE-NEXT:    [[B31:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 31), align 2
+; SSE-NEXT:    [[R0:%.*]] = ashr i16 [[A0]], [[B0]]
+; SSE-NEXT:    [[R1:%.*]] = ashr i16 [[A1]], [[B1]]
+; SSE-NEXT:    [[R2:%.*]] = ashr i16 [[A2]], [[B2]]
+; SSE-NEXT:    [[R3:%.*]] = ashr i16 [[A3]], [[B3]]
+; SSE-NEXT:    [[R4:%.*]] = ashr i16 [[A4]], [[B4]]
+; SSE-NEXT:    [[R5:%.*]] = ashr i16 [[A5]], [[B5]]
+; SSE-NEXT:    [[R6:%.*]] = ashr i16 [[A6]], [[B6]]
+; SSE-NEXT:    [[R7:%.*]] = ashr i16 [[A7]], [[B7]]
+; SSE-NEXT:    [[R8:%.*]] = ashr i16 [[A8]], [[B8]]
+; SSE-NEXT:    [[R9:%.*]] = ashr i16 [[A9]], [[B9]]
+; SSE-NEXT:    [[R10:%.*]] = ashr i16 [[A10]], [[B10]]
+; SSE-NEXT:    [[R11:%.*]] = ashr i16 [[A11]], [[B11]]
+; SSE-NEXT:    [[R12:%.*]] = ashr i16 [[A12]], [[B12]]
+; SSE-NEXT:    [[R13:%.*]] = ashr i16 [[A13]], [[B13]]
+; SSE-NEXT:    [[R14:%.*]] = ashr i16 [[A14]], [[B14]]
+; SSE-NEXT:    [[R15:%.*]] = ashr i16 [[A15]], [[B15]]
+; SSE-NEXT:    [[R16:%.*]] = ashr i16 [[A16]], [[B16]]
+; SSE-NEXT:    [[R17:%.*]] = ashr i16 [[A17]], [[B17]]
+; SSE-NEXT:    [[R18:%.*]] = ashr i16 [[A18]], [[B18]]
+; SSE-NEXT:    [[R19:%.*]] = ashr i16 [[A19]], [[B19]]
+; SSE-NEXT:    [[R20:%.*]] = ashr i16 [[A20]], [[B20]]
+; SSE-NEXT:    [[R21:%.*]] = ashr i16 [[A21]], [[B21]]
+; SSE-NEXT:    [[R22:%.*]] = ashr i16 [[A22]], [[B22]]
+; SSE-NEXT:    [[R23:%.*]] = ashr i16 [[A23]], [[B23]]
+; SSE-NEXT:    [[R24:%.*]] = ashr i16 [[A24]], [[B24]]
+; SSE-NEXT:    [[R25:%.*]] = ashr i16 [[A25]], [[B25]]
+; SSE-NEXT:    [[R26:%.*]] = ashr i16 [[A26]], [[B26]]
+; SSE-NEXT:    [[R27:%.*]] = ashr i16 [[A27]], [[B27]]
+; SSE-NEXT:    [[R28:%.*]] = ashr i16 [[A28]], [[B28]]
+; SSE-NEXT:    [[R29:%.*]] = ashr i16 [[A29]], [[B29]]
+; SSE-NEXT:    [[R30:%.*]] = ashr i16 [[A30]], [[B30]]
+; SSE-NEXT:    [[R31:%.*]] = ashr i16 [[A31]], [[B31]]
+; SSE-NEXT:    store i16 [[R0]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 0), align 2
+; SSE-NEXT:    store i16 [[R1]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 1), align 2
+; SSE-NEXT:    store i16 [[R2]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 2), align 2
+; SSE-NEXT:    store i16 [[R3]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 3), align 2
+; SSE-NEXT:    store i16 [[R4]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 4), align 2
+; SSE-NEXT:    store i16 [[R5]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 5), align 2
+; SSE-NEXT:    store i16 [[R6]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 6), align 2
+; SSE-NEXT:    store i16 [[R7]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 7), align 2
+; SSE-NEXT:    store i16 [[R8]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8), align 2
+; SSE-NEXT:    store i16 [[R9]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 9), align 2
+; SSE-NEXT:    store i16 [[R10]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 10), align 2
+; SSE-NEXT:    store i16 [[R11]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 11), align 2
+; SSE-NEXT:    store i16 [[R12]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 12), align 2
+; SSE-NEXT:    store i16 [[R13]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 13), align 2
+; SSE-NEXT:    store i16 [[R14]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 14), align 2
+; SSE-NEXT:    store i16 [[R15]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 15), align 2
+; SSE-NEXT:    store i16 [[R16]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16), align 2
+; SSE-NEXT:    store i16 [[R17]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 17), align 2
+; SSE-NEXT:    store i16 [[R18]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 18), align 2
+; SSE-NEXT:    store i16 [[R19]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 19), align 2
+; SSE-NEXT:    store i16 [[R20]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 20), align 2
+; SSE-NEXT:    store i16 [[R21]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 21), align 2
+; SSE-NEXT:    store i16 [[R22]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 22), align 2
+; SSE-NEXT:    store i16 [[R23]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 23), align 2
+; SSE-NEXT:    store i16 [[R24]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24), align 2
+; SSE-NEXT:    store i16 [[R25]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 25), align 2
+; SSE-NEXT:    store i16 [[R26]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 26), align 2
+; SSE-NEXT:    store i16 [[R27]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 27), align 2
+; SSE-NEXT:    store i16 [[R28]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 28), align 2
+; SSE-NEXT:    store i16 [[R29]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 29), align 2
+; SSE-NEXT:    store i16 [[R30]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 30), align 2
+; SSE-NEXT:    store i16 [[R31]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 31), align 2
+; SSE-NEXT:    ret void
+;
+; AVX-LABEL: @ashr_v32i16(
+; AVX-NEXT:    [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2
+; AVX-NEXT:    [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX-NEXT:    [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2
+; AVX-NEXT:    [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX-NEXT:    [[TMP5:%.*]] = ashr <16 x i16> [[TMP1]], [[TMP3]]
+; AVX-NEXT:    [[TMP6:%.*]] = ashr <16 x i16> [[TMP2]], [[TMP4]]
+; AVX-NEXT:    store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2
+; AVX-NEXT:    store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX-NEXT:    ret void
+;
+; AVX512-LABEL: @ashr_v32i16(
+; AVX512-NEXT:    [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2
+; AVX512-NEXT:    [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX512-NEXT:    [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2
+; AVX512-NEXT:    [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX512-NEXT:    [[TMP5:%.*]] = ashr <16 x i16> [[TMP1]], [[TMP3]]
+; AVX512-NEXT:    [[TMP6:%.*]] = ashr <16 x i16> [[TMP2]], [[TMP4]]
+; AVX512-NEXT:    store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2
+; AVX512-NEXT:    store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX512-NEXT:    ret void
+;
+; XOP-LABEL: @ashr_v32i16(
+; XOP-NEXT:    [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2
+; XOP-NEXT:    [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2
+; XOP-NEXT:    [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2
+; XOP-NEXT:    [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2
+; XOP-NEXT:    [[TMP5:%.*]] = ashr <16 x i16> [[TMP1]], [[TMP3]]
+; XOP-NEXT:    [[TMP6:%.*]] = ashr <16 x i16> [[TMP2]], [[TMP4]]
+; XOP-NEXT:    store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2
+; XOP-NEXT:    store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2
+; XOP-NEXT:    ret void
+;
+  %a0  = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 0 ), align 2
+  %a1  = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 1 ), align 2
+  %a2  = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 2 ), align 2
+  %a3  = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 3 ), align 2
+  %a4  = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 4 ), align 2
+  %a5  = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 5 ), align 2
+  %a6  = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 6 ), align 2
+  %a7  = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 7 ), align 2
+  %a8  = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8 ), align 2
+  %a9  = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 9 ), align 2
+  %a10 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 10), align 2
+  %a11 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 11), align 2
+  %a12 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 12), align 2
+  %a13 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 13), align 2
+  %a14 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 14), align 2
+  %a15 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 15), align 2
+  %a16 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16), align 2
+  %a17 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 17), align 2
+  %a18 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 18), align 2
+  %a19 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 19), align 2
+  %a20 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 20), align 2
+  %a21 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 21), align 2
+  %a22 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 22), align 2
+  %a23 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 23), align 2
+  %a24 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24), align 2
+  %a25 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 25), align 2
+  %a26 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 26), align 2
+  %a27 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 27), align 2
+  %a28 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 28), align 2
+  %a29 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 29), align 2
+  %a30 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 30), align 2
+  %a31 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 31), align 2
+  %b0  = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 0 ), align 2
+  %b1  = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 1 ), align 2
+  %b2  = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 2 ), align 2
+  %b3  = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 3 ), align 2
+  %b4  = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 4 ), align 2
+  %b5  = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 5 ), align 2
+  %b6  = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 6 ), align 2
+  %b7  = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 7 ), align 2
+  %b8  = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8 ), align 2
+  %b9  = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 9 ), align 2
+  %b10 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 10), align 2
+  %b11 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 11), align 2
+  %b12 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 12), align 2
+  %b13 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 13), align 2
+  %b14 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 14), align 2
+  %b15 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 15), align 2
+  %b16 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16), align 2
+  %b17 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 17), align 2
+  %b18 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 18), align 2
+  %b19 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 19), align 2
+  %b20 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 20), align 2
+  %b21 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 21), align 2
+  %b22 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 22), align 2
+  %b23 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 23), align 2
+  %b24 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24), align 2
+  %b25 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 25), align 2
+  %b26 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 26), align 2
+  %b27 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 27), align 2
+  %b28 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 28), align 2
+  %b29 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 29), align 2
+  %b30 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 30), align 2
+  %b31 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 31), align 2
+  %r0  = ashr i16 %a0 , %b0
+  %r1  = ashr i16 %a1 , %b1
+  %r2  = ashr i16 %a2 , %b2
+  %r3  = ashr i16 %a3 , %b3
+  %r4  = ashr i16 %a4 , %b4
+  %r5  = ashr i16 %a5 , %b5
+  %r6  = ashr i16 %a6 , %b6
+  %r7  = ashr i16 %a7 , %b7
+  %r8  = ashr i16 %a8 , %b8
+  %r9  = ashr i16 %a9 , %b9
+  %r10 = ashr i16 %a10, %b10
+  %r11 = ashr i16 %a11, %b11
+  %r12 = ashr i16 %a12, %b12
+  %r13 = ashr i16 %a13, %b13
+  %r14 = ashr i16 %a14, %b14
+  %r15 = ashr i16 %a15, %b15
+  %r16 = ashr i16 %a16, %b16
+  %r17 = ashr i16 %a17, %b17
+  %r18 = ashr i16 %a18, %b18
+  %r19 = ashr i16 %a19, %b19
+  %r20 = ashr i16 %a20, %b20
+  %r21 = ashr i16 %a21, %b21
+  %r22 = ashr i16 %a22, %b22
+  %r23 = ashr i16 %a23, %b23
+  %r24 = ashr i16 %a24, %b24
+  %r25 = ashr i16 %a25, %b25
+  %r26 = ashr i16 %a26, %b26
+  %r27 = ashr i16 %a27, %b27
+  %r28 = ashr i16 %a28, %b28
+  %r29 = ashr i16 %a29, %b29
+  %r30 = ashr i16 %a30, %b30
+  %r31 = ashr i16 %a31, %b31
+  store i16 %r0 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 0 ), align 2
+  store i16 %r1 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 1 ), align 2
+  store i16 %r2 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 2 ), align 2
+  store i16 %r3 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 3 ), align 2
+  store i16 %r4 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 4 ), align 2
+  store i16 %r5 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 5 ), align 2
+  store i16 %r6 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 6 ), align 2
+  store i16 %r7 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 7 ), align 2
+  store i16 %r8 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8 ), align 2
+  store i16 %r9 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 9 ), align 2
+  store i16 %r10, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 10), align 2
+  store i16 %r11, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 11), align 2
+  store i16 %r12, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 12), align 2
+  store i16 %r13, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 13), align 2
+  store i16 %r14, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 14), align 2
+  store i16 %r15, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 15), align 2
+  store i16 %r16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16), align 2
+  store i16 %r17, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 17), align 2
+  store i16 %r18, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 18), align 2
+  store i16 %r19, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 19), align 2
+  store i16 %r20, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 20), align 2
+  store i16 %r21, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 21), align 2
+  store i16 %r22, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 22), align 2
+  store i16 %r23, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 23), align 2
+  store i16 %r24, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24), align 2
+  store i16 %r25, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 25), align 2
+  store i16 %r26, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 26), align 2
+  store i16 %r27, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 27), align 2
+  store i16 %r28, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 28), align 2
+  store i16 %r29, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 29), align 2
+  store i16 %r30, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 30), align 2
+  store i16 %r31, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 31), align 2
+  ret void
+}
+
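+; The v32i16 checks above expect the SSE configuration to stay fully scalar,
+; while the AVX, AVX512 and XOP configurations each reduce the 32 scalar
+; ashr ops to two <16 x i16> ashr instructions. The v64i8 case below shares
+; a single set of checks: four <16 x i8> ashr ops on every configuration.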
+define void @ashr_v64i8() {
+; CHECK-LABEL: @ashr_v64i8(
+; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1
+; CHECK-NEXT:    [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1
+; CHECK-NEXT:    [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1
+; CHECK-NEXT:    [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1
+; CHECK-NEXT:    [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1
+; CHECK-NEXT:    [[TMP9:%.*]] = ashr <16 x i8> [[TMP1]], [[TMP5]]
+; CHECK-NEXT:    [[TMP10:%.*]] = ashr <16 x i8> [[TMP2]], [[TMP6]]
+; CHECK-NEXT:    [[TMP11:%.*]] = ashr <16 x i8> [[TMP3]], [[TMP7]]
+; CHECK-NEXT:    [[TMP12:%.*]] = ashr <16 x i8> [[TMP4]], [[TMP8]]
+; CHECK-NEXT:    store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1
+; CHECK-NEXT:    store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1
+; CHECK-NEXT:    store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1
+; CHECK-NEXT:    store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1
+; CHECK-NEXT:    ret void
+;
+  %a0  = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 0 ), align 1
+  %a1  = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 1 ), align 1
+  %a2  = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 2 ), align 1
+  %a3  = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 3 ), align 1
+  %a4  = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 4 ), align 1
+  %a5  = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 5 ), align 1
+  %a6  = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 6 ), align 1
+  %a7  = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 7 ), align 1
+  %a8  = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 8 ), align 1
+  %a9  = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 9 ), align 1
+  %a10 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 10), align 1
+  %a11 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 11), align 1
+  %a12 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 12), align 1
+  %a13 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 13), align 1
+  %a14 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 14), align 1
+  %a15 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 15), align 1
+  %a16 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16), align 1
+  %a17 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 17), align 1
+  %a18 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 18), align 1
+  %a19 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 19), align 1
+  %a20 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 20), align 1
+  %a21 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 21), align 1
+  %a22 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 22), align 1
+  %a23 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 23), align 1
+  %a24 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 24), align 1
+  %a25 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 25), align 1
+  %a26 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 26), align 1
+  %a27 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 27), align 1
+  %a28 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 28), align 1
+  %a29 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 29), align 1
+  %a30 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 30), align 1
+  %a31 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 31), align 1
+  %a32 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32), align 1
+  %a33 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 33), align 1
+  %a34 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 34), align 1
+  %a35 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 35), align 1
+  %a36 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 36), align 1
+  %a37 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 37), align 1
+  %a38 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 38), align 1
+  %a39 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 39), align 1
+  %a40 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 40), align 1
+  %a41 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 41), align 1
+  %a42 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 42), align 1
+  %a43 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 43), align 1
+  %a44 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 44), align 1
+  %a45 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 45), align 1
+  %a46 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 46), align 1
+  %a47 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 47), align 1
+  %a48 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48), align 1
+  %a49 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 49), align 1
+  %a50 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 50), align 1
+  %a51 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 51), align 1
+  %a52 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 52), align 1
+  %a53 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 53), align 1
+  %a54 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 54), align 1
+  %a55 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 55), align 1
+  %a56 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 56), align 1
+  %a57 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 57), align 1
+  %a58 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 58), align 1
+  %a59 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 59), align 1
+  %a60 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 60), align 1
+  %a61 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 61), align 1
+  %a62 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 62), align 1
+  %a63 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 63), align 1
+  %b0  = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 0 ), align 1
+  %b1  = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 1 ), align 1
+  %b2  = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 2 ), align 1
+  %b3  = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 3 ), align 1
+  %b4  = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 4 ), align 1
+  %b5  = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 5 ), align 1
+  %b6  = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 6 ), align 1
+  %b7  = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 7 ), align 1
+  %b8  = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 8 ), align 1
+  %b9  = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 9 ), align 1
+  %b10 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 10), align 1
+  %b11 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 11), align 1
+  %b12 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 12), align 1
+  %b13 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 13), align 1
+  %b14 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 14), align 1
+  %b15 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 15), align 1
+  %b16 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16), align 1
+  %b17 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 17), align 1
+  %b18 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 18), align 1
+  %b19 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 19), align 1
+  %b20 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 20), align 1
+  %b21 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 21), align 1
+  %b22 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 22), align 1
+  %b23 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 23), align 1
+  %b24 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 24), align 1
+  %b25 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 25), align 1
+  %b26 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 26), align 1
+  %b27 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 27), align 1
+  %b28 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 28), align 1
+  %b29 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 29), align 1
+  %b30 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 30), align 1
+  %b31 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 31), align 1
+  %b32 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32), align 1
+  %b33 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 33), align 1
+  %b34 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 34), align 1
+  %b35 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 35), align 1
+  %b36 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 36), align 1
+  %b37 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 37), align 1
+  %b38 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 38), align 1
+  %b39 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 39), align 1
+  %b40 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 40), align 1
+  %b41 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 41), align 1
+  %b42 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 42), align 1
+  %b43 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 43), align 1
+  %b44 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 44), align 1
+  %b45 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 45), align 1
+  %b46 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 46), align 1
+  %b47 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 47), align 1
+  %b48 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48), align 1
+  %b49 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 49), align 1
+  %b50 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 50), align 1
+  %b51 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 51), align 1
+  %b52 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 52), align 1
+  %b53 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 53), align 1
+  %b54 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 54), align 1
+  %b55 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 55), align 1
+  %b56 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 56), align 1
+  %b57 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 57), align 1
+  %b58 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 58), align 1
+  %b59 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 59), align 1
+  %b60 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 60), align 1
+  %b61 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 61), align 1
+  %b62 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 62), align 1
+  %b63 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 63), align 1
+  %r0  = ashr i8 %a0 , %b0
+  %r1  = ashr i8 %a1 , %b1
+  %r2  = ashr i8 %a2 , %b2
+  %r3  = ashr i8 %a3 , %b3
+  %r4  = ashr i8 %a4 , %b4
+  %r5  = ashr i8 %a5 , %b5
+  %r6  = ashr i8 %a6 , %b6
+  %r7  = ashr i8 %a7 , %b7
+  %r8  = ashr i8 %a8 , %b8
+  %r9  = ashr i8 %a9 , %b9
+  %r10 = ashr i8 %a10, %b10
+  %r11 = ashr i8 %a11, %b11
+  %r12 = ashr i8 %a12, %b12
+  %r13 = ashr i8 %a13, %b13
+  %r14 = ashr i8 %a14, %b14
+  %r15 = ashr i8 %a15, %b15
+  %r16 = ashr i8 %a16, %b16
+  %r17 = ashr i8 %a17, %b17
+  %r18 = ashr i8 %a18, %b18
+  %r19 = ashr i8 %a19, %b19
+  %r20 = ashr i8 %a20, %b20
+  %r21 = ashr i8 %a21, %b21
+  %r22 = ashr i8 %a22, %b22
+  %r23 = ashr i8 %a23, %b23
+  %r24 = ashr i8 %a24, %b24
+  %r25 = ashr i8 %a25, %b25
+  %r26 = ashr i8 %a26, %b26
+  %r27 = ashr i8 %a27, %b27
+  %r28 = ashr i8 %a28, %b28
+  %r29 = ashr i8 %a29, %b29
+  %r30 = ashr i8 %a30, %b30
+  %r31 = ashr i8 %a31, %b31
+  %r32 = ashr i8 %a32, %b32
+  %r33 = ashr i8 %a33, %b33
+  %r34 = ashr i8 %a34, %b34
+  %r35 = ashr i8 %a35, %b35
+  %r36 = ashr i8 %a36, %b36
+  %r37 = ashr i8 %a37, %b37
+  %r38 = ashr i8 %a38, %b38
+  %r39 = ashr i8 %a39, %b39
+  %r40 = ashr i8 %a40, %b40
+  %r41 = ashr i8 %a41, %b41
+  %r42 = ashr i8 %a42, %b42
+  %r43 = ashr i8 %a43, %b43
+  %r44 = ashr i8 %a44, %b44
+  %r45 = ashr i8 %a45, %b45
+  %r46 = ashr i8 %a46, %b46
+  %r47 = ashr i8 %a47, %b47
+  %r48 = ashr i8 %a48, %b48
+  %r49 = ashr i8 %a49, %b49
+  %r50 = ashr i8 %a50, %b50
+  %r51 = ashr i8 %a51, %b51
+  %r52 = ashr i8 %a52, %b52
+  %r53 = ashr i8 %a53, %b53
+  %r54 = ashr i8 %a54, %b54
+  %r55 = ashr i8 %a55, %b55
+  %r56 = ashr i8 %a56, %b56
+  %r57 = ashr i8 %a57, %b57
+  %r58 = ashr i8 %a58, %b58
+  %r59 = ashr i8 %a59, %b59
+  %r60 = ashr i8 %a60, %b60
+  %r61 = ashr i8 %a61, %b61
+  %r62 = ashr i8 %a62, %b62
+  %r63 = ashr i8 %a63, %b63
+  store i8 %r0 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 0 ), align 1
+  store i8 %r1 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 1 ), align 1
+  store i8 %r2 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 2 ), align 1
+  store i8 %r3 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 3 ), align 1
+  store i8 %r4 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 4 ), align 1
+  store i8 %r5 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 5 ), align 1
+  store i8 %r6 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 6 ), align 1
+  store i8 %r7 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 7 ), align 1
+  store i8 %r8 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 8 ), align 1
+  store i8 %r9 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 9 ), align 1
+  store i8 %r10, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 10), align 1
+  store i8 %r11, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 11), align 1
+  store i8 %r12, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 12), align 1
+  store i8 %r13, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 13), align 1
+  store i8 %r14, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 14), align 1
+  store i8 %r15, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 15), align 1
+  store i8 %r16, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16), align 1
+  store i8 %r17, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 17), align 1
+  store i8 %r18, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 18), align 1
+  store i8 %r19, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 19), align 1
+  store i8 %r20, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 20), align 1
+  store i8 %r21, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 21), align 1
+  store i8 %r22, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 22), align 1
+  store i8 %r23, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 23), align 1
+  store i8 %r24, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 24), align 1
+  store i8 %r25, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 25), align 1
+  store i8 %r26, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 26), align 1
+  store i8 %r27, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 27), align 1
+  store i8 %r28, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 28), align 1
+  store i8 %r29, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 29), align 1
+  store i8 %r30, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 30), align 1
+  store i8 %r31, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 31), align 1
+  store i8 %r32, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32), align 1
+  store i8 %r33, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 33), align 1
+  store i8 %r34, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 34), align 1
+  store i8 %r35, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 35), align 1
+  store i8 %r36, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 36), align 1
+  store i8 %r37, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 37), align 1
+  store i8 %r38, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 38), align 1
+  store i8 %r39, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 39), align 1
+  store i8 %r40, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 40), align 1
+  store i8 %r41, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 41), align 1
+  store i8 %r42, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 42), align 1
+  store i8 %r43, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 43), align 1
+  store i8 %r44, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 44), align 1
+  store i8 %r45, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 45), align 1
+  store i8 %r46, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 46), align 1
+  store i8 %r47, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 47), align 1
+  store i8 %r48, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48), align 1
+  store i8 %r49, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 49), align 1
+  store i8 %r50, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 50), align 1
+  store i8 %r51, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 51), align 1
+  store i8 %r52, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 52), align 1
+  store i8 %r53, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 53), align 1
+  store i8 %r54, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 54), align 1
+  store i8 %r55, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 55), align 1
+  store i8 %r56, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 56), align 1
+  store i8 %r57, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 57), align 1
+  store i8 %r58, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 58), align 1
+  store i8 %r59, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 59), align 1
+  store i8 %r60, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 60), align 1
+  store i8 %r61, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 61), align 1
+  store i8 %r62, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 62), align 1
+  store i8 %r63, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 63), align 1
+  ret void
+}
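+; For reference, the whole file exercises one pattern; a minimal sketch of
+; the vectorized form the checks expect for the first sixteen lanes of the
+; v64i8 case (the %va/%vb/%vr names are illustrative, not part of the test):
+;   %va = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*)
+;   %vb = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*)
+;   %vr = ashr <16 x i8> %va, %vb
+;   store <16 x i8> %vr, <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*)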

Added: llvm/trunk/test/Transforms/SLPVectorizer/X86/shift-lshr.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/shift-lshr.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/shift-lshr.ll (added)
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/shift-lshr.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,863 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -mtriple=x86_64-unknown -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX1
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512F
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=-prefer-256-bit -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512BW
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=+prefer-256-bit -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=bdver4 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=XOP
+
+@a64 = common global [8 x i64] zeroinitializer, align 64
+@b64 = common global [8 x i64] zeroinitializer, align 64
+@c64 = common global [8 x i64] zeroinitializer, align 64
+@a32 = common global [16 x i32] zeroinitializer, align 64
+@b32 = common global [16 x i32] zeroinitializer, align 64
+@c32 = common global [16 x i32] zeroinitializer, align 64
+@a16 = common global [32 x i16] zeroinitializer, align 64
+@b16 = common global [32 x i16] zeroinitializer, align 64
+@c16 = common global [32 x i16] zeroinitializer, align 64
+@a8  = common global [64 x i8] zeroinitializer, align 64
+@b8  = common global [64 x i8] zeroinitializer, align 64
+@c8  = common global [64 x i8] zeroinitializer, align 64
+
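+; Same layout as shift-ashr.ll: each function loads every element of the
+; matching @aNN and @bNN arrays, applies a scalar lshr per lane, and stores
+; the results to @cNN; the checks verify how wide the SLP vectorizer goes
+; for each RUN configuration (SSE, AVX1, AVX2, AVX512F, AVX512BW, XOP).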
+define void @lshr_v8i64() {
+; SSE-LABEL: @lshr_v8i64(
+; SSE-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8
+; SSE-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8
+; SSE-NEXT:    [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8
+; SSE-NEXT:    [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8
+; SSE-NEXT:    [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8
+; SSE-NEXT:    [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8
+; SSE-NEXT:    [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8
+; SSE-NEXT:    [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8
+; SSE-NEXT:    [[TMP9:%.*]] = lshr <2 x i64> [[TMP1]], [[TMP5]]
+; SSE-NEXT:    [[TMP10:%.*]] = lshr <2 x i64> [[TMP2]], [[TMP6]]
+; SSE-NEXT:    [[TMP11:%.*]] = lshr <2 x i64> [[TMP3]], [[TMP7]]
+; SSE-NEXT:    [[TMP12:%.*]] = lshr <2 x i64> [[TMP4]], [[TMP8]]
+; SSE-NEXT:    store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8
+; SSE-NEXT:    store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8
+; SSE-NEXT:    store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8
+; SSE-NEXT:    store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8
+; SSE-NEXT:    ret void
+;
+; AVX1-LABEL: @lshr_v8i64(
+; AVX1-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8
+; AVX1-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8
+; AVX1-NEXT:    [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8
+; AVX1-NEXT:    [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8
+; AVX1-NEXT:    [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8
+; AVX1-NEXT:    [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8
+; AVX1-NEXT:    [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8
+; AVX1-NEXT:    [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8
+; AVX1-NEXT:    [[TMP9:%.*]] = lshr <2 x i64> [[TMP1]], [[TMP5]]
+; AVX1-NEXT:    [[TMP10:%.*]] = lshr <2 x i64> [[TMP2]], [[TMP6]]
+; AVX1-NEXT:    [[TMP11:%.*]] = lshr <2 x i64> [[TMP3]], [[TMP7]]
+; AVX1-NEXT:    [[TMP12:%.*]] = lshr <2 x i64> [[TMP4]], [[TMP8]]
+; AVX1-NEXT:    store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8
+; AVX1-NEXT:    store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8
+; AVX1-NEXT:    store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8
+; AVX1-NEXT:    store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8
+; AVX1-NEXT:    ret void
+;
+; AVX2-LABEL: @lshr_v8i64(
+; AVX2-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8
+; AVX2-NEXT:    [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX2-NEXT:    [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8
+; AVX2-NEXT:    [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX2-NEXT:    [[TMP5:%.*]] = lshr <4 x i64> [[TMP1]], [[TMP3]]
+; AVX2-NEXT:    [[TMP6:%.*]] = lshr <4 x i64> [[TMP2]], [[TMP4]]
+; AVX2-NEXT:    store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8
+; AVX2-NEXT:    store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX2-NEXT:    ret void
+;
+; AVX512-LABEL: @lshr_v8i64(
+; AVX512-NEXT:    [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @a64 to <8 x i64>*), align 8
+; AVX512-NEXT:    [[TMP2:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @b64 to <8 x i64>*), align 8
+; AVX512-NEXT:    [[TMP3:%.*]] = lshr <8 x i64> [[TMP1]], [[TMP2]]
+; AVX512-NEXT:    store <8 x i64> [[TMP3]], <8 x i64>* bitcast ([8 x i64]* @c64 to <8 x i64>*), align 8
+; AVX512-NEXT:    ret void
+;
+; XOP-LABEL: @lshr_v8i64(
+; XOP-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8
+; XOP-NEXT:    [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8
+; XOP-NEXT:    [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8
+; XOP-NEXT:    [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8
+; XOP-NEXT:    [[TMP5:%.*]] = lshr <4 x i64> [[TMP1]], [[TMP3]]
+; XOP-NEXT:    [[TMP6:%.*]] = lshr <4 x i64> [[TMP2]], [[TMP4]]
+; XOP-NEXT:    store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8
+; XOP-NEXT:    store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8
+; XOP-NEXT:    ret void
+;
+  %a0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8
+  %a1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8
+  %a2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8
+  %a3 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8
+  %a4 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8
+  %a5 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8
+  %a6 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8
+  %a7 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8
+  %b0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8
+  %b1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8
+  %b2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8
+  %b3 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8
+  %b4 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8
+  %b5 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8
+  %b6 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8
+  %b7 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8
+  %r0 = lshr i64 %a0, %b0
+  %r1 = lshr i64 %a1, %b1
+  %r2 = lshr i64 %a2, %b2
+  %r3 = lshr i64 %a3, %b3
+  %r4 = lshr i64 %a4, %b4
+  %r5 = lshr i64 %a5, %b5
+  %r6 = lshr i64 %a6, %b6
+  %r7 = lshr i64 %a7, %b7
+  store i64 %r0, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8
+  store i64 %r1, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8
+  store i64 %r2, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8
+  store i64 %r3, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8
+  store i64 %r4, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8
+  store i64 %r5, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8
+  store i64 %r6, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8
+  store i64 %r7, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8
+  ret void
+}
+
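+; lshr_v16i32: the SSE configuration is expected to stay scalar, AVX and
+; XOP split the work into two <8 x i32> lshr ops, and AVX512 handles all
+; sixteen lanes with a single <16 x i32> lshr.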
+define void @lshr_v16i32() {
+; SSE-LABEL: @lshr_v16i32(
+; SSE-NEXT:    [[A0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 0), align 4
+; SSE-NEXT:    [[A1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 1), align 4
+; SSE-NEXT:    [[A2:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 2), align 4
+; SSE-NEXT:    [[A3:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 3), align 4
+; SSE-NEXT:    [[A4:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4), align 4
+; SSE-NEXT:    [[A5:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 5), align 4
+; SSE-NEXT:    [[A6:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 6), align 4
+; SSE-NEXT:    [[A7:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 7), align 4
+; SSE-NEXT:    [[A8:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8), align 4
+; SSE-NEXT:    [[A9:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 9), align 4
+; SSE-NEXT:    [[A10:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 10), align 4
+; SSE-NEXT:    [[A11:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 11), align 4
+; SSE-NEXT:    [[A12:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12), align 4
+; SSE-NEXT:    [[A13:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 13), align 4
+; SSE-NEXT:    [[A14:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 14), align 4
+; SSE-NEXT:    [[A15:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 15), align 4
+; SSE-NEXT:    [[B0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 0), align 4
+; SSE-NEXT:    [[B1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 1), align 4
+; SSE-NEXT:    [[B2:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 2), align 4
+; SSE-NEXT:    [[B3:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 3), align 4
+; SSE-NEXT:    [[B4:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4), align 4
+; SSE-NEXT:    [[B5:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 5), align 4
+; SSE-NEXT:    [[B6:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 6), align 4
+; SSE-NEXT:    [[B7:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 7), align 4
+; SSE-NEXT:    [[B8:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8), align 4
+; SSE-NEXT:    [[B9:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 9), align 4
+; SSE-NEXT:    [[B10:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 10), align 4
+; SSE-NEXT:    [[B11:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 11), align 4
+; SSE-NEXT:    [[B12:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12), align 4
+; SSE-NEXT:    [[B13:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 13), align 4
+; SSE-NEXT:    [[B14:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 14), align 4
+; SSE-NEXT:    [[B15:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 15), align 4
+; SSE-NEXT:    [[R0:%.*]] = lshr i32 [[A0]], [[B0]]
+; SSE-NEXT:    [[R1:%.*]] = lshr i32 [[A1]], [[B1]]
+; SSE-NEXT:    [[R2:%.*]] = lshr i32 [[A2]], [[B2]]
+; SSE-NEXT:    [[R3:%.*]] = lshr i32 [[A3]], [[B3]]
+; SSE-NEXT:    [[R4:%.*]] = lshr i32 [[A4]], [[B4]]
+; SSE-NEXT:    [[R5:%.*]] = lshr i32 [[A5]], [[B5]]
+; SSE-NEXT:    [[R6:%.*]] = lshr i32 [[A6]], [[B6]]
+; SSE-NEXT:    [[R7:%.*]] = lshr i32 [[A7]], [[B7]]
+; SSE-NEXT:    [[R8:%.*]] = lshr i32 [[A8]], [[B8]]
+; SSE-NEXT:    [[R9:%.*]] = lshr i32 [[A9]], [[B9]]
+; SSE-NEXT:    [[R10:%.*]] = lshr i32 [[A10]], [[B10]]
+; SSE-NEXT:    [[R11:%.*]] = lshr i32 [[A11]], [[B11]]
+; SSE-NEXT:    [[R12:%.*]] = lshr i32 [[A12]], [[B12]]
+; SSE-NEXT:    [[R13:%.*]] = lshr i32 [[A13]], [[B13]]
+; SSE-NEXT:    [[R14:%.*]] = lshr i32 [[A14]], [[B14]]
+; SSE-NEXT:    [[R15:%.*]] = lshr i32 [[A15]], [[B15]]
+; SSE-NEXT:    store i32 [[R0]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 0), align 4
+; SSE-NEXT:    store i32 [[R1]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 1), align 4
+; SSE-NEXT:    store i32 [[R2]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 2), align 4
+; SSE-NEXT:    store i32 [[R3]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 3), align 4
+; SSE-NEXT:    store i32 [[R4]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4), align 4
+; SSE-NEXT:    store i32 [[R5]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 5), align 4
+; SSE-NEXT:    store i32 [[R6]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 6), align 4
+; SSE-NEXT:    store i32 [[R7]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 7), align 4
+; SSE-NEXT:    store i32 [[R8]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8), align 4
+; SSE-NEXT:    store i32 [[R9]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 9), align 4
+; SSE-NEXT:    store i32 [[R10]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 10), align 4
+; SSE-NEXT:    store i32 [[R11]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 11), align 4
+; SSE-NEXT:    store i32 [[R12]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12), align 4
+; SSE-NEXT:    store i32 [[R13]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 13), align 4
+; SSE-NEXT:    store i32 [[R14]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 14), align 4
+; SSE-NEXT:    store i32 [[R15]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 15), align 4
+; SSE-NEXT:    ret void
+;
+; AVX-LABEL: @lshr_v16i32(
+; AVX-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4
+; AVX-NEXT:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4
+; AVX-NEXT:    [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4
+; AVX-NEXT:    [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4
+; AVX-NEXT:    [[TMP5:%.*]] = lshr <8 x i32> [[TMP1]], [[TMP3]]
+; AVX-NEXT:    [[TMP6:%.*]] = lshr <8 x i32> [[TMP2]], [[TMP4]]
+; AVX-NEXT:    store <8 x i32> [[TMP5]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4
+; AVX-NEXT:    store <8 x i32> [[TMP6]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4
+; AVX-NEXT:    ret void
+;
+; AVX512-LABEL: @lshr_v16i32(
+; AVX512-NEXT:    [[TMP1:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([16 x i32]* @a32 to <16 x i32>*), align 4
+; AVX512-NEXT:    [[TMP2:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([16 x i32]* @b32 to <16 x i32>*), align 4
+; AVX512-NEXT:    [[TMP3:%.*]] = lshr <16 x i32> [[TMP1]], [[TMP2]]
+; AVX512-NEXT:    store <16 x i32> [[TMP3]], <16 x i32>* bitcast ([16 x i32]* @c32 to <16 x i32>*), align 4
+; AVX512-NEXT:    ret void
+;
+; XOP-LABEL: @lshr_v16i32(
+; XOP-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4
+; XOP-NEXT:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4
+; XOP-NEXT:    [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4
+; XOP-NEXT:    [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4
+; XOP-NEXT:    [[TMP5:%.*]] = lshr <8 x i32> [[TMP1]], [[TMP3]]
+; XOP-NEXT:    [[TMP6:%.*]] = lshr <8 x i32> [[TMP2]], [[TMP4]]
+; XOP-NEXT:    store <8 x i32> [[TMP5]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4
+; XOP-NEXT:    store <8 x i32> [[TMP6]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4
+; XOP-NEXT:    ret void
+;
+  %a0  = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 0 ), align 4
+  %a1  = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 1 ), align 4
+  %a2  = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 2 ), align 4
+  %a3  = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 3 ), align 4
+  %a4  = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4 ), align 4
+  %a5  = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 5 ), align 4
+  %a6  = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 6 ), align 4
+  %a7  = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 7 ), align 4
+  %a8  = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8 ), align 4
+  %a9  = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 9 ), align 4
+  %a10 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 10), align 4
+  %a11 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 11), align 4
+  %a12 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12), align 4
+  %a13 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 13), align 4
+  %a14 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 14), align 4
+  %a15 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 15), align 4
+  %b0  = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 0 ), align 4
+  %b1  = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 1 ), align 4
+  %b2  = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 2 ), align 4
+  %b3  = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 3 ), align 4
+  %b4  = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4 ), align 4
+  %b5  = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 5 ), align 4
+  %b6  = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 6 ), align 4
+  %b7  = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 7 ), align 4
+  %b8  = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8 ), align 4
+  %b9  = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 9 ), align 4
+  %b10 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 10), align 4
+  %b11 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 11), align 4
+  %b12 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12), align 4
+  %b13 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 13), align 4
+  %b14 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 14), align 4
+  %b15 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 15), align 4
+  %r0  = lshr i32 %a0 , %b0
+  %r1  = lshr i32 %a1 , %b1
+  %r2  = lshr i32 %a2 , %b2
+  %r3  = lshr i32 %a3 , %b3
+  %r4  = lshr i32 %a4 , %b4
+  %r5  = lshr i32 %a5 , %b5
+  %r6  = lshr i32 %a6 , %b6
+  %r7  = lshr i32 %a7 , %b7
+  %r8  = lshr i32 %a8 , %b8
+  %r9  = lshr i32 %a9 , %b9
+  %r10 = lshr i32 %a10, %b10
+  %r11 = lshr i32 %a11, %b11
+  %r12 = lshr i32 %a12, %b12
+  %r13 = lshr i32 %a13, %b13
+  %r14 = lshr i32 %a14, %b14
+  %r15 = lshr i32 %a15, %b15
+  store i32 %r0 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 0 ), align 4
+  store i32 %r1 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 1 ), align 4
+  store i32 %r2 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 2 ), align 4
+  store i32 %r3 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 3 ), align 4
+  store i32 %r4 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4 ), align 4
+  store i32 %r5 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 5 ), align 4
+  store i32 %r6 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 6 ), align 4
+  store i32 %r7 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 7 ), align 4
+  store i32 %r8 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8 ), align 4
+  store i32 %r9 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 9 ), align 4
+  store i32 %r10, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 10), align 4
+  store i32 %r11, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 11), align 4
+  store i32 %r12, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12), align 4
+  store i32 %r13, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 13), align 4
+  store i32 %r14, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 14), align 4
+  store i32 %r15, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 15), align 4
+  ret void
+}
+
+define void @lshr_v32i16() {
+; SSE-LABEL: @lshr_v32i16(
+; SSE-NEXT:    [[A0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 0), align 2
+; SSE-NEXT:    [[A1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 1), align 2
+; SSE-NEXT:    [[A2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 2), align 2
+; SSE-NEXT:    [[A3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 3), align 2
+; SSE-NEXT:    [[A4:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 4), align 2
+; SSE-NEXT:    [[A5:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 5), align 2
+; SSE-NEXT:    [[A6:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 6), align 2
+; SSE-NEXT:    [[A7:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 7), align 2
+; SSE-NEXT:    [[A8:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8), align 2
+; SSE-NEXT:    [[A9:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 9), align 2
+; SSE-NEXT:    [[A10:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 10), align 2
+; SSE-NEXT:    [[A11:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 11), align 2
+; SSE-NEXT:    [[A12:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 12), align 2
+; SSE-NEXT:    [[A13:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 13), align 2
+; SSE-NEXT:    [[A14:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 14), align 2
+; SSE-NEXT:    [[A15:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 15), align 2
+; SSE-NEXT:    [[A16:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16), align 2
+; SSE-NEXT:    [[A17:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 17), align 2
+; SSE-NEXT:    [[A18:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 18), align 2
+; SSE-NEXT:    [[A19:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 19), align 2
+; SSE-NEXT:    [[A20:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 20), align 2
+; SSE-NEXT:    [[A21:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 21), align 2
+; SSE-NEXT:    [[A22:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 22), align 2
+; SSE-NEXT:    [[A23:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 23), align 2
+; SSE-NEXT:    [[A24:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24), align 2
+; SSE-NEXT:    [[A25:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 25), align 2
+; SSE-NEXT:    [[A26:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 26), align 2
+; SSE-NEXT:    [[A27:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 27), align 2
+; SSE-NEXT:    [[A28:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 28), align 2
+; SSE-NEXT:    [[A29:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 29), align 2
+; SSE-NEXT:    [[A30:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 30), align 2
+; SSE-NEXT:    [[A31:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 31), align 2
+; SSE-NEXT:    [[B0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 0), align 2
+; SSE-NEXT:    [[B1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 1), align 2
+; SSE-NEXT:    [[B2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 2), align 2
+; SSE-NEXT:    [[B3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 3), align 2
+; SSE-NEXT:    [[B4:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 4), align 2
+; SSE-NEXT:    [[B5:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 5), align 2
+; SSE-NEXT:    [[B6:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 6), align 2
+; SSE-NEXT:    [[B7:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 7), align 2
+; SSE-NEXT:    [[B8:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8), align 2
+; SSE-NEXT:    [[B9:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 9), align 2
+; SSE-NEXT:    [[B10:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 10), align 2
+; SSE-NEXT:    [[B11:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 11), align 2
+; SSE-NEXT:    [[B12:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 12), align 2
+; SSE-NEXT:    [[B13:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 13), align 2
+; SSE-NEXT:    [[B14:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 14), align 2
+; SSE-NEXT:    [[B15:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 15), align 2
+; SSE-NEXT:    [[B16:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16), align 2
+; SSE-NEXT:    [[B17:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 17), align 2
+; SSE-NEXT:    [[B18:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 18), align 2
+; SSE-NEXT:    [[B19:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 19), align 2
+; SSE-NEXT:    [[B20:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 20), align 2
+; SSE-NEXT:    [[B21:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 21), align 2
+; SSE-NEXT:    [[B22:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 22), align 2
+; SSE-NEXT:    [[B23:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 23), align 2
+; SSE-NEXT:    [[B24:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24), align 2
+; SSE-NEXT:    [[B25:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 25), align 2
+; SSE-NEXT:    [[B26:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 26), align 2
+; SSE-NEXT:    [[B27:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 27), align 2
+; SSE-NEXT:    [[B28:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 28), align 2
+; SSE-NEXT:    [[B29:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 29), align 2
+; SSE-NEXT:    [[B30:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 30), align 2
+; SSE-NEXT:    [[B31:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 31), align 2
+; SSE-NEXT:    [[R0:%.*]] = lshr i16 [[A0]], [[B0]]
+; SSE-NEXT:    [[R1:%.*]] = lshr i16 [[A1]], [[B1]]
+; SSE-NEXT:    [[R2:%.*]] = lshr i16 [[A2]], [[B2]]
+; SSE-NEXT:    [[R3:%.*]] = lshr i16 [[A3]], [[B3]]
+; SSE-NEXT:    [[R4:%.*]] = lshr i16 [[A4]], [[B4]]
+; SSE-NEXT:    [[R5:%.*]] = lshr i16 [[A5]], [[B5]]
+; SSE-NEXT:    [[R6:%.*]] = lshr i16 [[A6]], [[B6]]
+; SSE-NEXT:    [[R7:%.*]] = lshr i16 [[A7]], [[B7]]
+; SSE-NEXT:    [[R8:%.*]] = lshr i16 [[A8]], [[B8]]
+; SSE-NEXT:    [[R9:%.*]] = lshr i16 [[A9]], [[B9]]
+; SSE-NEXT:    [[R10:%.*]] = lshr i16 [[A10]], [[B10]]
+; SSE-NEXT:    [[R11:%.*]] = lshr i16 [[A11]], [[B11]]
+; SSE-NEXT:    [[R12:%.*]] = lshr i16 [[A12]], [[B12]]
+; SSE-NEXT:    [[R13:%.*]] = lshr i16 [[A13]], [[B13]]
+; SSE-NEXT:    [[R14:%.*]] = lshr i16 [[A14]], [[B14]]
+; SSE-NEXT:    [[R15:%.*]] = lshr i16 [[A15]], [[B15]]
+; SSE-NEXT:    [[R16:%.*]] = lshr i16 [[A16]], [[B16]]
+; SSE-NEXT:    [[R17:%.*]] = lshr i16 [[A17]], [[B17]]
+; SSE-NEXT:    [[R18:%.*]] = lshr i16 [[A18]], [[B18]]
+; SSE-NEXT:    [[R19:%.*]] = lshr i16 [[A19]], [[B19]]
+; SSE-NEXT:    [[R20:%.*]] = lshr i16 [[A20]], [[B20]]
+; SSE-NEXT:    [[R21:%.*]] = lshr i16 [[A21]], [[B21]]
+; SSE-NEXT:    [[R22:%.*]] = lshr i16 [[A22]], [[B22]]
+; SSE-NEXT:    [[R23:%.*]] = lshr i16 [[A23]], [[B23]]
+; SSE-NEXT:    [[R24:%.*]] = lshr i16 [[A24]], [[B24]]
+; SSE-NEXT:    [[R25:%.*]] = lshr i16 [[A25]], [[B25]]
+; SSE-NEXT:    [[R26:%.*]] = lshr i16 [[A26]], [[B26]]
+; SSE-NEXT:    [[R27:%.*]] = lshr i16 [[A27]], [[B27]]
+; SSE-NEXT:    [[R28:%.*]] = lshr i16 [[A28]], [[B28]]
+; SSE-NEXT:    [[R29:%.*]] = lshr i16 [[A29]], [[B29]]
+; SSE-NEXT:    [[R30:%.*]] = lshr i16 [[A30]], [[B30]]
+; SSE-NEXT:    [[R31:%.*]] = lshr i16 [[A31]], [[B31]]
+; SSE-NEXT:    store i16 [[R0]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 0), align 2
+; SSE-NEXT:    store i16 [[R1]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 1), align 2
+; SSE-NEXT:    store i16 [[R2]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 2), align 2
+; SSE-NEXT:    store i16 [[R3]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 3), align 2
+; SSE-NEXT:    store i16 [[R4]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 4), align 2
+; SSE-NEXT:    store i16 [[R5]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 5), align 2
+; SSE-NEXT:    store i16 [[R6]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 6), align 2
+; SSE-NEXT:    store i16 [[R7]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 7), align 2
+; SSE-NEXT:    store i16 [[R8]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8), align 2
+; SSE-NEXT:    store i16 [[R9]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 9), align 2
+; SSE-NEXT:    store i16 [[R10]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 10), align 2
+; SSE-NEXT:    store i16 [[R11]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 11), align 2
+; SSE-NEXT:    store i16 [[R12]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 12), align 2
+; SSE-NEXT:    store i16 [[R13]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 13), align 2
+; SSE-NEXT:    store i16 [[R14]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 14), align 2
+; SSE-NEXT:    store i16 [[R15]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 15), align 2
+; SSE-NEXT:    store i16 [[R16]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16), align 2
+; SSE-NEXT:    store i16 [[R17]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 17), align 2
+; SSE-NEXT:    store i16 [[R18]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 18), align 2
+; SSE-NEXT:    store i16 [[R19]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 19), align 2
+; SSE-NEXT:    store i16 [[R20]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 20), align 2
+; SSE-NEXT:    store i16 [[R21]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 21), align 2
+; SSE-NEXT:    store i16 [[R22]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 22), align 2
+; SSE-NEXT:    store i16 [[R23]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 23), align 2
+; SSE-NEXT:    store i16 [[R24]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24), align 2
+; SSE-NEXT:    store i16 [[R25]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 25), align 2
+; SSE-NEXT:    store i16 [[R26]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 26), align 2
+; SSE-NEXT:    store i16 [[R27]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 27), align 2
+; SSE-NEXT:    store i16 [[R28]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 28), align 2
+; SSE-NEXT:    store i16 [[R29]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 29), align 2
+; SSE-NEXT:    store i16 [[R30]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 30), align 2
+; SSE-NEXT:    store i16 [[R31]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 31), align 2
+; SSE-NEXT:    ret void
+;
+; AVX-LABEL: @lshr_v32i16(
+; AVX-NEXT:    [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2
+; AVX-NEXT:    [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX-NEXT:    [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2
+; AVX-NEXT:    [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX-NEXT:    [[TMP5:%.*]] = lshr <16 x i16> [[TMP1]], [[TMP3]]
+; AVX-NEXT:    [[TMP6:%.*]] = lshr <16 x i16> [[TMP2]], [[TMP4]]
+; AVX-NEXT:    store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2
+; AVX-NEXT:    store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX-NEXT:    ret void
+;
+; AVX512-LABEL: @lshr_v32i16(
+; AVX512-NEXT:    [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2
+; AVX512-NEXT:    [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX512-NEXT:    [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2
+; AVX512-NEXT:    [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX512-NEXT:    [[TMP5:%.*]] = lshr <16 x i16> [[TMP1]], [[TMP3]]
+; AVX512-NEXT:    [[TMP6:%.*]] = lshr <16 x i16> [[TMP2]], [[TMP4]]
+; AVX512-NEXT:    store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2
+; AVX512-NEXT:    store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX512-NEXT:    ret void
+;
+; XOP-LABEL: @lshr_v32i16(
+; XOP-NEXT:    [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2
+; XOP-NEXT:    [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2
+; XOP-NEXT:    [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2
+; XOP-NEXT:    [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2
+; XOP-NEXT:    [[TMP5:%.*]] = lshr <16 x i16> [[TMP1]], [[TMP3]]
+; XOP-NEXT:    [[TMP6:%.*]] = lshr <16 x i16> [[TMP2]], [[TMP4]]
+; XOP-NEXT:    store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2
+; XOP-NEXT:    store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2
+; XOP-NEXT:    ret void
+;
+  %a0  = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 0 ), align 2
+  %a1  = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 1 ), align 2
+  %a2  = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 2 ), align 2
+  %a3  = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 3 ), align 2
+  %a4  = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 4 ), align 2
+  %a5  = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 5 ), align 2
+  %a6  = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 6 ), align 2
+  %a7  = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 7 ), align 2
+  %a8  = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8 ), align 2
+  %a9  = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 9 ), align 2
+  %a10 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 10), align 2
+  %a11 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 11), align 2
+  %a12 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 12), align 2
+  %a13 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 13), align 2
+  %a14 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 14), align 2
+  %a15 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 15), align 2
+  %a16 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16), align 2
+  %a17 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 17), align 2
+  %a18 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 18), align 2
+  %a19 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 19), align 2
+  %a20 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 20), align 2
+  %a21 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 21), align 2
+  %a22 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 22), align 2
+  %a23 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 23), align 2
+  %a24 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24), align 2
+  %a25 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 25), align 2
+  %a26 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 26), align 2
+  %a27 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 27), align 2
+  %a28 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 28), align 2
+  %a29 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 29), align 2
+  %a30 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 30), align 2
+  %a31 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 31), align 2
+  %b0  = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 0 ), align 2
+  %b1  = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 1 ), align 2
+  %b2  = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 2 ), align 2
+  %b3  = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 3 ), align 2
+  %b4  = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 4 ), align 2
+  %b5  = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 5 ), align 2
+  %b6  = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 6 ), align 2
+  %b7  = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 7 ), align 2
+  %b8  = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8 ), align 2
+  %b9  = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 9 ), align 2
+  %b10 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 10), align 2
+  %b11 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 11), align 2
+  %b12 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 12), align 2
+  %b13 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 13), align 2
+  %b14 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 14), align 2
+  %b15 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 15), align 2
+  %b16 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16), align 2
+  %b17 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 17), align 2
+  %b18 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 18), align 2
+  %b19 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 19), align 2
+  %b20 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 20), align 2
+  %b21 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 21), align 2
+  %b22 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 22), align 2
+  %b23 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 23), align 2
+  %b24 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24), align 2
+  %b25 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 25), align 2
+  %b26 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 26), align 2
+  %b27 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 27), align 2
+  %b28 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 28), align 2
+  %b29 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 29), align 2
+  %b30 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 30), align 2
+  %b31 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 31), align 2
+  %r0  = lshr i16 %a0 , %b0
+  %r1  = lshr i16 %a1 , %b1
+  %r2  = lshr i16 %a2 , %b2
+  %r3  = lshr i16 %a3 , %b3
+  %r4  = lshr i16 %a4 , %b4
+  %r5  = lshr i16 %a5 , %b5
+  %r6  = lshr i16 %a6 , %b6
+  %r7  = lshr i16 %a7 , %b7
+  %r8  = lshr i16 %a8 , %b8
+  %r9  = lshr i16 %a9 , %b9
+  %r10 = lshr i16 %a10, %b10
+  %r11 = lshr i16 %a11, %b11
+  %r12 = lshr i16 %a12, %b12
+  %r13 = lshr i16 %a13, %b13
+  %r14 = lshr i16 %a14, %b14
+  %r15 = lshr i16 %a15, %b15
+  %r16 = lshr i16 %a16, %b16
+  %r17 = lshr i16 %a17, %b17
+  %r18 = lshr i16 %a18, %b18
+  %r19 = lshr i16 %a19, %b19
+  %r20 = lshr i16 %a20, %b20
+  %r21 = lshr i16 %a21, %b21
+  %r22 = lshr i16 %a22, %b22
+  %r23 = lshr i16 %a23, %b23
+  %r24 = lshr i16 %a24, %b24
+  %r25 = lshr i16 %a25, %b25
+  %r26 = lshr i16 %a26, %b26
+  %r27 = lshr i16 %a27, %b27
+  %r28 = lshr i16 %a28, %b28
+  %r29 = lshr i16 %a29, %b29
+  %r30 = lshr i16 %a30, %b30
+  %r31 = lshr i16 %a31, %b31
+  store i16 %r0 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 0 ), align 2
+  store i16 %r1 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 1 ), align 2
+  store i16 %r2 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 2 ), align 2
+  store i16 %r3 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 3 ), align 2
+  store i16 %r4 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 4 ), align 2
+  store i16 %r5 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 5 ), align 2
+  store i16 %r6 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 6 ), align 2
+  store i16 %r7 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 7 ), align 2
+  store i16 %r8 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8 ), align 2
+  store i16 %r9 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 9 ), align 2
+  store i16 %r10, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 10), align 2
+  store i16 %r11, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 11), align 2
+  store i16 %r12, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 12), align 2
+  store i16 %r13, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 13), align 2
+  store i16 %r14, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 14), align 2
+  store i16 %r15, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 15), align 2
+  store i16 %r16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16), align 2
+  store i16 %r17, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 17), align 2
+  store i16 %r18, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 18), align 2
+  store i16 %r19, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 19), align 2
+  store i16 %r20, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 20), align 2
+  store i16 %r21, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 21), align 2
+  store i16 %r22, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 22), align 2
+  store i16 %r23, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 23), align 2
+  store i16 %r24, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24), align 2
+  store i16 %r25, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 25), align 2
+  store i16 %r26, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 26), align 2
+  store i16 %r27, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 27), align 2
+  store i16 %r28, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 28), align 2
+  store i16 %r29, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 29), align 2
+  store i16 %r30, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 30), align 2
+  store i16 %r31, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 31), align 2
+  ret void
+}
+
+define void @lshr_v64i8() {
+; CHECK-LABEL: @lshr_v64i8(
+; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1
+; CHECK-NEXT:    [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1
+; CHECK-NEXT:    [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1
+; CHECK-NEXT:    [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1
+; CHECK-NEXT:    [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1
+; CHECK-NEXT:    [[TMP9:%.*]] = lshr <16 x i8> [[TMP1]], [[TMP5]]
+; CHECK-NEXT:    [[TMP10:%.*]] = lshr <16 x i8> [[TMP2]], [[TMP6]]
+; CHECK-NEXT:    [[TMP11:%.*]] = lshr <16 x i8> [[TMP3]], [[TMP7]]
+; CHECK-NEXT:    [[TMP12:%.*]] = lshr <16 x i8> [[TMP4]], [[TMP8]]
+; CHECK-NEXT:    store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1
+; CHECK-NEXT:    store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1
+; CHECK-NEXT:    store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1
+; CHECK-NEXT:    store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1
+; CHECK-NEXT:    ret void
+;
+  %a0  = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 0 ), align 1
+  %a1  = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 1 ), align 1
+  %a2  = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 2 ), align 1
+  %a3  = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 3 ), align 1
+  %a4  = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 4 ), align 1
+  %a5  = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 5 ), align 1
+  %a6  = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 6 ), align 1
+  %a7  = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 7 ), align 1
+  %a8  = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 8 ), align 1
+  %a9  = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 9 ), align 1
+  %a10 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 10), align 1
+  %a11 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 11), align 1
+  %a12 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 12), align 1
+  %a13 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 13), align 1
+  %a14 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 14), align 1
+  %a15 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 15), align 1
+  %a16 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16), align 1
+  %a17 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 17), align 1
+  %a18 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 18), align 1
+  %a19 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 19), align 1
+  %a20 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 20), align 1
+  %a21 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 21), align 1
+  %a22 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 22), align 1
+  %a23 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 23), align 1
+  %a24 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 24), align 1
+  %a25 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 25), align 1
+  %a26 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 26), align 1
+  %a27 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 27), align 1
+  %a28 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 28), align 1
+  %a29 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 29), align 1
+  %a30 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 30), align 1
+  %a31 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 31), align 1
+  %a32 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32), align 1
+  %a33 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 33), align 1
+  %a34 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 34), align 1
+  %a35 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 35), align 1
+  %a36 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 36), align 1
+  %a37 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 37), align 1
+  %a38 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 38), align 1
+  %a39 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 39), align 1
+  %a40 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 40), align 1
+  %a41 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 41), align 1
+  %a42 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 42), align 1
+  %a43 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 43), align 1
+  %a44 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 44), align 1
+  %a45 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 45), align 1
+  %a46 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 46), align 1
+  %a47 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 47), align 1
+  %a48 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48), align 1
+  %a49 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 49), align 1
+  %a50 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 50), align 1
+  %a51 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 51), align 1
+  %a52 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 52), align 1
+  %a53 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 53), align 1
+  %a54 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 54), align 1
+  %a55 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 55), align 1
+  %a56 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 56), align 1
+  %a57 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 57), align 1
+  %a58 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 58), align 1
+  %a59 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 59), align 1
+  %a60 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 60), align 1
+  %a61 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 61), align 1
+  %a62 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 62), align 1
+  %a63 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 63), align 1
+  %b0  = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 0 ), align 1
+  %b1  = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 1 ), align 1
+  %b2  = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 2 ), align 1
+  %b3  = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 3 ), align 1
+  %b4  = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 4 ), align 1
+  %b5  = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 5 ), align 1
+  %b6  = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 6 ), align 1
+  %b7  = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 7 ), align 1
+  %b8  = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 8 ), align 1
+  %b9  = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 9 ), align 1
+  %b10 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 10), align 1
+  %b11 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 11), align 1
+  %b12 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 12), align 1
+  %b13 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 13), align 1
+  %b14 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 14), align 1
+  %b15 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 15), align 1
+  %b16 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16), align 1
+  %b17 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 17), align 1
+  %b18 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 18), align 1
+  %b19 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 19), align 1
+  %b20 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 20), align 1
+  %b21 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 21), align 1
+  %b22 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 22), align 1
+  %b23 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 23), align 1
+  %b24 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 24), align 1
+  %b25 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 25), align 1
+  %b26 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 26), align 1
+  %b27 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 27), align 1
+  %b28 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 28), align 1
+  %b29 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 29), align 1
+  %b30 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 30), align 1
+  %b31 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 31), align 1
+  %b32 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32), align 1
+  %b33 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 33), align 1
+  %b34 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 34), align 1
+  %b35 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 35), align 1
+  %b36 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 36), align 1
+  %b37 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 37), align 1
+  %b38 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 38), align 1
+  %b39 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 39), align 1
+  %b40 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 40), align 1
+  %b41 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 41), align 1
+  %b42 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 42), align 1
+  %b43 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 43), align 1
+  %b44 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 44), align 1
+  %b45 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 45), align 1
+  %b46 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 46), align 1
+  %b47 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 47), align 1
+  %b48 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48), align 1
+  %b49 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 49), align 1
+  %b50 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 50), align 1
+  %b51 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 51), align 1
+  %b52 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 52), align 1
+  %b53 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 53), align 1
+  %b54 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 54), align 1
+  %b55 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 55), align 1
+  %b56 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 56), align 1
+  %b57 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 57), align 1
+  %b58 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 58), align 1
+  %b59 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 59), align 1
+  %b60 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 60), align 1
+  %b61 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 61), align 1
+  %b62 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 62), align 1
+  %b63 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 63), align 1
+  %r0  = lshr i8 %a0 , %b0
+  %r1  = lshr i8 %a1 , %b1
+  %r2  = lshr i8 %a2 , %b2
+  %r3  = lshr i8 %a3 , %b3
+  %r4  = lshr i8 %a4 , %b4
+  %r5  = lshr i8 %a5 , %b5
+  %r6  = lshr i8 %a6 , %b6
+  %r7  = lshr i8 %a7 , %b7
+  %r8  = lshr i8 %a8 , %b8
+  %r9  = lshr i8 %a9 , %b9
+  %r10 = lshr i8 %a10, %b10
+  %r11 = lshr i8 %a11, %b11
+  %r12 = lshr i8 %a12, %b12
+  %r13 = lshr i8 %a13, %b13
+  %r14 = lshr i8 %a14, %b14
+  %r15 = lshr i8 %a15, %b15
+  %r16 = lshr i8 %a16, %b16
+  %r17 = lshr i8 %a17, %b17
+  %r18 = lshr i8 %a18, %b18
+  %r19 = lshr i8 %a19, %b19
+  %r20 = lshr i8 %a20, %b20
+  %r21 = lshr i8 %a21, %b21
+  %r22 = lshr i8 %a22, %b22
+  %r23 = lshr i8 %a23, %b23
+  %r24 = lshr i8 %a24, %b24
+  %r25 = lshr i8 %a25, %b25
+  %r26 = lshr i8 %a26, %b26
+  %r27 = lshr i8 %a27, %b27
+  %r28 = lshr i8 %a28, %b28
+  %r29 = lshr i8 %a29, %b29
+  %r30 = lshr i8 %a30, %b30
+  %r31 = lshr i8 %a31, %b31
+  %r32 = lshr i8 %a32, %b32
+  %r33 = lshr i8 %a33, %b33
+  %r34 = lshr i8 %a34, %b34
+  %r35 = lshr i8 %a35, %b35
+  %r36 = lshr i8 %a36, %b36
+  %r37 = lshr i8 %a37, %b37
+  %r38 = lshr i8 %a38, %b38
+  %r39 = lshr i8 %a39, %b39
+  %r40 = lshr i8 %a40, %b40
+  %r41 = lshr i8 %a41, %b41
+  %r42 = lshr i8 %a42, %b42
+  %r43 = lshr i8 %a43, %b43
+  %r44 = lshr i8 %a44, %b44
+  %r45 = lshr i8 %a45, %b45
+  %r46 = lshr i8 %a46, %b46
+  %r47 = lshr i8 %a47, %b47
+  %r48 = lshr i8 %a48, %b48
+  %r49 = lshr i8 %a49, %b49
+  %r50 = lshr i8 %a50, %b50
+  %r51 = lshr i8 %a51, %b51
+  %r52 = lshr i8 %a52, %b52
+  %r53 = lshr i8 %a53, %b53
+  %r54 = lshr i8 %a54, %b54
+  %r55 = lshr i8 %a55, %b55
+  %r56 = lshr i8 %a56, %b56
+  %r57 = lshr i8 %a57, %b57
+  %r58 = lshr i8 %a58, %b58
+  %r59 = lshr i8 %a59, %b59
+  %r60 = lshr i8 %a60, %b60
+  %r61 = lshr i8 %a61, %b61
+  %r62 = lshr i8 %a62, %b62
+  %r63 = lshr i8 %a63, %b63
+  store i8 %r0 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 0 ), align 1
+  store i8 %r1 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 1 ), align 1
+  store i8 %r2 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 2 ), align 1
+  store i8 %r3 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 3 ), align 1
+  store i8 %r4 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 4 ), align 1
+  store i8 %r5 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 5 ), align 1
+  store i8 %r6 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 6 ), align 1
+  store i8 %r7 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 7 ), align 1
+  store i8 %r8 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 8 ), align 1
+  store i8 %r9 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 9 ), align 1
+  store i8 %r10, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 10), align 1
+  store i8 %r11, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 11), align 1
+  store i8 %r12, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 12), align 1
+  store i8 %r13, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 13), align 1
+  store i8 %r14, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 14), align 1
+  store i8 %r15, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 15), align 1
+  store i8 %r16, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16), align 1
+  store i8 %r17, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 17), align 1
+  store i8 %r18, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 18), align 1
+  store i8 %r19, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 19), align 1
+  store i8 %r20, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 20), align 1
+  store i8 %r21, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 21), align 1
+  store i8 %r22, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 22), align 1
+  store i8 %r23, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 23), align 1
+  store i8 %r24, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 24), align 1
+  store i8 %r25, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 25), align 1
+  store i8 %r26, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 26), align 1
+  store i8 %r27, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 27), align 1
+  store i8 %r28, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 28), align 1
+  store i8 %r29, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 29), align 1
+  store i8 %r30, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 30), align 1
+  store i8 %r31, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 31), align 1
+  store i8 %r32, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32), align 1
+  store i8 %r33, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 33), align 1
+  store i8 %r34, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 34), align 1
+  store i8 %r35, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 35), align 1
+  store i8 %r36, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 36), align 1
+  store i8 %r37, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 37), align 1
+  store i8 %r38, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 38), align 1
+  store i8 %r39, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 39), align 1
+  store i8 %r40, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 40), align 1
+  store i8 %r41, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 41), align 1
+  store i8 %r42, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 42), align 1
+  store i8 %r43, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 43), align 1
+  store i8 %r44, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 44), align 1
+  store i8 %r45, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 45), align 1
+  store i8 %r46, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 46), align 1
+  store i8 %r47, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 47), align 1
+  store i8 %r48, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48), align 1
+  store i8 %r49, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 49), align 1
+  store i8 %r50, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 50), align 1
+  store i8 %r51, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 51), align 1
+  store i8 %r52, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 52), align 1
+  store i8 %r53, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 53), align 1
+  store i8 %r54, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 54), align 1
+  store i8 %r55, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 55), align 1
+  store i8 %r56, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 56), align 1
+  store i8 %r57, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 57), align 1
+  store i8 %r58, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 58), align 1
+  store i8 %r59, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 59), align 1
+  store i8 %r60, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 60), align 1
+  store i8 %r61, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 61), align 1
+  store i8 %r62, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 62), align 1
+  store i8 %r63, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 63), align 1
+  ret void
+}

Added: llvm/trunk/test/Transforms/SLPVectorizer/X86/shift-shl.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/shift-shl.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/shift-shl.ll (added)
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/shift-shl.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,815 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -mtriple=x86_64-unknown -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX1
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512F
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=-prefer-256-bit -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512BW
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=+prefer-256-bit -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=bdver4 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=XOP
+
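+; Check that the sequences of scalar shl over these arrays are vectorized, and
+; that the vector width chosen for each element type matches what the RUN-line
+; targets support (see the per-function comments below).
+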
+@a64 = common global [8 x i64] zeroinitializer, align 64
+@b64 = common global [8 x i64] zeroinitializer, align 64
+@c64 = common global [8 x i64] zeroinitializer, align 64
+@a32 = common global [16 x i32] zeroinitializer, align 64
+@b32 = common global [16 x i32] zeroinitializer, align 64
+@c32 = common global [16 x i32] zeroinitializer, align 64
+@a16 = common global [32 x i16] zeroinitializer, align 64
+@b16 = common global [32 x i16] zeroinitializer, align 64
+@c16 = common global [32 x i16] zeroinitializer, align 64
+@a8  = common global [64 x i8] zeroinitializer, align 64
+@b8  = common global [64 x i8] zeroinitializer, align 64
+@c8  = common global [64 x i8] zeroinitializer, align 64
+
+define void @shl_v8i64() {
+; SSE-LABEL: @shl_v8i64(
+; SSE-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8
+; SSE-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8
+; SSE-NEXT:    [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8
+; SSE-NEXT:    [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8
+; SSE-NEXT:    [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8
+; SSE-NEXT:    [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8
+; SSE-NEXT:    [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8
+; SSE-NEXT:    [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8
+; SSE-NEXT:    [[TMP9:%.*]] = shl <2 x i64> [[TMP1]], [[TMP5]]
+; SSE-NEXT:    [[TMP10:%.*]] = shl <2 x i64> [[TMP2]], [[TMP6]]
+; SSE-NEXT:    [[TMP11:%.*]] = shl <2 x i64> [[TMP3]], [[TMP7]]
+; SSE-NEXT:    [[TMP12:%.*]] = shl <2 x i64> [[TMP4]], [[TMP8]]
+; SSE-NEXT:    store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8
+; SSE-NEXT:    store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8
+; SSE-NEXT:    store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8
+; SSE-NEXT:    store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8
+; SSE-NEXT:    ret void
+;
+; AVX1-LABEL: @shl_v8i64(
+; AVX1-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8
+; AVX1-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8
+; AVX1-NEXT:    [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8
+; AVX1-NEXT:    [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8
+; AVX1-NEXT:    [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8
+; AVX1-NEXT:    [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8
+; AVX1-NEXT:    [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8
+; AVX1-NEXT:    [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8
+; AVX1-NEXT:    [[TMP9:%.*]] = shl <2 x i64> [[TMP1]], [[TMP5]]
+; AVX1-NEXT:    [[TMP10:%.*]] = shl <2 x i64> [[TMP2]], [[TMP6]]
+; AVX1-NEXT:    [[TMP11:%.*]] = shl <2 x i64> [[TMP3]], [[TMP7]]
+; AVX1-NEXT:    [[TMP12:%.*]] = shl <2 x i64> [[TMP4]], [[TMP8]]
+; AVX1-NEXT:    store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8
+; AVX1-NEXT:    store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8
+; AVX1-NEXT:    store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8
+; AVX1-NEXT:    store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8
+; AVX1-NEXT:    ret void
+;
+; AVX2-LABEL: @shl_v8i64(
+; AVX2-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8
+; AVX2-NEXT:    [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX2-NEXT:    [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8
+; AVX2-NEXT:    [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX2-NEXT:    [[TMP5:%.*]] = shl <4 x i64> [[TMP1]], [[TMP3]]
+; AVX2-NEXT:    [[TMP6:%.*]] = shl <4 x i64> [[TMP2]], [[TMP4]]
+; AVX2-NEXT:    store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8
+; AVX2-NEXT:    store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX2-NEXT:    ret void
+;
+; AVX512-LABEL: @shl_v8i64(
+; AVX512-NEXT:    [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @a64 to <8 x i64>*), align 8
+; AVX512-NEXT:    [[TMP2:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @b64 to <8 x i64>*), align 8
+; AVX512-NEXT:    [[TMP3:%.*]] = shl <8 x i64> [[TMP1]], [[TMP2]]
+; AVX512-NEXT:    store <8 x i64> [[TMP3]], <8 x i64>* bitcast ([8 x i64]* @c64 to <8 x i64>*), align 8
+; AVX512-NEXT:    ret void
+;
+; XOP-LABEL: @shl_v8i64(
+; XOP-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8
+; XOP-NEXT:    [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8
+; XOP-NEXT:    [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8
+; XOP-NEXT:    [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8
+; XOP-NEXT:    [[TMP5:%.*]] = shl <4 x i64> [[TMP1]], [[TMP3]]
+; XOP-NEXT:    [[TMP6:%.*]] = shl <4 x i64> [[TMP2]], [[TMP4]]
+; XOP-NEXT:    store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8
+; XOP-NEXT:    store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8
+; XOP-NEXT:    ret void
+;
+  %a0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8
+  %a1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8
+  %a2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8
+  %a3 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8
+  %a4 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8
+  %a5 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8
+  %a6 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8
+  %a7 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8
+  %b0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8
+  %b1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8
+  %b2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8
+  %b3 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8
+  %b4 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8
+  %b5 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8
+  %b6 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8
+  %b7 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8
+  %r0 = shl i64 %a0, %b0
+  %r1 = shl i64 %a1, %b1
+  %r2 = shl i64 %a2, %b2
+  %r3 = shl i64 %a3, %b3
+  %r4 = shl i64 %a4, %b4
+  %r5 = shl i64 %a5, %b5
+  %r6 = shl i64 %a6, %b6
+  %r7 = shl i64 %a7, %b7
+  store i64 %r0, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8
+  store i64 %r1, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8
+  store i64 %r2, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8
+  store i64 %r3, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8
+  store i64 %r4, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8
+  store i64 %r5, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8
+  store i64 %r6, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8
+  store i64 %r7, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8
+  ret void
+}
+
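+; For the i32 case the expected widths are <4 x i32> on SSE, <8 x i32> on
+; AVX/XOP, and a single <16 x i32> on AVX512.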
+define void @shl_v16i32() {
+; SSE-LABEL: @shl_v16i32(
+; SSE-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4
+; SSE-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4
+; SSE-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4
+; SSE-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4
+; SSE-NEXT:    [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4
+; SSE-NEXT:    [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4
+; SSE-NEXT:    [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4
+; SSE-NEXT:    [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4
+; SSE-NEXT:    [[TMP9:%.*]] = shl <4 x i32> [[TMP1]], [[TMP5]]
+; SSE-NEXT:    [[TMP10:%.*]] = shl <4 x i32> [[TMP2]], [[TMP6]]
+; SSE-NEXT:    [[TMP11:%.*]] = shl <4 x i32> [[TMP3]], [[TMP7]]
+; SSE-NEXT:    [[TMP12:%.*]] = shl <4 x i32> [[TMP4]], [[TMP8]]
+; SSE-NEXT:    store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4
+; SSE-NEXT:    store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4
+; SSE-NEXT:    store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4
+; SSE-NEXT:    store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4
+; SSE-NEXT:    ret void
+;
+; AVX-LABEL: @shl_v16i32(
+; AVX-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4
+; AVX-NEXT:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4
+; AVX-NEXT:    [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4
+; AVX-NEXT:    [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4
+; AVX-NEXT:    [[TMP5:%.*]] = shl <8 x i32> [[TMP1]], [[TMP3]]
+; AVX-NEXT:    [[TMP6:%.*]] = shl <8 x i32> [[TMP2]], [[TMP4]]
+; AVX-NEXT:    store <8 x i32> [[TMP5]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4
+; AVX-NEXT:    store <8 x i32> [[TMP6]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4
+; AVX-NEXT:    ret void
+;
+; AVX512-LABEL: @shl_v16i32(
+; AVX512-NEXT:    [[TMP1:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([16 x i32]* @a32 to <16 x i32>*), align 4
+; AVX512-NEXT:    [[TMP2:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([16 x i32]* @b32 to <16 x i32>*), align 4
+; AVX512-NEXT:    [[TMP3:%.*]] = shl <16 x i32> [[TMP1]], [[TMP2]]
+; AVX512-NEXT:    store <16 x i32> [[TMP3]], <16 x i32>* bitcast ([16 x i32]* @c32 to <16 x i32>*), align 4
+; AVX512-NEXT:    ret void
+;
+; XOP-LABEL: @shl_v16i32(
+; XOP-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4
+; XOP-NEXT:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4
+; XOP-NEXT:    [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4
+; XOP-NEXT:    [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4
+; XOP-NEXT:    [[TMP5:%.*]] = shl <8 x i32> [[TMP1]], [[TMP3]]
+; XOP-NEXT:    [[TMP6:%.*]] = shl <8 x i32> [[TMP2]], [[TMP4]]
+; XOP-NEXT:    store <8 x i32> [[TMP5]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4
+; XOP-NEXT:    store <8 x i32> [[TMP6]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4
+; XOP-NEXT:    ret void
+;
+  %a0  = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 0 ), align 4
+  %a1  = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 1 ), align 4
+  %a2  = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 2 ), align 4
+  %a3  = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 3 ), align 4
+  %a4  = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4 ), align 4
+  %a5  = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 5 ), align 4
+  %a6  = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 6 ), align 4
+  %a7  = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 7 ), align 4
+  %a8  = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8 ), align 4
+  %a9  = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 9 ), align 4
+  %a10 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 10), align 4
+  %a11 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 11), align 4
+  %a12 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12), align 4
+  %a13 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 13), align 4
+  %a14 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 14), align 4
+  %a15 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 15), align 4
+  %b0  = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 0 ), align 4
+  %b1  = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 1 ), align 4
+  %b2  = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 2 ), align 4
+  %b3  = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 3 ), align 4
+  %b4  = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4 ), align 4
+  %b5  = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 5 ), align 4
+  %b6  = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 6 ), align 4
+  %b7  = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 7 ), align 4
+  %b8  = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8 ), align 4
+  %b9  = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 9 ), align 4
+  %b10 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 10), align 4
+  %b11 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 11), align 4
+  %b12 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12), align 4
+  %b13 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 13), align 4
+  %b14 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 14), align 4
+  %b15 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 15), align 4
+  %r0  = shl i32 %a0 , %b0
+  %r1  = shl i32 %a1 , %b1
+  %r2  = shl i32 %a2 , %b2
+  %r3  = shl i32 %a3 , %b3
+  %r4  = shl i32 %a4 , %b4
+  %r5  = shl i32 %a5 , %b5
+  %r6  = shl i32 %a6 , %b6
+  %r7  = shl i32 %a7 , %b7
+  %r8  = shl i32 %a8 , %b8
+  %r9  = shl i32 %a9 , %b9
+  %r10 = shl i32 %a10, %b10
+  %r11 = shl i32 %a11, %b11
+  %r12 = shl i32 %a12, %b12
+  %r13 = shl i32 %a13, %b13
+  %r14 = shl i32 %a14, %b14
+  %r15 = shl i32 %a15, %b15
+  store i32 %r0 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 0 ), align 4
+  store i32 %r1 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 1 ), align 4
+  store i32 %r2 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 2 ), align 4
+  store i32 %r3 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 3 ), align 4
+  store i32 %r4 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4 ), align 4
+  store i32 %r5 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 5 ), align 4
+  store i32 %r6 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 6 ), align 4
+  store i32 %r7 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 7 ), align 4
+  store i32 %r8 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8 ), align 4
+  store i32 %r9 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 9 ), align 4
+  store i32 %r10, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 10), align 4
+  store i32 %r11, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 11), align 4
+  store i32 %r12, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12), align 4
+  store i32 %r13, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 13), align 4
+  store i32 %r14, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 14), align 4
+  store i32 %r15, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 15), align 4
+  ret void
+}
+
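+; For the i16 case the SSE run is expected to stay scalar, while AVX, AVX512
+; and XOP each vectorize at <16 x i16>.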
+define void @shl_v32i16() {
+; SSE-LABEL: @shl_v32i16(
+; SSE-NEXT:    [[A0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 0), align 2
+; SSE-NEXT:    [[A1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 1), align 2
+; SSE-NEXT:    [[A2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 2), align 2
+; SSE-NEXT:    [[A3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 3), align 2
+; SSE-NEXT:    [[A4:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 4), align 2
+; SSE-NEXT:    [[A5:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 5), align 2
+; SSE-NEXT:    [[A6:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 6), align 2
+; SSE-NEXT:    [[A7:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 7), align 2
+; SSE-NEXT:    [[A8:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8), align 2
+; SSE-NEXT:    [[A9:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 9), align 2
+; SSE-NEXT:    [[A10:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 10), align 2
+; SSE-NEXT:    [[A11:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 11), align 2
+; SSE-NEXT:    [[A12:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 12), align 2
+; SSE-NEXT:    [[A13:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 13), align 2
+; SSE-NEXT:    [[A14:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 14), align 2
+; SSE-NEXT:    [[A15:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 15), align 2
+; SSE-NEXT:    [[A16:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16), align 2
+; SSE-NEXT:    [[A17:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 17), align 2
+; SSE-NEXT:    [[A18:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 18), align 2
+; SSE-NEXT:    [[A19:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 19), align 2
+; SSE-NEXT:    [[A20:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 20), align 2
+; SSE-NEXT:    [[A21:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 21), align 2
+; SSE-NEXT:    [[A22:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 22), align 2
+; SSE-NEXT:    [[A23:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 23), align 2
+; SSE-NEXT:    [[A24:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24), align 2
+; SSE-NEXT:    [[A25:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 25), align 2
+; SSE-NEXT:    [[A26:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 26), align 2
+; SSE-NEXT:    [[A27:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 27), align 2
+; SSE-NEXT:    [[A28:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 28), align 2
+; SSE-NEXT:    [[A29:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 29), align 2
+; SSE-NEXT:    [[A30:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 30), align 2
+; SSE-NEXT:    [[A31:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 31), align 2
+; SSE-NEXT:    [[B0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 0), align 2
+; SSE-NEXT:    [[B1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 1), align 2
+; SSE-NEXT:    [[B2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 2), align 2
+; SSE-NEXT:    [[B3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 3), align 2
+; SSE-NEXT:    [[B4:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 4), align 2
+; SSE-NEXT:    [[B5:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 5), align 2
+; SSE-NEXT:    [[B6:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 6), align 2
+; SSE-NEXT:    [[B7:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 7), align 2
+; SSE-NEXT:    [[B8:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8), align 2
+; SSE-NEXT:    [[B9:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 9), align 2
+; SSE-NEXT:    [[B10:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 10), align 2
+; SSE-NEXT:    [[B11:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 11), align 2
+; SSE-NEXT:    [[B12:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 12), align 2
+; SSE-NEXT:    [[B13:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 13), align 2
+; SSE-NEXT:    [[B14:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 14), align 2
+; SSE-NEXT:    [[B15:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 15), align 2
+; SSE-NEXT:    [[B16:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16), align 2
+; SSE-NEXT:    [[B17:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 17), align 2
+; SSE-NEXT:    [[B18:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 18), align 2
+; SSE-NEXT:    [[B19:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 19), align 2
+; SSE-NEXT:    [[B20:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 20), align 2
+; SSE-NEXT:    [[B21:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 21), align 2
+; SSE-NEXT:    [[B22:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 22), align 2
+; SSE-NEXT:    [[B23:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 23), align 2
+; SSE-NEXT:    [[B24:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24), align 2
+; SSE-NEXT:    [[B25:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 25), align 2
+; SSE-NEXT:    [[B26:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 26), align 2
+; SSE-NEXT:    [[B27:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 27), align 2
+; SSE-NEXT:    [[B28:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 28), align 2
+; SSE-NEXT:    [[B29:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 29), align 2
+; SSE-NEXT:    [[B30:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 30), align 2
+; SSE-NEXT:    [[B31:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 31), align 2
+; SSE-NEXT:    [[R0:%.*]] = shl i16 [[A0]], [[B0]]
+; SSE-NEXT:    [[R1:%.*]] = shl i16 [[A1]], [[B1]]
+; SSE-NEXT:    [[R2:%.*]] = shl i16 [[A2]], [[B2]]
+; SSE-NEXT:    [[R3:%.*]] = shl i16 [[A3]], [[B3]]
+; SSE-NEXT:    [[R4:%.*]] = shl i16 [[A4]], [[B4]]
+; SSE-NEXT:    [[R5:%.*]] = shl i16 [[A5]], [[B5]]
+; SSE-NEXT:    [[R6:%.*]] = shl i16 [[A6]], [[B6]]
+; SSE-NEXT:    [[R7:%.*]] = shl i16 [[A7]], [[B7]]
+; SSE-NEXT:    [[R8:%.*]] = shl i16 [[A8]], [[B8]]
+; SSE-NEXT:    [[R9:%.*]] = shl i16 [[A9]], [[B9]]
+; SSE-NEXT:    [[R10:%.*]] = shl i16 [[A10]], [[B10]]
+; SSE-NEXT:    [[R11:%.*]] = shl i16 [[A11]], [[B11]]
+; SSE-NEXT:    [[R12:%.*]] = shl i16 [[A12]], [[B12]]
+; SSE-NEXT:    [[R13:%.*]] = shl i16 [[A13]], [[B13]]
+; SSE-NEXT:    [[R14:%.*]] = shl i16 [[A14]], [[B14]]
+; SSE-NEXT:    [[R15:%.*]] = shl i16 [[A15]], [[B15]]
+; SSE-NEXT:    [[R16:%.*]] = shl i16 [[A16]], [[B16]]
+; SSE-NEXT:    [[R17:%.*]] = shl i16 [[A17]], [[B17]]
+; SSE-NEXT:    [[R18:%.*]] = shl i16 [[A18]], [[B18]]
+; SSE-NEXT:    [[R19:%.*]] = shl i16 [[A19]], [[B19]]
+; SSE-NEXT:    [[R20:%.*]] = shl i16 [[A20]], [[B20]]
+; SSE-NEXT:    [[R21:%.*]] = shl i16 [[A21]], [[B21]]
+; SSE-NEXT:    [[R22:%.*]] = shl i16 [[A22]], [[B22]]
+; SSE-NEXT:    [[R23:%.*]] = shl i16 [[A23]], [[B23]]
+; SSE-NEXT:    [[R24:%.*]] = shl i16 [[A24]], [[B24]]
+; SSE-NEXT:    [[R25:%.*]] = shl i16 [[A25]], [[B25]]
+; SSE-NEXT:    [[R26:%.*]] = shl i16 [[A26]], [[B26]]
+; SSE-NEXT:    [[R27:%.*]] = shl i16 [[A27]], [[B27]]
+; SSE-NEXT:    [[R28:%.*]] = shl i16 [[A28]], [[B28]]
+; SSE-NEXT:    [[R29:%.*]] = shl i16 [[A29]], [[B29]]
+; SSE-NEXT:    [[R30:%.*]] = shl i16 [[A30]], [[B30]]
+; SSE-NEXT:    [[R31:%.*]] = shl i16 [[A31]], [[B31]]
+; SSE-NEXT:    store i16 [[R0]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 0), align 2
+; SSE-NEXT:    store i16 [[R1]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 1), align 2
+; SSE-NEXT:    store i16 [[R2]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 2), align 2
+; SSE-NEXT:    store i16 [[R3]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 3), align 2
+; SSE-NEXT:    store i16 [[R4]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 4), align 2
+; SSE-NEXT:    store i16 [[R5]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 5), align 2
+; SSE-NEXT:    store i16 [[R6]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 6), align 2
+; SSE-NEXT:    store i16 [[R7]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 7), align 2
+; SSE-NEXT:    store i16 [[R8]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8), align 2
+; SSE-NEXT:    store i16 [[R9]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 9), align 2
+; SSE-NEXT:    store i16 [[R10]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 10), align 2
+; SSE-NEXT:    store i16 [[R11]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 11), align 2
+; SSE-NEXT:    store i16 [[R12]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 12), align 2
+; SSE-NEXT:    store i16 [[R13]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 13), align 2
+; SSE-NEXT:    store i16 [[R14]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 14), align 2
+; SSE-NEXT:    store i16 [[R15]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 15), align 2
+; SSE-NEXT:    store i16 [[R16]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16), align 2
+; SSE-NEXT:    store i16 [[R17]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 17), align 2
+; SSE-NEXT:    store i16 [[R18]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 18), align 2
+; SSE-NEXT:    store i16 [[R19]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 19), align 2
+; SSE-NEXT:    store i16 [[R20]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 20), align 2
+; SSE-NEXT:    store i16 [[R21]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 21), align 2
+; SSE-NEXT:    store i16 [[R22]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 22), align 2
+; SSE-NEXT:    store i16 [[R23]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 23), align 2
+; SSE-NEXT:    store i16 [[R24]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24), align 2
+; SSE-NEXT:    store i16 [[R25]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 25), align 2
+; SSE-NEXT:    store i16 [[R26]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 26), align 2
+; SSE-NEXT:    store i16 [[R27]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 27), align 2
+; SSE-NEXT:    store i16 [[R28]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 28), align 2
+; SSE-NEXT:    store i16 [[R29]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 29), align 2
+; SSE-NEXT:    store i16 [[R30]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 30), align 2
+; SSE-NEXT:    store i16 [[R31]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 31), align 2
+; SSE-NEXT:    ret void
+;
+; AVX-LABEL: @shl_v32i16(
+; AVX-NEXT:    [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2
+; AVX-NEXT:    [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX-NEXT:    [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2
+; AVX-NEXT:    [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX-NEXT:    [[TMP5:%.*]] = shl <16 x i16> [[TMP1]], [[TMP3]]
+; AVX-NEXT:    [[TMP6:%.*]] = shl <16 x i16> [[TMP2]], [[TMP4]]
+; AVX-NEXT:    store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2
+; AVX-NEXT:    store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX-NEXT:    ret void
+;
+; AVX512-LABEL: @shl_v32i16(
+; AVX512-NEXT:    [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2
+; AVX512-NEXT:    [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX512-NEXT:    [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2
+; AVX512-NEXT:    [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX512-NEXT:    [[TMP5:%.*]] = shl <16 x i16> [[TMP1]], [[TMP3]]
+; AVX512-NEXT:    [[TMP6:%.*]] = shl <16 x i16> [[TMP2]], [[TMP4]]
+; AVX512-NEXT:    store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2
+; AVX512-NEXT:    store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX512-NEXT:    ret void
+;
+; XOP-LABEL: @shl_v32i16(
+; XOP-NEXT:    [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2
+; XOP-NEXT:    [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2
+; XOP-NEXT:    [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2
+; XOP-NEXT:    [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2
+; XOP-NEXT:    [[TMP5:%.*]] = shl <16 x i16> [[TMP1]], [[TMP3]]
+; XOP-NEXT:    [[TMP6:%.*]] = shl <16 x i16> [[TMP2]], [[TMP4]]
+; XOP-NEXT:    store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2
+; XOP-NEXT:    store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2
+; XOP-NEXT:    ret void
+;
+  %a0  = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 0 ), align 2
+  %a1  = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 1 ), align 2
+  %a2  = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 2 ), align 2
+  %a3  = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 3 ), align 2
+  %a4  = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 4 ), align 2
+  %a5  = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 5 ), align 2
+  %a6  = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 6 ), align 2
+  %a7  = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 7 ), align 2
+  %a8  = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8 ), align 2
+  %a9  = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 9 ), align 2
+  %a10 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 10), align 2
+  %a11 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 11), align 2
+  %a12 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 12), align 2
+  %a13 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 13), align 2
+  %a14 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 14), align 2
+  %a15 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 15), align 2
+  %a16 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16), align 2
+  %a17 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 17), align 2
+  %a18 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 18), align 2
+  %a19 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 19), align 2
+  %a20 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 20), align 2
+  %a21 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 21), align 2
+  %a22 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 22), align 2
+  %a23 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 23), align 2
+  %a24 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24), align 2
+  %a25 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 25), align 2
+  %a26 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 26), align 2
+  %a27 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 27), align 2
+  %a28 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 28), align 2
+  %a29 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 29), align 2
+  %a30 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 30), align 2
+  %a31 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 31), align 2
+  %b0  = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 0 ), align 2
+  %b1  = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 1 ), align 2
+  %b2  = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 2 ), align 2
+  %b3  = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 3 ), align 2
+  %b4  = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 4 ), align 2
+  %b5  = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 5 ), align 2
+  %b6  = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 6 ), align 2
+  %b7  = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 7 ), align 2
+  %b8  = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8 ), align 2
+  %b9  = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 9 ), align 2
+  %b10 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 10), align 2
+  %b11 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 11), align 2
+  %b12 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 12), align 2
+  %b13 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 13), align 2
+  %b14 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 14), align 2
+  %b15 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 15), align 2
+  %b16 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16), align 2
+  %b17 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 17), align 2
+  %b18 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 18), align 2
+  %b19 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 19), align 2
+  %b20 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 20), align 2
+  %b21 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 21), align 2
+  %b22 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 22), align 2
+  %b23 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 23), align 2
+  %b24 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24), align 2
+  %b25 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 25), align 2
+  %b26 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 26), align 2
+  %b27 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 27), align 2
+  %b28 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 28), align 2
+  %b29 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 29), align 2
+  %b30 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 30), align 2
+  %b31 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 31), align 2
+  %r0  = shl i16 %a0 , %b0
+  %r1  = shl i16 %a1 , %b1
+  %r2  = shl i16 %a2 , %b2
+  %r3  = shl i16 %a3 , %b3
+  %r4  = shl i16 %a4 , %b4
+  %r5  = shl i16 %a5 , %b5
+  %r6  = shl i16 %a6 , %b6
+  %r7  = shl i16 %a7 , %b7
+  %r8  = shl i16 %a8 , %b8
+  %r9  = shl i16 %a9 , %b9
+  %r10 = shl i16 %a10, %b10
+  %r11 = shl i16 %a11, %b11
+  %r12 = shl i16 %a12, %b12
+  %r13 = shl i16 %a13, %b13
+  %r14 = shl i16 %a14, %b14
+  %r15 = shl i16 %a15, %b15
+  %r16 = shl i16 %a16, %b16
+  %r17 = shl i16 %a17, %b17
+  %r18 = shl i16 %a18, %b18
+  %r19 = shl i16 %a19, %b19
+  %r20 = shl i16 %a20, %b20
+  %r21 = shl i16 %a21, %b21
+  %r22 = shl i16 %a22, %b22
+  %r23 = shl i16 %a23, %b23
+  %r24 = shl i16 %a24, %b24
+  %r25 = shl i16 %a25, %b25
+  %r26 = shl i16 %a26, %b26
+  %r27 = shl i16 %a27, %b27
+  %r28 = shl i16 %a28, %b28
+  %r29 = shl i16 %a29, %b29
+  %r30 = shl i16 %a30, %b30
+  %r31 = shl i16 %a31, %b31
+  store i16 %r0 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 0 ), align 2
+  store i16 %r1 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 1 ), align 2
+  store i16 %r2 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 2 ), align 2
+  store i16 %r3 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 3 ), align 2
+  store i16 %r4 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 4 ), align 2
+  store i16 %r5 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 5 ), align 2
+  store i16 %r6 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 6 ), align 2
+  store i16 %r7 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 7 ), align 2
+  store i16 %r8 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8 ), align 2
+  store i16 %r9 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 9 ), align 2
+  store i16 %r10, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 10), align 2
+  store i16 %r11, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 11), align 2
+  store i16 %r12, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 12), align 2
+  store i16 %r13, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 13), align 2
+  store i16 %r14, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 14), align 2
+  store i16 %r15, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 15), align 2
+  store i16 %r16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16), align 2
+  store i16 %r17, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 17), align 2
+  store i16 %r18, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 18), align 2
+  store i16 %r19, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 19), align 2
+  store i16 %r20, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 20), align 2
+  store i16 %r21, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 21), align 2
+  store i16 %r22, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 22), align 2
+  store i16 %r23, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 23), align 2
+  store i16 %r24, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24), align 2
+  store i16 %r25, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 25), align 2
+  store i16 %r26, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 26), align 2
+  store i16 %r27, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 27), align 2
+  store i16 %r28, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 28), align 2
+  store i16 %r29, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 29), align 2
+  store i16 %r30, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 30), align 2
+  store i16 %r31, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 31), align 2
+  ret void
+}
+
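+; All run lines share the i8 expectation: four <16 x i8> shifts.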
+define void @shl_v64i8() {
+; CHECK-LABEL: @shl_v64i8(
+; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1
+; CHECK-NEXT:    [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1
+; CHECK-NEXT:    [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1
+; CHECK-NEXT:    [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1
+; CHECK-NEXT:    [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1
+; CHECK-NEXT:    [[TMP9:%.*]] = shl <16 x i8> [[TMP1]], [[TMP5]]
+; CHECK-NEXT:    [[TMP10:%.*]] = shl <16 x i8> [[TMP2]], [[TMP6]]
+; CHECK-NEXT:    [[TMP11:%.*]] = shl <16 x i8> [[TMP3]], [[TMP7]]
+; CHECK-NEXT:    [[TMP12:%.*]] = shl <16 x i8> [[TMP4]], [[TMP8]]
+; CHECK-NEXT:    store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1
+; CHECK-NEXT:    store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1
+; CHECK-NEXT:    store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1
+; CHECK-NEXT:    store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1
+; CHECK-NEXT:    ret void
+;
+  %a0  = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 0 ), align 1
+  %a1  = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 1 ), align 1
+  %a2  = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 2 ), align 1
+  %a3  = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 3 ), align 1
+  %a4  = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 4 ), align 1
+  %a5  = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 5 ), align 1
+  %a6  = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 6 ), align 1
+  %a7  = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 7 ), align 1
+  %a8  = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 8 ), align 1
+  %a9  = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 9 ), align 1
+  %a10 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 10), align 1
+  %a11 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 11), align 1
+  %a12 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 12), align 1
+  %a13 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 13), align 1
+  %a14 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 14), align 1
+  %a15 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 15), align 1
+  %a16 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16), align 1
+  %a17 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 17), align 1
+  %a18 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 18), align 1
+  %a19 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 19), align 1
+  %a20 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 20), align 1
+  %a21 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 21), align 1
+  %a22 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 22), align 1
+  %a23 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 23), align 1
+  %a24 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 24), align 1
+  %a25 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 25), align 1
+  %a26 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 26), align 1
+  %a27 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 27), align 1
+  %a28 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 28), align 1
+  %a29 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 29), align 1
+  %a30 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 30), align 1
+  %a31 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 31), align 1
+  %a32 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32), align 1
+  %a33 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 33), align 1
+  %a34 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 34), align 1
+  %a35 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 35), align 1
+  %a36 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 36), align 1
+  %a37 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 37), align 1
+  %a38 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 38), align 1
+  %a39 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 39), align 1
+  %a40 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 40), align 1
+  %a41 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 41), align 1
+  %a42 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 42), align 1
+  %a43 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 43), align 1
+  %a44 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 44), align 1
+  %a45 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 45), align 1
+  %a46 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 46), align 1
+  %a47 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 47), align 1
+  %a48 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48), align 1
+  %a49 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 49), align 1
+  %a50 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 50), align 1
+  %a51 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 51), align 1
+  %a52 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 52), align 1
+  %a53 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 53), align 1
+  %a54 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 54), align 1
+  %a55 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 55), align 1
+  %a56 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 56), align 1
+  %a57 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 57), align 1
+  %a58 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 58), align 1
+  %a59 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 59), align 1
+  %a60 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 60), align 1
+  %a61 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 61), align 1
+  %a62 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 62), align 1
+  %a63 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 63), align 1
+  %b0  = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 0 ), align 1
+  %b1  = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 1 ), align 1
+  %b2  = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 2 ), align 1
+  %b3  = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 3 ), align 1
+  %b4  = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 4 ), align 1
+  %b5  = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 5 ), align 1
+  %b6  = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 6 ), align 1
+  %b7  = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 7 ), align 1
+  %b8  = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 8 ), align 1
+  %b9  = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 9 ), align 1
+  %b10 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 10), align 1
+  %b11 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 11), align 1
+  %b12 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 12), align 1
+  %b13 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 13), align 1
+  %b14 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 14), align 1
+  %b15 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 15), align 1
+  %b16 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16), align 1
+  %b17 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 17), align 1
+  %b18 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 18), align 1
+  %b19 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 19), align 1
+  %b20 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 20), align 1
+  %b21 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 21), align 1
+  %b22 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 22), align 1
+  %b23 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 23), align 1
+  %b24 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 24), align 1
+  %b25 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 25), align 1
+  %b26 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 26), align 1
+  %b27 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 27), align 1
+  %b28 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 28), align 1
+  %b29 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 29), align 1
+  %b30 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 30), align 1
+  %b31 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 31), align 1
+  %b32 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32), align 1
+  %b33 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 33), align 1
+  %b34 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 34), align 1
+  %b35 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 35), align 1
+  %b36 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 36), align 1
+  %b37 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 37), align 1
+  %b38 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 38), align 1
+  %b39 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 39), align 1
+  %b40 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 40), align 1
+  %b41 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 41), align 1
+  %b42 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 42), align 1
+  %b43 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 43), align 1
+  %b44 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 44), align 1
+  %b45 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 45), align 1
+  %b46 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 46), align 1
+  %b47 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 47), align 1
+  %b48 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48), align 1
+  %b49 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 49), align 1
+  %b50 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 50), align 1
+  %b51 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 51), align 1
+  %b52 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 52), align 1
+  %b53 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 53), align 1
+  %b54 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 54), align 1
+  %b55 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 55), align 1
+  %b56 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 56), align 1
+  %b57 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 57), align 1
+  %b58 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 58), align 1
+  %b59 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 59), align 1
+  %b60 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 60), align 1
+  %b61 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 61), align 1
+  %b62 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 62), align 1
+  %b63 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 63), align 1
+  %r0  = shl i8 %a0 , %b0
+  %r1  = shl i8 %a1 , %b1
+  %r2  = shl i8 %a2 , %b2
+  %r3  = shl i8 %a3 , %b3
+  %r4  = shl i8 %a4 , %b4
+  %r5  = shl i8 %a5 , %b5
+  %r6  = shl i8 %a6 , %b6
+  %r7  = shl i8 %a7 , %b7
+  %r8  = shl i8 %a8 , %b8
+  %r9  = shl i8 %a9 , %b9
+  %r10 = shl i8 %a10, %b10
+  %r11 = shl i8 %a11, %b11
+  %r12 = shl i8 %a12, %b12
+  %r13 = shl i8 %a13, %b13
+  %r14 = shl i8 %a14, %b14
+  %r15 = shl i8 %a15, %b15
+  %r16 = shl i8 %a16, %b16
+  %r17 = shl i8 %a17, %b17
+  %r18 = shl i8 %a18, %b18
+  %r19 = shl i8 %a19, %b19
+  %r20 = shl i8 %a20, %b20
+  %r21 = shl i8 %a21, %b21
+  %r22 = shl i8 %a22, %b22
+  %r23 = shl i8 %a23, %b23
+  %r24 = shl i8 %a24, %b24
+  %r25 = shl i8 %a25, %b25
+  %r26 = shl i8 %a26, %b26
+  %r27 = shl i8 %a27, %b27
+  %r28 = shl i8 %a28, %b28
+  %r29 = shl i8 %a29, %b29
+  %r30 = shl i8 %a30, %b30
+  %r31 = shl i8 %a31, %b31
+  %r32 = shl i8 %a32, %b32
+  %r33 = shl i8 %a33, %b33
+  %r34 = shl i8 %a34, %b34
+  %r35 = shl i8 %a35, %b35
+  %r36 = shl i8 %a36, %b36
+  %r37 = shl i8 %a37, %b37
+  %r38 = shl i8 %a38, %b38
+  %r39 = shl i8 %a39, %b39
+  %r40 = shl i8 %a40, %b40
+  %r41 = shl i8 %a41, %b41
+  %r42 = shl i8 %a42, %b42
+  %r43 = shl i8 %a43, %b43
+  %r44 = shl i8 %a44, %b44
+  %r45 = shl i8 %a45, %b45
+  %r46 = shl i8 %a46, %b46
+  %r47 = shl i8 %a47, %b47
+  %r48 = shl i8 %a48, %b48
+  %r49 = shl i8 %a49, %b49
+  %r50 = shl i8 %a50, %b50
+  %r51 = shl i8 %a51, %b51
+  %r52 = shl i8 %a52, %b52
+  %r53 = shl i8 %a53, %b53
+  %r54 = shl i8 %a54, %b54
+  %r55 = shl i8 %a55, %b55
+  %r56 = shl i8 %a56, %b56
+  %r57 = shl i8 %a57, %b57
+  %r58 = shl i8 %a58, %b58
+  %r59 = shl i8 %a59, %b59
+  %r60 = shl i8 %a60, %b60
+  %r61 = shl i8 %a61, %b61
+  %r62 = shl i8 %a62, %b62
+  %r63 = shl i8 %a63, %b63
+  store i8 %r0 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 0 ), align 1
+  store i8 %r1 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 1 ), align 1
+  store i8 %r2 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 2 ), align 1
+  store i8 %r3 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 3 ), align 1
+  store i8 %r4 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 4 ), align 1
+  store i8 %r5 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 5 ), align 1
+  store i8 %r6 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 6 ), align 1
+  store i8 %r7 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 7 ), align 1
+  store i8 %r8 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 8 ), align 1
+  store i8 %r9 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 9 ), align 1
+  store i8 %r10, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 10), align 1
+  store i8 %r11, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 11), align 1
+  store i8 %r12, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 12), align 1
+  store i8 %r13, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 13), align 1
+  store i8 %r14, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 14), align 1
+  store i8 %r15, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 15), align 1
+  store i8 %r16, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16), align 1
+  store i8 %r17, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 17), align 1
+  store i8 %r18, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 18), align 1
+  store i8 %r19, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 19), align 1
+  store i8 %r20, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 20), align 1
+  store i8 %r21, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 21), align 1
+  store i8 %r22, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 22), align 1
+  store i8 %r23, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 23), align 1
+  store i8 %r24, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 24), align 1
+  store i8 %r25, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 25), align 1
+  store i8 %r26, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 26), align 1
+  store i8 %r27, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 27), align 1
+  store i8 %r28, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 28), align 1
+  store i8 %r29, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 29), align 1
+  store i8 %r30, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 30), align 1
+  store i8 %r31, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 31), align 1
+  store i8 %r32, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32), align 1
+  store i8 %r33, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 33), align 1
+  store i8 %r34, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 34), align 1
+  store i8 %r35, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 35), align 1
+  store i8 %r36, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 36), align 1
+  store i8 %r37, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 37), align 1
+  store i8 %r38, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 38), align 1
+  store i8 %r39, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 39), align 1
+  store i8 %r40, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 40), align 1
+  store i8 %r41, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 41), align 1
+  store i8 %r42, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 42), align 1
+  store i8 %r43, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 43), align 1
+  store i8 %r44, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 44), align 1
+  store i8 %r45, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 45), align 1
+  store i8 %r46, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 46), align 1
+  store i8 %r47, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 47), align 1
+  store i8 %r48, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48), align 1
+  store i8 %r49, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 49), align 1
+  store i8 %r50, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 50), align 1
+  store i8 %r51, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 51), align 1
+  store i8 %r52, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 52), align 1
+  store i8 %r53, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 53), align 1
+  store i8 %r54, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 54), align 1
+  store i8 %r55, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 55), align 1
+  store i8 %r56, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 56), align 1
+  store i8 %r57, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 57), align 1
+  store i8 %r58, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 58), align 1
+  store i8 %r59, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 59), align 1
+  store i8 %r60, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 60), align 1
+  store i8 %r61, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 61), align 1
+  store i8 %r62, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 62), align 1
+  store i8 %r63, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 63), align 1
+  ret void
+}
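The 64 scalar shifts above form one flat vectorizable tree over @a8, @b8 and @c8. Assuming the vectorizer settles on a <16 x i8> factor on this target (a sketch of the expected shape, not the checked IR), each group of 16 lanes should collapse to roughly:

  %va = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1
  %vb = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1
  %vr = shl <16 x i8> %va, %vb
  store <16 x i8> %vr, <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1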

Added: llvm/trunk/test/Transforms/SLPVectorizer/X86/sign-extend.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/sign-extend.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/sign-extend.ll (added)
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/sign-extend.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,62 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -slp-vectorizer < %s -S -o - -mtriple=x86_64-apple-macosx10.10.0 -mcpu=core2 | FileCheck %s
+
+define <4 x i32> @sign_extend_v_v(<4 x i16> %lhs) {
+; CHECK-LABEL: @sign_extend_v_v(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = sext <4 x i16> [[LHS:%.*]] to <4 x i32>
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i32> [[TMP0]], i32 0
+; CHECK-NEXT:    [[VECINIT:%.*]] = insertelement <4 x i32> undef, i32 [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32> [[TMP0]], i32 1
+; CHECK-NEXT:    [[VECINIT3:%.*]] = insertelement <4 x i32> [[VECINIT]], i32 [[TMP2]], i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x i32> [[TMP0]], i32 2
+; CHECK-NEXT:    [[VECINIT6:%.*]] = insertelement <4 x i32> [[VECINIT3]], i32 [[TMP3]], i32 2
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x i32> [[TMP0]], i32 3
+; CHECK-NEXT:    [[VECINIT9:%.*]] = insertelement <4 x i32> [[VECINIT6]], i32 [[TMP4]], i32 3
+; CHECK-NEXT:    ret <4 x i32> [[VECINIT9]]
+;
+entry:
+  %vecext = extractelement <4 x i16> %lhs, i32 0
+  %conv = sext i16 %vecext to i32
+  %vecinit = insertelement <4 x i32> undef, i32 %conv, i32 0
+  %vecext1 = extractelement <4 x i16> %lhs, i32 1
+  %conv2 = sext i16 %vecext1 to i32
+  %vecinit3 = insertelement <4 x i32> %vecinit, i32 %conv2, i32 1
+  %vecext4 = extractelement <4 x i16> %lhs, i32 2
+  %conv5 = sext i16 %vecext4 to i32
+  %vecinit6 = insertelement <4 x i32> %vecinit3, i32 %conv5, i32 2
+  %vecext7 = extractelement <4 x i16> %lhs, i32 3
+  %conv8 = sext i16 %vecext7 to i32
+  %vecinit9 = insertelement <4 x i32> %vecinit6, i32 %conv8, i32 3
+  ret <4 x i32> %vecinit9
+}
+
+define <4 x i16> @truncate_v_v(<4 x i32> %lhs) {
+; CHECK-LABEL: @truncate_v_v(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = trunc <4 x i32> [[LHS:%.*]] to <4 x i16>
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i16> [[TMP0]], i32 0
+; CHECK-NEXT:    [[VECINIT:%.*]] = insertelement <4 x i16> undef, i16 [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i16> [[TMP0]], i32 1
+; CHECK-NEXT:    [[VECINIT3:%.*]] = insertelement <4 x i16> [[VECINIT]], i16 [[TMP2]], i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x i16> [[TMP0]], i32 2
+; CHECK-NEXT:    [[VECINIT6:%.*]] = insertelement <4 x i16> [[VECINIT3]], i16 [[TMP3]], i32 2
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x i16> [[TMP0]], i32 3
+; CHECK-NEXT:    [[VECINIT9:%.*]] = insertelement <4 x i16> [[VECINIT6]], i16 [[TMP4]], i32 3
+; CHECK-NEXT:    ret <4 x i16> [[VECINIT9]]
+;
+entry:
+  %vecext = extractelement <4 x i32> %lhs, i32 0
+  %conv = trunc i32 %vecext to i16
+  %vecinit = insertelement <4 x i16> undef, i16 %conv, i32 0
+  %vecext1 = extractelement <4 x i32> %lhs, i32 1
+  %conv2 = trunc i32 %vecext1 to i16
+  %vecinit3 = insertelement <4 x i16> %vecinit, i16 %conv2, i32 1
+  %vecext4 = extractelement <4 x i32> %lhs, i32 2
+  %conv5 = trunc i32 %vecext4 to i16
+  %vecinit6 = insertelement <4 x i16> %vecinit3, i16 %conv5, i32 2
+  %vecext7 = extractelement <4 x i32> %lhs, i32 3
+  %conv8 = trunc i32 %vecext7 to i16
+  %vecinit9 = insertelement <4 x i16> %vecinit6, i16 %conv8, i32 3
+  ret <4 x i16> %vecinit9
+}
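Both functions check that a chain of scalar extractelement/convert/insertelement operations is rewritten into a single vector sext or trunc. The extract/insert sequence left behind in the CHECK lines is an identity shuffle; a follow-up instcombine run (not part of this test's RUN line) would be expected to fold it away, leaving just

  %0 = sext <4 x i16> %lhs to <4 x i32>
  ret <4 x i32> %0

for the first function, and the analogous trunc for the second.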

Added: llvm/trunk/test/Transforms/SLPVectorizer/X86/simple-loop.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/simple-loop.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/simple-loop.ll (added)
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/simple-loop.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,152 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+define i32 @rollable(i32* noalias nocapture %in, i32* noalias nocapture %out, i64 %n) {
+; CHECK-LABEL: @rollable(
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i64 [[N:%.*]], 0
+; CHECK-NEXT:    br i1 [[TMP1]], label [[DOT_CRIT_EDGE:%.*]], label [[DOTLR_PH:%.*]]
+; CHECK:       .lr.ph:
+; CHECK-NEXT:    [[I_019:%.*]] = phi i64 [ [[TMP10:%.*]], [[DOTLR_PH]] ], [ 0, [[TMP0:%.*]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = shl i64 [[I_019]], 2
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[IN:%.*]], i64 [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>*
+; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4
+; CHECK-NEXT:    [[TMP6:%.*]] = mul <4 x i32> [[TMP5]], <i32 7, i32 7, i32 7, i32 7>
+; CHECK-NEXT:    [[TMP7:%.*]] = add <4 x i32> [[TMP6]], <i32 7, i32 14, i32 21, i32 28>
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[OUT:%.*]], i64 [[TMP2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[TMP7]], <4 x i32>* [[TMP9]], align 4
+; CHECK-NEXT:    [[TMP10]] = add i64 [[I_019]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[TMP10]], [[N]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[DOT_CRIT_EDGE]], label [[DOTLR_PH]]
+; CHECK:       ._crit_edge:
+; CHECK-NEXT:    ret i32 undef
+;
+  %1 = icmp eq i64 %n, 0
+  br i1 %1, label %._crit_edge, label %.lr.ph
+
+.lr.ph:                                           ; preds = %0, %.lr.ph
+  %i.019 = phi i64 [ %26, %.lr.ph ], [ 0, %0 ]
+  %2 = shl i64 %i.019, 2
+  %3 = getelementptr inbounds i32, i32* %in, i64 %2
+  %4 = load i32, i32* %3, align 4
+  %5 = or i64 %2, 1
+  %6 = getelementptr inbounds i32, i32* %in, i64 %5
+  %7 = load i32, i32* %6, align 4
+  %8 = or i64 %2, 2
+  %9 = getelementptr inbounds i32, i32* %in, i64 %8
+  %10 = load i32, i32* %9, align 4
+  %11 = or i64 %2, 3
+  %12 = getelementptr inbounds i32, i32* %in, i64 %11
+  %13 = load i32, i32* %12, align 4
+  %14 = mul i32 %4, 7
+  %15 = add i32 %14, 7
+  %16 = mul i32 %7, 7
+  %17 = add i32 %16, 14
+  %18 = mul i32 %10, 7
+  %19 = add i32 %18, 21
+  %20 = mul i32 %13, 7
+  %21 = add i32 %20, 28
+  %22 = getelementptr inbounds i32, i32* %out, i64 %2
+  store i32 %15, i32* %22, align 4
+  %23 = getelementptr inbounds i32, i32* %out, i64 %5
+  store i32 %17, i32* %23, align 4
+  %24 = getelementptr inbounds i32, i32* %out, i64 %8
+  store i32 %19, i32* %24, align 4
+  %25 = getelementptr inbounds i32, i32* %out, i64 %11
+  store i32 %21, i32* %25, align 4
+  %26 = add i64 %i.019, 1
+  %exitcond = icmp eq i64 %26, %n
+  br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge:                                      ; preds = %.lr.ph, %0
+  ret i32 undef
+}
+
+define i32 @unrollable(i32* %in, i32* %out, i64 %n) nounwind ssp uwtable {
+; CHECK-LABEL: @unrollable(
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i64 [[N:%.*]], 0
+; CHECK-NEXT:    br i1 [[TMP1]], label [[DOT_CRIT_EDGE:%.*]], label [[DOTLR_PH:%.*]]
+; CHECK:       .lr.ph:
+; CHECK-NEXT:    [[I_019:%.*]] = phi i64 [ [[TMP26:%.*]], [[DOTLR_PH]] ], [ 0, [[TMP0:%.*]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = shl i64 [[I_019]], 2
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[IN:%.*]], i64 [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = or i64 [[TMP2]], 1
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[IN]], i64 [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4
+; CHECK-NEXT:    [[TMP8:%.*]] = or i64 [[TMP2]], 2
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[IN]], i64 [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = load i32, i32* [[TMP9]], align 4
+; CHECK-NEXT:    [[TMP11:%.*]] = or i64 [[TMP2]], 3
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[IN]], i64 [[TMP11]]
+; CHECK-NEXT:    [[TMP13:%.*]] = load i32, i32* [[TMP12]], align 4
+; CHECK-NEXT:    [[TMP14:%.*]] = mul i32 [[TMP4]], 7
+; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP14]], 7
+; CHECK-NEXT:    [[TMP16:%.*]] = mul i32 [[TMP7]], 7
+; CHECK-NEXT:    [[TMP17:%.*]] = add i32 [[TMP16]], 14
+; CHECK-NEXT:    [[TMP18:%.*]] = mul i32 [[TMP10]], 7
+; CHECK-NEXT:    [[TMP19:%.*]] = add i32 [[TMP18]], 21
+; CHECK-NEXT:    [[TMP20:%.*]] = mul i32 [[TMP13]], 7
+; CHECK-NEXT:    [[TMP21:%.*]] = add i32 [[TMP20]], 28
+; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i32, i32* [[OUT:%.*]], i64 [[TMP2]]
+; CHECK-NEXT:    store i32 [[TMP15]], i32* [[TMP22]], align 4
+; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 [[TMP5]]
+; CHECK-NEXT:    store i32 [[TMP17]], i32* [[TMP23]], align 4
+; CHECK-NEXT:    [[BARRIER:%.*]] = call i32 @goo(i32 0)
+; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 [[TMP8]]
+; CHECK-NEXT:    store i32 [[TMP19]], i32* [[TMP24]], align 4
+; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 [[TMP11]]
+; CHECK-NEXT:    store i32 [[TMP21]], i32* [[TMP25]], align 4
+; CHECK-NEXT:    [[TMP26]] = add i64 [[I_019]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[TMP26]], [[N]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[DOT_CRIT_EDGE]], label [[DOTLR_PH]]
+; CHECK:       ._crit_edge:
+; CHECK-NEXT:    ret i32 undef
+;
+  %1 = icmp eq i64 %n, 0
+  br i1 %1, label %._crit_edge, label %.lr.ph
+
+.lr.ph:                                           ; preds = %0, %.lr.ph
+  %i.019 = phi i64 [ %26, %.lr.ph ], [ 0, %0 ]
+  %2 = shl i64 %i.019, 2
+  %3 = getelementptr inbounds i32, i32* %in, i64 %2
+  %4 = load i32, i32* %3, align 4
+  %5 = or i64 %2, 1
+  %6 = getelementptr inbounds i32, i32* %in, i64 %5
+  %7 = load i32, i32* %6, align 4
+  %8 = or i64 %2, 2
+  %9 = getelementptr inbounds i32, i32* %in, i64 %8
+  %10 = load i32, i32* %9, align 4
+  %11 = or i64 %2, 3
+  %12 = getelementptr inbounds i32, i32* %in, i64 %11
+  %13 = load i32, i32* %12, align 4
+  %14 = mul i32 %4, 7
+  %15 = add i32 %14, 7
+  %16 = mul i32 %7, 7
+  %17 = add i32 %16, 14
+  %18 = mul i32 %10, 7
+  %19 = add i32 %18, 21
+  %20 = mul i32 %13, 7
+  %21 = add i32 %20, 28
+  %22 = getelementptr inbounds i32, i32* %out, i64 %2
+  store i32 %15, i32* %22, align 4
+  %23 = getelementptr inbounds i32, i32* %out, i64 %5
+  store i32 %17, i32* %23, align 4
+  %barrier = call i32 @goo(i32 0)                      ; <---------------- memory barrier.
+  %24 = getelementptr inbounds i32, i32* %out, i64 %8
+  store i32 %19, i32* %24, align 4
+  %25 = getelementptr inbounds i32, i32* %out, i64 %11
+  store i32 %21, i32* %25, align 4
+  %26 = add i64 %i.019, 1
+  %exitcond = icmp eq i64 %26, %n
+  br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge:                                      ; preds = %.lr.ph, %0
+  ret i32 undef
+}
+
+declare i32 @goo(i32)
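@rollable and @unrollable are identical except for the call to @goo between the second and third stores. Since the declaration carries no attributes, the vectorizer must assume the call may read or write memory, so the four stores cannot be merged into a single <4 x i32> store and the whole tree stays scalar. One would expect a variant that marks the callee as not accessing memory, e.g.

  declare i32 @goo(i32) readnone nounwind

to remove the barrier and vectorize like @rollable (a hypothetical change, not exercised by this test).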

Added: llvm/trunk/test/Transforms/SLPVectorizer/X86/simplebb.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/simplebb.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/simplebb.ll (added)
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/simplebb.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,121 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+; Simple 3-pair chain with loads and stores
+define void @test1(double* %a, double* %b, double* %c) {
+; CHECK-LABEL: @test1(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast double* [[A:%.*]] to <2 x double>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast double* [[B:%.*]] to <2 x double>*
+; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x double>, <2 x double>* [[TMP3]], align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = fmul <2 x double> [[TMP2]], [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast double* [[C:%.*]] to <2 x double>*
+; CHECK-NEXT:    store <2 x double> [[TMP5]], <2 x double>* [[TMP6]], align 8
+; CHECK-NEXT:    ret void
+;
+  %i0 = load double, double* %a, align 8
+  %i1 = load double, double* %b, align 8
+  %mul = fmul double %i0, %i1
+  %arrayidx3 = getelementptr inbounds double, double* %a, i64 1
+  %i3 = load double, double* %arrayidx3, align 8
+  %arrayidx4 = getelementptr inbounds double, double* %b, i64 1
+  %i4 = load double, double* %arrayidx4, align 8
+  %mul5 = fmul double %i3, %i4
+  store double %mul, double* %c, align 8
+  %arrayidx5 = getelementptr inbounds double, double* %c, i64 1
+  store double %mul5, double* %arrayidx5, align 8
+  ret void
+}
+
+; Simple 3-pair chain with loads and stores, obfuscated with bitcasts
+define void @test2(double* %a, double* %b, i8* %e) {
+; CHECK-LABEL: @test2(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast double* [[A:%.*]] to <2 x double>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast double* [[B:%.*]] to <2 x double>*
+; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x double>, <2 x double>* [[TMP3]], align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = fmul <2 x double> [[TMP2]], [[TMP4]]
+; CHECK-NEXT:    [[C:%.*]] = bitcast i8* [[E:%.*]] to double*
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast double* [[C]] to <2 x double>*
+; CHECK-NEXT:    store <2 x double> [[TMP5]], <2 x double>* [[TMP6]], align 8
+; CHECK-NEXT:    ret void
+;
+  %i0 = load double, double* %a, align 8
+  %i1 = load double, double* %b, align 8
+  %mul = fmul double %i0, %i1
+  %arrayidx3 = getelementptr inbounds double, double* %a, i64 1
+  %i3 = load double, double* %arrayidx3, align 8
+  %arrayidx4 = getelementptr inbounds double, double* %b, i64 1
+  %i4 = load double, double* %arrayidx4, align 8
+  %mul5 = fmul double %i3, %i4
+  %c = bitcast i8* %e to double*
+  store double %mul, double* %c, align 8
+  %carrayidx5 = getelementptr inbounds i8, i8* %e, i64 8
+  %arrayidx5 = bitcast i8* %carrayidx5 to double*
+  store double %mul5, double* %arrayidx5, align 8
+  ret void
+}
+
+; Don't vectorize volatile loads.
+define void @test_volatile_load(double* %a, double* %b, double* %c) {
+; CHECK-LABEL: @test_volatile_load(
+; CHECK-NEXT:    [[I0:%.*]] = load volatile double, double* [[A:%.*]], align 8
+; CHECK-NEXT:    [[I1:%.*]] = load volatile double, double* [[B:%.*]], align 8
+; CHECK-NEXT:    [[MUL:%.*]] = fmul double [[I0]], [[I1]]
+; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[A]], i64 1
+; CHECK-NEXT:    [[I3:%.*]] = load double, double* [[ARRAYIDX3]], align 8
+; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds double, double* [[B]], i64 1
+; CHECK-NEXT:    [[I4:%.*]] = load double, double* [[ARRAYIDX4]], align 8
+; CHECK-NEXT:    [[MUL5:%.*]] = fmul double [[I3]], [[I4]]
+; CHECK-NEXT:    store double [[MUL]], double* [[C:%.*]], align 8
+; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[C]], i64 1
+; CHECK-NEXT:    store double [[MUL5]], double* [[ARRAYIDX5]], align 8
+; CHECK-NEXT:    ret void
+;
+  %i0 = load volatile double, double* %a, align 8
+  %i1 = load volatile double, double* %b, align 8
+  %mul = fmul double %i0, %i1
+  %arrayidx3 = getelementptr inbounds double, double* %a, i64 1
+  %i3 = load double, double* %arrayidx3, align 8
+  %arrayidx4 = getelementptr inbounds double, double* %b, i64 1
+  %i4 = load double, double* %arrayidx4, align 8
+  %mul5 = fmul double %i3, %i4
+  store double %mul, double* %c, align 8
+  %arrayidx5 = getelementptr inbounds double, double* %c, i64 1
+  store double %mul5, double* %arrayidx5, align 8
+  ret void
+}
+
+; Don't vectorize volatile stores.
+define void @test_volatile_store(double* %a, double* %b, double* %c) {
+; CHECK-LABEL: @test_volatile_store(
+; CHECK-NEXT:    [[I0:%.*]] = load double, double* [[A:%.*]], align 8
+; CHECK-NEXT:    [[I1:%.*]] = load double, double* [[B:%.*]], align 8
+; CHECK-NEXT:    [[MUL:%.*]] = fmul double [[I0]], [[I1]]
+; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[A]], i64 1
+; CHECK-NEXT:    [[I3:%.*]] = load double, double* [[ARRAYIDX3]], align 8
+; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds double, double* [[B]], i64 1
+; CHECK-NEXT:    [[I4:%.*]] = load double, double* [[ARRAYIDX4]], align 8
+; CHECK-NEXT:    [[MUL5:%.*]] = fmul double [[I3]], [[I4]]
+; CHECK-NEXT:    store volatile double [[MUL]], double* [[C:%.*]], align 8
+; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[C]], i64 1
+; CHECK-NEXT:    store volatile double [[MUL5]], double* [[ARRAYIDX5]], align 8
+; CHECK-NEXT:    ret void
+;
+  %i0 = load double, double* %a, align 8
+  %i1 = load double, double* %b, align 8
+  %mul = fmul double %i0, %i1
+  %arrayidx3 = getelementptr inbounds double, double* %a, i64 1
+  %i3 = load double, double* %arrayidx3, align 8
+  %arrayidx4 = getelementptr inbounds double, double* %b, i64 1
+  %i4 = load double, double* %arrayidx4, align 8
+  %mul5 = fmul double %i3, %i4
+  store volatile double %mul, double* %c, align 8
+  %arrayidx5 = getelementptr inbounds double, double* %c, i64 1
+  store volatile double %mul5, double* %arrayidx5, align 8
+  ret void
+}
+
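The two volatile tests are legality rather than profitability checks: widening a volatile access into a vector operation would drop its volatile semantics, so for example

  %v = load <2 x double>, <2 x double>* %p, align 8

is not a valid replacement for two adjacent scalar loads when either of them is volatile (%p here stands for a hypothetical pointer to the first element). The vectorizer therefore has to leave both chains fully scalar, as the CHECK lines confirm.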

Added: llvm/trunk/test/Transforms/SLPVectorizer/X86/sitofp.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/sitofp.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/sitofp.ll (added)
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/sitofp.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,1262 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -mtriple=x86_64-unknown -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256 --check-prefix=AVX256NODQ
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=bdver1 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256 --check-prefix=AVX256NODQ
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256 --check-prefix=AVX256NODQ
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skylake-avx512 -mattr=-prefer-256-bit -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX512
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skylake-avx512 -mattr=+prefer-256-bit -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256 --check-prefix=AVX256DQ
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+ at src64 = common global [8 x i64] zeroinitializer, align 64
+ at src32 = common global [16 x i32] zeroinitializer, align 64
+ at src16 = common global [32 x i16] zeroinitializer, align 64
+ at src8  = common global [64 x i8] zeroinitializer, align 64
+
+ at dst64 = common global [8 x double] zeroinitializer, align 64
+ at dst32 = common global [16 x float] zeroinitializer, align 64
+
+;
+; SITOFP to vXf64
+;
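+; i64 -> f64 only has a native conversion with AVX512DQ (VCVTQQ2PD), so the
+; AVX512 and AVX256DQ runs vectorize the i64 tests below while the SSE and
+; AVX256NODQ runs keep them scalar.
+;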
+
+define void @sitofp_2i64_2f64() #0 {
+; SSE-LABEL: @sitofp_2i64_2f64(
+; SSE-NEXT:    [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
+; SSE-NEXT:    [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
+; SSE-NEXT:    [[CVT0:%.*]] = sitofp i64 [[LD0]] to double
+; SSE-NEXT:    [[CVT1:%.*]] = sitofp i64 [[LD1]] to double
+; SSE-NEXT:    store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
+; SSE-NEXT:    store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+; SSE-NEXT:    ret void
+;
+; AVX256NODQ-LABEL: @sitofp_2i64_2f64(
+; AVX256NODQ-NEXT:    [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
+; AVX256NODQ-NEXT:    [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
+; AVX256NODQ-NEXT:    [[CVT0:%.*]] = sitofp i64 [[LD0]] to double
+; AVX256NODQ-NEXT:    [[CVT1:%.*]] = sitofp i64 [[LD1]] to double
+; AVX256NODQ-NEXT:    store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
+; AVX256NODQ-NEXT:    store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+; AVX256NODQ-NEXT:    ret void
+;
+; AVX512-LABEL: @sitofp_2i64_2f64(
+; AVX512-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @src64 to <2 x i64>*), align 64
+; AVX512-NEXT:    [[TMP2:%.*]] = sitofp <2 x i64> [[TMP1]] to <2 x double>
+; AVX512-NEXT:    store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64
+; AVX512-NEXT:    ret void
+;
+; AVX256DQ-LABEL: @sitofp_2i64_2f64(
+; AVX256DQ-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @src64 to <2 x i64>*), align 64
+; AVX256DQ-NEXT:    [[TMP2:%.*]] = sitofp <2 x i64> [[TMP1]] to <2 x double>
+; AVX256DQ-NEXT:    store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64
+; AVX256DQ-NEXT:    ret void
+;
+  %ld0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
+  %ld1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
+  %cvt0 = sitofp i64 %ld0 to double
+  %cvt1 = sitofp i64 %ld1 to double
+  store double %cvt0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
+  store double %cvt1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+  ret void
+}
+
+define void @sitofp_4i64_4f64() #0 {
+; SSE-LABEL: @sitofp_4i64_4f64(
+; SSE-NEXT:    [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
+; SSE-NEXT:    [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
+; SSE-NEXT:    [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16
+; SSE-NEXT:    [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8
+; SSE-NEXT:    [[CVT0:%.*]] = sitofp i64 [[LD0]] to double
+; SSE-NEXT:    [[CVT1:%.*]] = sitofp i64 [[LD1]] to double
+; SSE-NEXT:    [[CVT2:%.*]] = sitofp i64 [[LD2]] to double
+; SSE-NEXT:    [[CVT3:%.*]] = sitofp i64 [[LD3]] to double
+; SSE-NEXT:    store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
+; SSE-NEXT:    store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+; SSE-NEXT:    store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16
+; SSE-NEXT:    store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
+; SSE-NEXT:    ret void
+;
+; AVX256NODQ-LABEL: @sitofp_4i64_4f64(
+; AVX256NODQ-NEXT:    [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
+; AVX256NODQ-NEXT:    [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
+; AVX256NODQ-NEXT:    [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16
+; AVX256NODQ-NEXT:    [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8
+; AVX256NODQ-NEXT:    [[CVT0:%.*]] = sitofp i64 [[LD0]] to double
+; AVX256NODQ-NEXT:    [[CVT1:%.*]] = sitofp i64 [[LD1]] to double
+; AVX256NODQ-NEXT:    [[CVT2:%.*]] = sitofp i64 [[LD2]] to double
+; AVX256NODQ-NEXT:    [[CVT3:%.*]] = sitofp i64 [[LD3]] to double
+; AVX256NODQ-NEXT:    store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
+; AVX256NODQ-NEXT:    store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+; AVX256NODQ-NEXT:    store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16
+; AVX256NODQ-NEXT:    store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
+; AVX256NODQ-NEXT:    ret void
+;
+; AVX512-LABEL: @sitofp_4i64_4f64(
+; AVX512-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64
+; AVX512-NEXT:    [[TMP2:%.*]] = sitofp <4 x i64> [[TMP1]] to <4 x double>
+; AVX512-NEXT:    store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64
+; AVX512-NEXT:    ret void
+;
+; AVX256DQ-LABEL: @sitofp_4i64_4f64(
+; AVX256DQ-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64
+; AVX256DQ-NEXT:    [[TMP2:%.*]] = sitofp <4 x i64> [[TMP1]] to <4 x double>
+; AVX256DQ-NEXT:    store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64
+; AVX256DQ-NEXT:    ret void
+;
+  %ld0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
+  %ld1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
+  %ld2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16
+  %ld3 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8
+  %cvt0 = sitofp i64 %ld0 to double
+  %cvt1 = sitofp i64 %ld1 to double
+  %cvt2 = sitofp i64 %ld2 to double
+  %cvt3 = sitofp i64 %ld3 to double
+  store double %cvt0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
+  store double %cvt1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+  store double %cvt2, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16
+  store double %cvt3, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
+  ret void
+}
+
+define void @sitofp_8i64_8f64() #0 {
+; SSE-LABEL: @sitofp_8i64_8f64(
+; SSE-NEXT:    [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
+; SSE-NEXT:    [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
+; SSE-NEXT:    [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16
+; SSE-NEXT:    [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8
+; SSE-NEXT:    [[LD4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4), align 32
+; SSE-NEXT:    [[LD5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 5), align 8
+; SSE-NEXT:    [[LD6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 6), align 16
+; SSE-NEXT:    [[LD7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 7), align 8
+; SSE-NEXT:    [[CVT0:%.*]] = sitofp i64 [[LD0]] to double
+; SSE-NEXT:    [[CVT1:%.*]] = sitofp i64 [[LD1]] to double
+; SSE-NEXT:    [[CVT2:%.*]] = sitofp i64 [[LD2]] to double
+; SSE-NEXT:    [[CVT3:%.*]] = sitofp i64 [[LD3]] to double
+; SSE-NEXT:    [[CVT4:%.*]] = sitofp i64 [[LD4]] to double
+; SSE-NEXT:    [[CVT5:%.*]] = sitofp i64 [[LD5]] to double
+; SSE-NEXT:    [[CVT6:%.*]] = sitofp i64 [[LD6]] to double
+; SSE-NEXT:    [[CVT7:%.*]] = sitofp i64 [[LD7]] to double
+; SSE-NEXT:    store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
+; SSE-NEXT:    store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+; SSE-NEXT:    store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16
+; SSE-NEXT:    store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
+; SSE-NEXT:    store double [[CVT4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 32
+; SSE-NEXT:    store double [[CVT5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8
+; SSE-NEXT:    store double [[CVT6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 16
+; SSE-NEXT:    store double [[CVT7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8
+; SSE-NEXT:    ret void
+;
+; AVX256NODQ-LABEL: @sitofp_8i64_8f64(
+; AVX256NODQ-NEXT:    [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
+; AVX256NODQ-NEXT:    [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
+; AVX256NODQ-NEXT:    [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16
+; AVX256NODQ-NEXT:    [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8
+; AVX256NODQ-NEXT:    [[LD4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4), align 32
+; AVX256NODQ-NEXT:    [[LD5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 5), align 8
+; AVX256NODQ-NEXT:    [[LD6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 6), align 16
+; AVX256NODQ-NEXT:    [[LD7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 7), align 8
+; AVX256NODQ-NEXT:    [[CVT0:%.*]] = sitofp i64 [[LD0]] to double
+; AVX256NODQ-NEXT:    [[CVT1:%.*]] = sitofp i64 [[LD1]] to double
+; AVX256NODQ-NEXT:    [[CVT2:%.*]] = sitofp i64 [[LD2]] to double
+; AVX256NODQ-NEXT:    [[CVT3:%.*]] = sitofp i64 [[LD3]] to double
+; AVX256NODQ-NEXT:    [[CVT4:%.*]] = sitofp i64 [[LD4]] to double
+; AVX256NODQ-NEXT:    [[CVT5:%.*]] = sitofp i64 [[LD5]] to double
+; AVX256NODQ-NEXT:    [[CVT6:%.*]] = sitofp i64 [[LD6]] to double
+; AVX256NODQ-NEXT:    [[CVT7:%.*]] = sitofp i64 [[LD7]] to double
+; AVX256NODQ-NEXT:    store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
+; AVX256NODQ-NEXT:    store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+; AVX256NODQ-NEXT:    store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16
+; AVX256NODQ-NEXT:    store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
+; AVX256NODQ-NEXT:    store double [[CVT4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 32
+; AVX256NODQ-NEXT:    store double [[CVT5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8
+; AVX256NODQ-NEXT:    store double [[CVT6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 16
+; AVX256NODQ-NEXT:    store double [[CVT7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8
+; AVX256NODQ-NEXT:    ret void
+;
+; AVX512-LABEL: @sitofp_8i64_8f64(
+; AVX512-NEXT:    [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @src64 to <8 x i64>*), align 64
+; AVX512-NEXT:    [[TMP2:%.*]] = sitofp <8 x i64> [[TMP1]] to <8 x double>
+; AVX512-NEXT:    store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 64
+; AVX512-NEXT:    ret void
+;
+; AVX256DQ-LABEL: @sitofp_8i64_8f64(
+; AVX256DQ-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64
+; AVX256DQ-NEXT:    [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4) to <4 x i64>*), align 32
+; AVX256DQ-NEXT:    [[TMP3:%.*]] = sitofp <4 x i64> [[TMP1]] to <4 x double>
+; AVX256DQ-NEXT:    [[TMP4:%.*]] = sitofp <4 x i64> [[TMP2]] to <4 x double>
+; AVX256DQ-NEXT:    store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64
+; AVX256DQ-NEXT:    store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 32
+; AVX256DQ-NEXT:    ret void
+;
+  %ld0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
+  %ld1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
+  %ld2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16
+  %ld3 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8
+  %ld4 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4), align 32
+  %ld5 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 5), align 8
+  %ld6 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 6), align 16
+  %ld7 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 7), align 8
+  %cvt0 = sitofp i64 %ld0 to double
+  %cvt1 = sitofp i64 %ld1 to double
+  %cvt2 = sitofp i64 %ld2 to double
+  %cvt3 = sitofp i64 %ld3 to double
+  %cvt4 = sitofp i64 %ld4 to double
+  %cvt5 = sitofp i64 %ld5 to double
+  %cvt6 = sitofp i64 %ld6 to double
+  %cvt7 = sitofp i64 %ld7 to double
+  store double %cvt0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
+  store double %cvt1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+  store double %cvt2, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16
+  store double %cvt3, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
+  store double %cvt4, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 32
+  store double %cvt5, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8
+  store double %cvt6, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 16
+  store double %cvt7, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8
+  ret void
+}
+
+define void @sitofp_2i32_2f64() #0 {
+; CHECK-LABEL: @sitofp_2i32_2f64(
+; CHECK-NEXT:    [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 0), align 64
+; CHECK-NEXT:    [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 1), align 4
+; CHECK-NEXT:    [[CVT0:%.*]] = sitofp i32 [[LD0]] to double
+; CHECK-NEXT:    [[CVT1:%.*]] = sitofp i32 [[LD1]] to double
+; CHECK-NEXT:    store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
+; CHECK-NEXT:    store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+; CHECK-NEXT:    ret void
+;
+  %ld0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 0), align 64
+  %ld1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 1), align 4
+  %cvt0 = sitofp i32 %ld0 to double
+  %cvt1 = sitofp i32 %ld1 to double
+  store double %cvt0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
+  store double %cvt1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+  ret void
+}
+
+define void @sitofp_4i32_4f64() #0 {
+; SSE-LABEL: @sitofp_4i32_4f64(
+; SSE-NEXT:    [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 0), align 64
+; SSE-NEXT:    [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 1), align 4
+; SSE-NEXT:    [[LD2:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 2), align 8
+; SSE-NEXT:    [[LD3:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 3), align 4
+; SSE-NEXT:    [[CVT0:%.*]] = sitofp i32 [[LD0]] to double
+; SSE-NEXT:    [[CVT1:%.*]] = sitofp i32 [[LD1]] to double
+; SSE-NEXT:    [[CVT2:%.*]] = sitofp i32 [[LD2]] to double
+; SSE-NEXT:    [[CVT3:%.*]] = sitofp i32 [[LD3]] to double
+; SSE-NEXT:    store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
+; SSE-NEXT:    store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+; SSE-NEXT:    store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16
+; SSE-NEXT:    store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
+; SSE-NEXT:    ret void
+;
+; AVX-LABEL: @sitofp_4i32_4f64(
+; AVX-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @src32 to <4 x i32>*), align 64
+; AVX-NEXT:    [[TMP2:%.*]] = sitofp <4 x i32> [[TMP1]] to <4 x double>
+; AVX-NEXT:    store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64
+; AVX-NEXT:    ret void
+;
+  %ld0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 0), align 64
+  %ld1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 1), align 4
+  %ld2 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 2), align 8
+  %ld3 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 3), align 4
+  %cvt0 = sitofp i32 %ld0 to double
+  %cvt1 = sitofp i32 %ld1 to double
+  %cvt2 = sitofp i32 %ld2 to double
+  %cvt3 = sitofp i32 %ld3 to double
+  store double %cvt0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
+  store double %cvt1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+  store double %cvt2, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16
+  store double %cvt3, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
+  ret void
+}
+
+define void @sitofp_8i32_8f64() #0 {
+; SSE-LABEL: @sitofp_8i32_8f64(
+; SSE-NEXT:    [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 0), align 64
+; SSE-NEXT:    [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 1), align 4
+; SSE-NEXT:    [[LD2:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 2), align 8
+; SSE-NEXT:    [[LD3:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 3), align 4
+; SSE-NEXT:    [[LD4:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4), align 16
+; SSE-NEXT:    [[LD5:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 5), align 4
+; SSE-NEXT:    [[LD6:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 6), align 8
+; SSE-NEXT:    [[LD7:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 7), align 4
+; SSE-NEXT:    [[CVT0:%.*]] = sitofp i32 [[LD0]] to double
+; SSE-NEXT:    [[CVT1:%.*]] = sitofp i32 [[LD1]] to double
+; SSE-NEXT:    [[CVT2:%.*]] = sitofp i32 [[LD2]] to double
+; SSE-NEXT:    [[CVT3:%.*]] = sitofp i32 [[LD3]] to double
+; SSE-NEXT:    [[CVT4:%.*]] = sitofp i32 [[LD4]] to double
+; SSE-NEXT:    [[CVT5:%.*]] = sitofp i32 [[LD5]] to double
+; SSE-NEXT:    [[CVT6:%.*]] = sitofp i32 [[LD6]] to double
+; SSE-NEXT:    [[CVT7:%.*]] = sitofp i32 [[LD7]] to double
+; SSE-NEXT:    store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
+; SSE-NEXT:    store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+; SSE-NEXT:    store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16
+; SSE-NEXT:    store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
+; SSE-NEXT:    store double [[CVT4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 32
+; SSE-NEXT:    store double [[CVT5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8
+; SSE-NEXT:    store double [[CVT6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 16
+; SSE-NEXT:    store double [[CVT7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8
+; SSE-NEXT:    ret void
+;
+; AVX256-LABEL: @sitofp_8i32_8f64(
+; AVX256-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @src32 to <4 x i32>*), align 64
+; AVX256-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 16
+; AVX256-NEXT:    [[TMP3:%.*]] = sitofp <4 x i32> [[TMP1]] to <4 x double>
+; AVX256-NEXT:    [[TMP4:%.*]] = sitofp <4 x i32> [[TMP2]] to <4 x double>
+; AVX256-NEXT:    store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64
+; AVX256-NEXT:    store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 32
+; AVX256-NEXT:    ret void
+;
+; AVX512-LABEL: @sitofp_8i32_8f64(
+; AVX512-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @src32 to <8 x i32>*), align 64
+; AVX512-NEXT:    [[TMP2:%.*]] = sitofp <8 x i32> [[TMP1]] to <8 x double>
+; AVX512-NEXT:    store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 64
+; AVX512-NEXT:    ret void
+;
+  %ld0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 0), align 64
+  %ld1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 1), align 4
+  %ld2 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 2), align 8
+  %ld3 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 3), align 4
+  %ld4 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4), align 16
+  %ld5 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 5), align 4
+  %ld6 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 6), align 8
+  %ld7 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 7), align 4
+  %cvt0 = sitofp i32 %ld0 to double
+  %cvt1 = sitofp i32 %ld1 to double
+  %cvt2 = sitofp i32 %ld2 to double
+  %cvt3 = sitofp i32 %ld3 to double
+  %cvt4 = sitofp i32 %ld4 to double
+  %cvt5 = sitofp i32 %ld5 to double
+  %cvt6 = sitofp i32 %ld6 to double
+  %cvt7 = sitofp i32 %ld7 to double
+  store double %cvt0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
+  store double %cvt1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+  store double %cvt2, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16
+  store double %cvt3, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
+  store double %cvt4, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 32
+  store double %cvt5, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8
+  store double %cvt6, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 16
+  store double %cvt7, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8
+  ret void
+}
+
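+; i16 (and, further below, i8) sources follow the same shape as i32: the
+; 2-element case stays scalar on every run, SSE keeps the 4- and 8-element
+; cases scalar too, AVX vectorizes 4 wide, and the 8-element case splits into
+; two <4 x ...> halves on AVX256 but converts in one go on AVX512.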
+define void @sitofp_2i16_2f64() #0 {
+; CHECK-LABEL: @sitofp_2i16_2f64(
+; CHECK-NEXT:    [[LD0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64
+; CHECK-NEXT:    [[LD1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2
+; CHECK-NEXT:    [[CVT0:%.*]] = sitofp i16 [[LD0]] to double
+; CHECK-NEXT:    [[CVT1:%.*]] = sitofp i16 [[LD1]] to double
+; CHECK-NEXT:    store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
+; CHECK-NEXT:    store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+; CHECK-NEXT:    ret void
+;
+  %ld0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64
+  %ld1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2
+  %cvt0 = sitofp i16 %ld0 to double
+  %cvt1 = sitofp i16 %ld1 to double
+  store double %cvt0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
+  store double %cvt1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+  ret void
+}
+
+define void @sitofp_4i16_4f64() #0 {
+; SSE-LABEL: @sitofp_4i16_4f64(
+; SSE-NEXT:    [[LD0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64
+; SSE-NEXT:    [[LD1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2
+; SSE-NEXT:    [[LD2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2), align 4
+; SSE-NEXT:    [[LD3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 3), align 2
+; SSE-NEXT:    [[CVT0:%.*]] = sitofp i16 [[LD0]] to double
+; SSE-NEXT:    [[CVT1:%.*]] = sitofp i16 [[LD1]] to double
+; SSE-NEXT:    [[CVT2:%.*]] = sitofp i16 [[LD2]] to double
+; SSE-NEXT:    [[CVT3:%.*]] = sitofp i16 [[LD3]] to double
+; SSE-NEXT:    store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
+; SSE-NEXT:    store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+; SSE-NEXT:    store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16
+; SSE-NEXT:    store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
+; SSE-NEXT:    ret void
+;
+; AVX-LABEL: @sitofp_4i16_4f64(
+; AVX-NEXT:    [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 64
+; AVX-NEXT:    [[TMP2:%.*]] = sitofp <4 x i16> [[TMP1]] to <4 x double>
+; AVX-NEXT:    store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64
+; AVX-NEXT:    ret void
+;
+  %ld0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64
+  %ld1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2
+  %ld2 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2), align 4
+  %ld3 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 3), align 2
+  %cvt0 = sitofp i16 %ld0 to double
+  %cvt1 = sitofp i16 %ld1 to double
+  %cvt2 = sitofp i16 %ld2 to double
+  %cvt3 = sitofp i16 %ld3 to double
+  store double %cvt0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
+  store double %cvt1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+  store double %cvt2, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16
+  store double %cvt3, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
+  ret void
+}
+
+define void @sitofp_8i16_8f64() #0 {
+; SSE-LABEL: @sitofp_8i16_8f64(
+; SSE-NEXT:    [[LD0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64
+; SSE-NEXT:    [[LD1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2
+; SSE-NEXT:    [[LD2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2), align 4
+; SSE-NEXT:    [[LD3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 3), align 2
+; SSE-NEXT:    [[LD4:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4), align 8
+; SSE-NEXT:    [[LD5:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 5), align 2
+; SSE-NEXT:    [[LD6:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 6), align 4
+; SSE-NEXT:    [[LD7:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 7), align 2
+; SSE-NEXT:    [[CVT0:%.*]] = sitofp i16 [[LD0]] to double
+; SSE-NEXT:    [[CVT1:%.*]] = sitofp i16 [[LD1]] to double
+; SSE-NEXT:    [[CVT2:%.*]] = sitofp i16 [[LD2]] to double
+; SSE-NEXT:    [[CVT3:%.*]] = sitofp i16 [[LD3]] to double
+; SSE-NEXT:    [[CVT4:%.*]] = sitofp i16 [[LD4]] to double
+; SSE-NEXT:    [[CVT5:%.*]] = sitofp i16 [[LD5]] to double
+; SSE-NEXT:    [[CVT6:%.*]] = sitofp i16 [[LD6]] to double
+; SSE-NEXT:    [[CVT7:%.*]] = sitofp i16 [[LD7]] to double
+; SSE-NEXT:    store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
+; SSE-NEXT:    store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+; SSE-NEXT:    store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16
+; SSE-NEXT:    store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
+; SSE-NEXT:    store double [[CVT4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 32
+; SSE-NEXT:    store double [[CVT5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8
+; SSE-NEXT:    store double [[CVT6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 16
+; SSE-NEXT:    store double [[CVT7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8
+; SSE-NEXT:    ret void
+;
+; AVX256-LABEL: @sitofp_8i16_8f64(
+; AVX256-NEXT:    [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 64
+; AVX256-NEXT:    [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <4 x i16>*), align 8
+; AVX256-NEXT:    [[TMP3:%.*]] = sitofp <4 x i16> [[TMP1]] to <4 x double>
+; AVX256-NEXT:    [[TMP4:%.*]] = sitofp <4 x i16> [[TMP2]] to <4 x double>
+; AVX256-NEXT:    store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64
+; AVX256-NEXT:    store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 32
+; AVX256-NEXT:    ret void
+;
+; AVX512-LABEL: @sitofp_8i16_8f64(
+; AVX512-NEXT:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @src16 to <8 x i16>*), align 64
+; AVX512-NEXT:    [[TMP2:%.*]] = sitofp <8 x i16> [[TMP1]] to <8 x double>
+; AVX512-NEXT:    store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 64
+; AVX512-NEXT:    ret void
+;
+  %ld0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64
+  %ld1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2
+  %ld2 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2), align 4
+  %ld3 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 3), align 2
+  %ld4 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4), align 8
+  %ld5 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 5), align 2
+  %ld6 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 6), align 4
+  %ld7 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 7), align 2
+  %cvt0 = sitofp i16 %ld0 to double
+  %cvt1 = sitofp i16 %ld1 to double
+  %cvt2 = sitofp i16 %ld2 to double
+  %cvt3 = sitofp i16 %ld3 to double
+  %cvt4 = sitofp i16 %ld4 to double
+  %cvt5 = sitofp i16 %ld5 to double
+  %cvt6 = sitofp i16 %ld6 to double
+  %cvt7 = sitofp i16 %ld7 to double
+  store double %cvt0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
+  store double %cvt1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+  store double %cvt2, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16
+  store double %cvt3, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
+  store double %cvt4, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 32
+  store double %cvt5, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8
+  store double %cvt6, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 16
+  store double %cvt7, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8
+  ret void
+}
+
+define void @sitofp_2i8_2f64() #0 {
+; CHECK-LABEL: @sitofp_2i8_2f64(
+; CHECK-NEXT:    [[LD0:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 0), align 64
+; CHECK-NEXT:    [[LD1:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 1), align 1
+; CHECK-NEXT:    [[CVT0:%.*]] = sitofp i8 [[LD0]] to double
+; CHECK-NEXT:    [[CVT1:%.*]] = sitofp i8 [[LD1]] to double
+; CHECK-NEXT:    store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
+; CHECK-NEXT:    store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+; CHECK-NEXT:    ret void
+;
+  %ld0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 0), align 64
+  %ld1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 1), align 1
+  %cvt0 = sitofp i8 %ld0 to double
+  %cvt1 = sitofp i8 %ld1 to double
+  store double %cvt0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
+  store double %cvt1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+  ret void
+}
+
+define void @sitofp_4i8_4f64() #0 {
+; SSE-LABEL: @sitofp_4i8_4f64(
+; SSE-NEXT:    [[LD0:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 0), align 64
+; SSE-NEXT:    [[LD1:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 1), align 1
+; SSE-NEXT:    [[LD2:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 2), align 2
+; SSE-NEXT:    [[LD3:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 3), align 1
+; SSE-NEXT:    [[CVT0:%.*]] = sitofp i8 [[LD0]] to double
+; SSE-NEXT:    [[CVT1:%.*]] = sitofp i8 [[LD1]] to double
+; SSE-NEXT:    [[CVT2:%.*]] = sitofp i8 [[LD2]] to double
+; SSE-NEXT:    [[CVT3:%.*]] = sitofp i8 [[LD3]] to double
+; SSE-NEXT:    store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
+; SSE-NEXT:    store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+; SSE-NEXT:    store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16
+; SSE-NEXT:    store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
+; SSE-NEXT:    ret void
+;
+; AVX-LABEL: @sitofp_4i8_4f64(
+; AVX-NEXT:    [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* bitcast ([64 x i8]* @src8 to <4 x i8>*), align 64
+; AVX-NEXT:    [[TMP2:%.*]] = sitofp <4 x i8> [[TMP1]] to <4 x double>
+; AVX-NEXT:    store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64
+; AVX-NEXT:    ret void
+;
+  %ld0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 0), align 64
+  %ld1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 1), align 1
+  %ld2 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 2), align 2
+  %ld3 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 3), align 1
+  %cvt0 = sitofp i8 %ld0 to double
+  %cvt1 = sitofp i8 %ld1 to double
+  %cvt2 = sitofp i8 %ld2 to double
+  %cvt3 = sitofp i8 %ld3 to double
+  store double %cvt0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
+  store double %cvt1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+  store double %cvt2, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16
+  store double %cvt3, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
+  ret void
+}
+
+define void @sitofp_8i8_8f64() #0 {
+; SSE-LABEL: @sitofp_8i8_8f64(
+; SSE-NEXT:    [[LD0:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 0), align 64
+; SSE-NEXT:    [[LD1:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 1), align 1
+; SSE-NEXT:    [[LD2:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 2), align 2
+; SSE-NEXT:    [[LD3:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 3), align 1
+; SSE-NEXT:    [[LD4:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4), align 4
+; SSE-NEXT:    [[LD5:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 5), align 1
+; SSE-NEXT:    [[LD6:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 6), align 2
+; SSE-NEXT:    [[LD7:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 7), align 1
+; SSE-NEXT:    [[CVT0:%.*]] = sitofp i8 [[LD0]] to double
+; SSE-NEXT:    [[CVT1:%.*]] = sitofp i8 [[LD1]] to double
+; SSE-NEXT:    [[CVT2:%.*]] = sitofp i8 [[LD2]] to double
+; SSE-NEXT:    [[CVT3:%.*]] = sitofp i8 [[LD3]] to double
+; SSE-NEXT:    [[CVT4:%.*]] = sitofp i8 [[LD4]] to double
+; SSE-NEXT:    [[CVT5:%.*]] = sitofp i8 [[LD5]] to double
+; SSE-NEXT:    [[CVT6:%.*]] = sitofp i8 [[LD6]] to double
+; SSE-NEXT:    [[CVT7:%.*]] = sitofp i8 [[LD7]] to double
+; SSE-NEXT:    store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
+; SSE-NEXT:    store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+; SSE-NEXT:    store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16
+; SSE-NEXT:    store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
+; SSE-NEXT:    store double [[CVT4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 32
+; SSE-NEXT:    store double [[CVT5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8
+; SSE-NEXT:    store double [[CVT6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 16
+; SSE-NEXT:    store double [[CVT7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8
+; SSE-NEXT:    ret void
+;
+; AVX256-LABEL: @sitofp_8i8_8f64(
+; AVX256-NEXT:    [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* bitcast ([64 x i8]* @src8 to <4 x i8>*), align 64
+; AVX256-NEXT:    [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4) to <4 x i8>*), align 4
+; AVX256-NEXT:    [[TMP3:%.*]] = sitofp <4 x i8> [[TMP1]] to <4 x double>
+; AVX256-NEXT:    [[TMP4:%.*]] = sitofp <4 x i8> [[TMP2]] to <4 x double>
+; AVX256-NEXT:    store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64
+; AVX256-NEXT:    store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 32
+; AVX256-NEXT:    ret void
+;
+; AVX512-LABEL: @sitofp_8i8_8f64(
+; AVX512-NEXT:    [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* bitcast ([64 x i8]* @src8 to <8 x i8>*), align 64
+; AVX512-NEXT:    [[TMP2:%.*]] = sitofp <8 x i8> [[TMP1]] to <8 x double>
+; AVX512-NEXT:    store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 64
+; AVX512-NEXT:    ret void
+;
+  %ld0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 0), align 64
+  %ld1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 1), align 1
+  %ld2 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 2), align 2
+  %ld3 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 3), align 1
+  %ld4 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4), align 4
+  %ld5 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 5), align 1
+  %ld6 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 6), align 2
+  %ld7 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 7), align 1
+  %cvt0 = sitofp i8 %ld0 to double
+  %cvt1 = sitofp i8 %ld1 to double
+  %cvt2 = sitofp i8 %ld2 to double
+  %cvt3 = sitofp i8 %ld3 to double
+  %cvt4 = sitofp i8 %ld4 to double
+  %cvt5 = sitofp i8 %ld5 to double
+  %cvt6 = sitofp i8 %ld6 to double
+  %cvt7 = sitofp i8 %ld7 to double
+  store double %cvt0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
+  store double %cvt1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+  store double %cvt2, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16
+  store double %cvt3, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
+  store double %cvt4, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 32
+  store double %cvt5, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8
+  store double %cvt6, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 16
+  store double %cvt7, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8
+  ret void
+}
+
+;
+; SITOFP to vXf32
+;
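+; The tests below mirror the vXf64 group with float destinations (@dst32).
+; Note the i64 cases: they only vectorize on the DQ-capable runs (AVX256DQ,
+; AVX512), presumably because those subtargets provide direct packed
+; i64->f32 conversions, while i32/i16 sources vectorize on every run.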
+
+define void @sitofp_2i64_2f32() #0 {
+; CHECK-LABEL: @sitofp_2i64_2f32(
+; CHECK-NEXT:    [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
+; CHECK-NEXT:    [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
+; CHECK-NEXT:    [[CVT0:%.*]] = sitofp i64 [[LD0]] to float
+; CHECK-NEXT:    [[CVT1:%.*]] = sitofp i64 [[LD1]] to float
+; CHECK-NEXT:    store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
+; CHECK-NEXT:    store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+; CHECK-NEXT:    ret void
+;
+  %ld0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
+  %ld1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
+  %cvt0 = sitofp i64 %ld0 to float
+  %cvt1 = sitofp i64 %ld1 to float
+  store float %cvt0, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
+  store float %cvt1, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+  ret void
+}
+
+define void @sitofp_4i64_4f32() #0 {
+; SSE-LABEL: @sitofp_4i64_4f32(
+; SSE-NEXT:    [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
+; SSE-NEXT:    [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
+; SSE-NEXT:    [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16
+; SSE-NEXT:    [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8
+; SSE-NEXT:    [[CVT0:%.*]] = sitofp i64 [[LD0]] to float
+; SSE-NEXT:    [[CVT1:%.*]] = sitofp i64 [[LD1]] to float
+; SSE-NEXT:    [[CVT2:%.*]] = sitofp i64 [[LD2]] to float
+; SSE-NEXT:    [[CVT3:%.*]] = sitofp i64 [[LD3]] to float
+; SSE-NEXT:    store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
+; SSE-NEXT:    store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+; SSE-NEXT:    store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8
+; SSE-NEXT:    store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
+; SSE-NEXT:    ret void
+;
+; AVX256NODQ-LABEL: @sitofp_4i64_4f32(
+; AVX256NODQ-NEXT:    [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
+; AVX256NODQ-NEXT:    [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
+; AVX256NODQ-NEXT:    [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16
+; AVX256NODQ-NEXT:    [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8
+; AVX256NODQ-NEXT:    [[CVT0:%.*]] = sitofp i64 [[LD0]] to float
+; AVX256NODQ-NEXT:    [[CVT1:%.*]] = sitofp i64 [[LD1]] to float
+; AVX256NODQ-NEXT:    [[CVT2:%.*]] = sitofp i64 [[LD2]] to float
+; AVX256NODQ-NEXT:    [[CVT3:%.*]] = sitofp i64 [[LD3]] to float
+; AVX256NODQ-NEXT:    store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
+; AVX256NODQ-NEXT:    store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+; AVX256NODQ-NEXT:    store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8
+; AVX256NODQ-NEXT:    store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
+; AVX256NODQ-NEXT:    ret void
+;
+; AVX512-LABEL: @sitofp_4i64_4f32(
+; AVX512-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64
+; AVX512-NEXT:    [[TMP2:%.*]] = sitofp <4 x i64> [[TMP1]] to <4 x float>
+; AVX512-NEXT:    store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
+; AVX512-NEXT:    ret void
+;
+; AVX256DQ-LABEL: @sitofp_4i64_4f32(
+; AVX256DQ-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64
+; AVX256DQ-NEXT:    [[TMP2:%.*]] = sitofp <4 x i64> [[TMP1]] to <4 x float>
+; AVX256DQ-NEXT:    store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
+; AVX256DQ-NEXT:    ret void
+;
+  %ld0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
+  %ld1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
+  %ld2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16
+  %ld3 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8
+  %cvt0 = sitofp i64 %ld0 to float
+  %cvt1 = sitofp i64 %ld1 to float
+  %cvt2 = sitofp i64 %ld2 to float
+  %cvt3 = sitofp i64 %ld3 to float
+  store float %cvt0, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
+  store float %cvt1, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+  store float %cvt2, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8
+  store float %cvt3, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
+  ret void
+}
+
+define void @sitofp_8i64_8f32() #0 {
+; SSE-LABEL: @sitofp_8i64_8f32(
+; SSE-NEXT:    [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
+; SSE-NEXT:    [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
+; SSE-NEXT:    [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16
+; SSE-NEXT:    [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8
+; SSE-NEXT:    [[LD4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4), align 32
+; SSE-NEXT:    [[LD5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 5), align 8
+; SSE-NEXT:    [[LD6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 6), align 16
+; SSE-NEXT:    [[LD7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 7), align 8
+; SSE-NEXT:    [[CVT0:%.*]] = sitofp i64 [[LD0]] to float
+; SSE-NEXT:    [[CVT1:%.*]] = sitofp i64 [[LD1]] to float
+; SSE-NEXT:    [[CVT2:%.*]] = sitofp i64 [[LD2]] to float
+; SSE-NEXT:    [[CVT3:%.*]] = sitofp i64 [[LD3]] to float
+; SSE-NEXT:    [[CVT4:%.*]] = sitofp i64 [[LD4]] to float
+; SSE-NEXT:    [[CVT5:%.*]] = sitofp i64 [[LD5]] to float
+; SSE-NEXT:    [[CVT6:%.*]] = sitofp i64 [[LD6]] to float
+; SSE-NEXT:    [[CVT7:%.*]] = sitofp i64 [[LD7]] to float
+; SSE-NEXT:    store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
+; SSE-NEXT:    store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+; SSE-NEXT:    store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8
+; SSE-NEXT:    store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
+; SSE-NEXT:    store float [[CVT4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 16
+; SSE-NEXT:    store float [[CVT5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4
+; SSE-NEXT:    store float [[CVT6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 8
+; SSE-NEXT:    store float [[CVT7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
+; SSE-NEXT:    ret void
+;
+; AVX256NODQ-LABEL: @sitofp_8i64_8f32(
+; AVX256NODQ-NEXT:    [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
+; AVX256NODQ-NEXT:    [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
+; AVX256NODQ-NEXT:    [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16
+; AVX256NODQ-NEXT:    [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8
+; AVX256NODQ-NEXT:    [[LD4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4), align 32
+; AVX256NODQ-NEXT:    [[LD5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 5), align 8
+; AVX256NODQ-NEXT:    [[LD6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 6), align 16
+; AVX256NODQ-NEXT:    [[LD7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 7), align 8
+; AVX256NODQ-NEXT:    [[CVT0:%.*]] = sitofp i64 [[LD0]] to float
+; AVX256NODQ-NEXT:    [[CVT1:%.*]] = sitofp i64 [[LD1]] to float
+; AVX256NODQ-NEXT:    [[CVT2:%.*]] = sitofp i64 [[LD2]] to float
+; AVX256NODQ-NEXT:    [[CVT3:%.*]] = sitofp i64 [[LD3]] to float
+; AVX256NODQ-NEXT:    [[CVT4:%.*]] = sitofp i64 [[LD4]] to float
+; AVX256NODQ-NEXT:    [[CVT5:%.*]] = sitofp i64 [[LD5]] to float
+; AVX256NODQ-NEXT:    [[CVT6:%.*]] = sitofp i64 [[LD6]] to float
+; AVX256NODQ-NEXT:    [[CVT7:%.*]] = sitofp i64 [[LD7]] to float
+; AVX256NODQ-NEXT:    store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
+; AVX256NODQ-NEXT:    store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+; AVX256NODQ-NEXT:    store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8
+; AVX256NODQ-NEXT:    store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
+; AVX256NODQ-NEXT:    store float [[CVT4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 16
+; AVX256NODQ-NEXT:    store float [[CVT5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4
+; AVX256NODQ-NEXT:    store float [[CVT6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 8
+; AVX256NODQ-NEXT:    store float [[CVT7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
+; AVX256NODQ-NEXT:    ret void
+;
+; AVX512-LABEL: @sitofp_8i64_8f32(
+; AVX512-NEXT:    [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @src64 to <8 x i64>*), align 64
+; AVX512-NEXT:    [[TMP2:%.*]] = sitofp <8 x i64> [[TMP1]] to <8 x float>
+; AVX512-NEXT:    store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64
+; AVX512-NEXT:    ret void
+;
+; AVX256DQ-LABEL: @sitofp_8i64_8f32(
+; AVX256DQ-NEXT:    [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @src64 to <8 x i64>*), align 64
+; AVX256DQ-NEXT:    [[TMP2:%.*]] = sitofp <8 x i64> [[TMP1]] to <8 x float>
+; AVX256DQ-NEXT:    store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64
+; AVX256DQ-NEXT:    ret void
+;
+  %ld0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
+  %ld1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
+  %ld2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16
+  %ld3 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8
+  %ld4 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4), align 32
+  %ld5 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 5), align 8
+  %ld6 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 6), align 16
+  %ld7 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 7), align 8
+  %cvt0 = sitofp i64 %ld0 to float
+  %cvt1 = sitofp i64 %ld1 to float
+  %cvt2 = sitofp i64 %ld2 to float
+  %cvt3 = sitofp i64 %ld3 to float
+  %cvt4 = sitofp i64 %ld4 to float
+  %cvt5 = sitofp i64 %ld5 to float
+  %cvt6 = sitofp i64 %ld6 to float
+  %cvt7 = sitofp i64 %ld7 to float
+  store float %cvt0, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
+  store float %cvt1, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+  store float %cvt2, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8
+  store float %cvt3, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
+  store float %cvt4, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 16
+  store float %cvt5, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4
+  store float %cvt6, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 8
+  store float %cvt7, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
+  ret void
+}
+
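+; i32->f32 is directly legal as a vector conversion even on SSE (cvtdq2ps),
+; so all runs vectorize here and the checks differ only in vector width.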
+define void @sitofp_4i32_4f32() #0 {
+; CHECK-LABEL: @sitofp_4i32_4f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @src32 to <4 x i32>*), align 64
+; CHECK-NEXT:    [[TMP2:%.*]] = sitofp <4 x i32> [[TMP1]] to <4 x float>
+; CHECK-NEXT:    store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
+; CHECK-NEXT:    ret void
+;
+  %ld0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 0), align 64
+  %ld1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 1), align 4
+  %ld2 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 2), align 8
+  %ld3 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 3), align 4
+  %cvt0 = sitofp i32 %ld0 to float
+  %cvt1 = sitofp i32 %ld1 to float
+  %cvt2 = sitofp i32 %ld2 to float
+  %cvt3 = sitofp i32 %ld3 to float
+  store float %cvt0, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
+  store float %cvt1, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+  store float %cvt2, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8
+  store float %cvt3, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
+  ret void
+}
+
+define void @sitofp_8i32_8f32() #0 {
+; SSE-LABEL: @sitofp_8i32_8f32(
+; SSE-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @src32 to <4 x i32>*), align 64
+; SSE-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 16
+; SSE-NEXT:    [[TMP3:%.*]] = sitofp <4 x i32> [[TMP1]] to <4 x float>
+; SSE-NEXT:    [[TMP4:%.*]] = sitofp <4 x i32> [[TMP2]] to <4 x float>
+; SSE-NEXT:    store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
+; SSE-NEXT:    store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16
+; SSE-NEXT:    ret void
+;
+; AVX-LABEL: @sitofp_8i32_8f32(
+; AVX-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @src32 to <8 x i32>*), align 64
+; AVX-NEXT:    [[TMP2:%.*]] = sitofp <8 x i32> [[TMP1]] to <8 x float>
+; AVX-NEXT:    store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64
+; AVX-NEXT:    ret void
+;
+  %ld0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 0), align 64
+  %ld1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 1), align 4
+  %ld2 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 2), align 8
+  %ld3 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 3), align 4
+  %ld4 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4), align 16
+  %ld5 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 5), align 4
+  %ld6 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 6), align 8
+  %ld7 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 7), align 4
+  %cvt0 = sitofp i32 %ld0 to float
+  %cvt1 = sitofp i32 %ld1 to float
+  %cvt2 = sitofp i32 %ld2 to float
+  %cvt3 = sitofp i32 %ld3 to float
+  %cvt4 = sitofp i32 %ld4 to float
+  %cvt5 = sitofp i32 %ld5 to float
+  %cvt6 = sitofp i32 %ld6 to float
+  %cvt7 = sitofp i32 %ld7 to float
+  store float %cvt0, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
+  store float %cvt1, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+  store float %cvt2, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8
+  store float %cvt3, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
+  store float %cvt4, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 16
+  store float %cvt5, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4
+  store float %cvt6, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 8
+  store float %cvt7, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
+  ret void
+}
+
+define void @sitofp_16i32_16f32() #0 {
+; SSE-LABEL: @sitofp_16i32_16f32(
+; SSE-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @src32 to <4 x i32>*), align 64
+; SSE-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 16
+; SSE-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 8) to <4 x i32>*), align 32
+; SSE-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 12) to <4 x i32>*), align 16
+; SSE-NEXT:    [[TMP5:%.*]] = sitofp <4 x i32> [[TMP1]] to <4 x float>
+; SSE-NEXT:    [[TMP6:%.*]] = sitofp <4 x i32> [[TMP2]] to <4 x float>
+; SSE-NEXT:    [[TMP7:%.*]] = sitofp <4 x i32> [[TMP3]] to <4 x float>
+; SSE-NEXT:    [[TMP8:%.*]] = sitofp <4 x i32> [[TMP4]] to <4 x float>
+; SSE-NEXT:    store <4 x float> [[TMP5]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
+; SSE-NEXT:    store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16
+; SSE-NEXT:    store <4 x float> [[TMP7]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 32
+; SSE-NEXT:    store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 16
+; SSE-NEXT:    ret void
+;
+; AVX256-LABEL: @sitofp_16i32_16f32(
+; AVX256-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @src32 to <8 x i32>*), align 64
+; AVX256-NEXT:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 8) to <8 x i32>*), align 32
+; AVX256-NEXT:    [[TMP3:%.*]] = sitofp <8 x i32> [[TMP1]] to <8 x float>
+; AVX256-NEXT:    [[TMP4:%.*]] = sitofp <8 x i32> [[TMP2]] to <8 x float>
+; AVX256-NEXT:    store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64
+; AVX256-NEXT:    store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 32
+; AVX256-NEXT:    ret void
+;
+; AVX512-LABEL: @sitofp_16i32_16f32(
+; AVX512-NEXT:    [[TMP1:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([16 x i32]* @src32 to <16 x i32>*), align 64
+; AVX512-NEXT:    [[TMP2:%.*]] = sitofp <16 x i32> [[TMP1]] to <16 x float>
+; AVX512-NEXT:    store <16 x float> [[TMP2]], <16 x float>* bitcast ([16 x float]* @dst32 to <16 x float>*), align 64
+; AVX512-NEXT:    ret void
+;
+  %ld0  = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 0 ), align 64
+  %ld1  = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 1 ), align 4
+  %ld2  = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 2 ), align 8
+  %ld3  = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 3 ), align 4
+  %ld4  = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4 ), align 16
+  %ld5  = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 5 ), align 4
+  %ld6  = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 6 ), align 8
+  %ld7  = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 7 ), align 4
+  %ld8  = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 8 ), align 32
+  %ld9  = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 9 ), align 4
+  %ld10 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 10), align 8
+  %ld11 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 11), align 4
+  %ld12 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 12), align 16
+  %ld13 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 13), align 4
+  %ld14 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 14), align 8
+  %ld15 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 15), align 4
+  %cvt0  = sitofp i32 %ld0  to float
+  %cvt1  = sitofp i32 %ld1  to float
+  %cvt2  = sitofp i32 %ld2  to float
+  %cvt3  = sitofp i32 %ld3  to float
+  %cvt4  = sitofp i32 %ld4  to float
+  %cvt5  = sitofp i32 %ld5  to float
+  %cvt6  = sitofp i32 %ld6  to float
+  %cvt7  = sitofp i32 %ld7  to float
+  %cvt8  = sitofp i32 %ld8  to float
+  %cvt9  = sitofp i32 %ld9  to float
+  %cvt10 = sitofp i32 %ld10 to float
+  %cvt11 = sitofp i32 %ld11 to float
+  %cvt12 = sitofp i32 %ld12 to float
+  %cvt13 = sitofp i32 %ld13 to float
+  %cvt14 = sitofp i32 %ld14 to float
+  %cvt15 = sitofp i32 %ld15 to float
+  store float %cvt0 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0 ), align 64
+  store float %cvt1 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1 ), align 4
+  store float %cvt2 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2 ), align 8
+  store float %cvt3 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3 ), align 4
+  store float %cvt4 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4 ), align 16
+  store float %cvt5 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5 ), align 4
+  store float %cvt6 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6 ), align 8
+  store float %cvt7 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7 ), align 4
+  store float %cvt8 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8 ), align 32
+  store float %cvt9 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 9 ), align 4
+  store float %cvt10, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 10), align 8
+  store float %cvt11, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 11), align 4
+  store float %cvt12, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12), align 16
+  store float %cvt13, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 13), align 4
+  store float %cvt14, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 14), align 8
+  store float %cvt15, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 15), align 4
+  ret void
+}
+
+define void @sitofp_4i16_4f32() #0 {
+; CHECK-LABEL: @sitofp_4i16_4f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 64
+; CHECK-NEXT:    [[TMP2:%.*]] = sitofp <4 x i16> [[TMP1]] to <4 x float>
+; CHECK-NEXT:    store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
+; CHECK-NEXT:    ret void
+;
+  %ld0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64
+  %ld1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2
+  %ld2 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2), align 4
+  %ld3 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 3), align 2
+  %cvt0 = sitofp i16 %ld0 to float
+  %cvt1 = sitofp i16 %ld1 to float
+  %cvt2 = sitofp i16 %ld2 to float
+  %cvt3 = sitofp i16 %ld3 to float
+  store float %cvt0, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
+  store float %cvt1, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+  store float %cvt2, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8
+  store float %cvt3, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
+  ret void
+}
+
+define void @sitofp_8i16_8f32() #0 {
+; SSE-LABEL: @sitofp_8i16_8f32(
+; SSE-NEXT:    [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 64
+; SSE-NEXT:    [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <4 x i16>*), align 8
+; SSE-NEXT:    [[TMP3:%.*]] = sitofp <4 x i16> [[TMP1]] to <4 x float>
+; SSE-NEXT:    [[TMP4:%.*]] = sitofp <4 x i16> [[TMP2]] to <4 x float>
+; SSE-NEXT:    store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
+; SSE-NEXT:    store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16
+; SSE-NEXT:    ret void
+;
+; AVX-LABEL: @sitofp_8i16_8f32(
+; AVX-NEXT:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @src16 to <8 x i16>*), align 64
+; AVX-NEXT:    [[TMP2:%.*]] = sitofp <8 x i16> [[TMP1]] to <8 x float>
+; AVX-NEXT:    store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64
+; AVX-NEXT:    ret void
+;
+  %ld0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64
+  %ld1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2
+  %ld2 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2), align 4
+  %ld3 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 3), align 2
+  %ld4 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4), align 8
+  %ld5 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 5), align 2
+  %ld6 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 6), align 4
+  %ld7 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 7), align 2
+  %cvt0 = sitofp i16 %ld0 to float
+  %cvt1 = sitofp i16 %ld1 to float
+  %cvt2 = sitofp i16 %ld2 to float
+  %cvt3 = sitofp i16 %ld3 to float
+  %cvt4 = sitofp i16 %ld4 to float
+  %cvt5 = sitofp i16 %ld5 to float
+  %cvt6 = sitofp i16 %ld6 to float
+  %cvt7 = sitofp i16 %ld7 to float
+  store float %cvt0, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
+  store float %cvt1, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+  store float %cvt2, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8
+  store float %cvt3, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
+  store float %cvt4, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 16
+  store float %cvt5, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4
+  store float %cvt6, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 8
+  store float %cvt7, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
+  ret void
+}
+
+define void @sitofp_16i16_16f32() #0 {
+; SSE-LABEL: @sitofp_16i16_16f32(
+; SSE-NEXT:    [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 64
+; SSE-NEXT:    [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <4 x i16>*), align 8
+; SSE-NEXT:    [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 8) to <4 x i16>*), align 16
+; SSE-NEXT:    [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 12) to <4 x i16>*), align 8
+; SSE-NEXT:    [[TMP5:%.*]] = sitofp <4 x i16> [[TMP1]] to <4 x float>
+; SSE-NEXT:    [[TMP6:%.*]] = sitofp <4 x i16> [[TMP2]] to <4 x float>
+; SSE-NEXT:    [[TMP7:%.*]] = sitofp <4 x i16> [[TMP3]] to <4 x float>
+; SSE-NEXT:    [[TMP8:%.*]] = sitofp <4 x i16> [[TMP4]] to <4 x float>
+; SSE-NEXT:    store <4 x float> [[TMP5]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
+; SSE-NEXT:    store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16
+; SSE-NEXT:    store <4 x float> [[TMP7]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 32
+; SSE-NEXT:    store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 16
+; SSE-NEXT:    ret void
+;
+; AVX256-LABEL: @sitofp_16i16_16f32(
+; AVX256-NEXT:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @src16 to <8 x i16>*), align 64
+; AVX256-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 8) to <8 x i16>*), align 16
+; AVX256-NEXT:    [[TMP3:%.*]] = sitofp <8 x i16> [[TMP1]] to <8 x float>
+; AVX256-NEXT:    [[TMP4:%.*]] = sitofp <8 x i16> [[TMP2]] to <8 x float>
+; AVX256-NEXT:    store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64
+; AVX256-NEXT:    store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 32
+; AVX256-NEXT:    ret void
+;
+; AVX512-LABEL: @sitofp_16i16_16f32(
+; AVX512-NEXT:    [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @src16 to <16 x i16>*), align 64
+; AVX512-NEXT:    [[TMP2:%.*]] = sitofp <16 x i16> [[TMP1]] to <16 x float>
+; AVX512-NEXT:    store <16 x float> [[TMP2]], <16 x float>* bitcast ([16 x float]* @dst32 to <16 x float>*), align 64
+; AVX512-NEXT:    ret void
+;
+  %ld0  = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0 ), align 64
+  %ld1  = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1 ), align 2
+  %ld2  = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2 ), align 4
+  %ld3  = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 3 ), align 2
+  %ld4  = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4 ), align 8
+  %ld5  = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 5 ), align 2
+  %ld6  = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 6 ), align 4
+  %ld7  = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 7 ), align 2
+  %ld8  = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 8 ), align 16
+  %ld9  = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 9 ), align 2
+  %ld10 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 10), align 4
+  %ld11 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 11), align 2
+  %ld12 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 12), align 8
+  %ld13 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 13), align 2
+  %ld14 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 14), align 4
+  %ld15 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 15), align 2
+  %cvt0  = sitofp i16 %ld0  to float
+  %cvt1  = sitofp i16 %ld1  to float
+  %cvt2  = sitofp i16 %ld2  to float
+  %cvt3  = sitofp i16 %ld3  to float
+  %cvt4  = sitofp i16 %ld4  to float
+  %cvt5  = sitofp i16 %ld5  to float
+  %cvt6  = sitofp i16 %ld6  to float
+  %cvt7  = sitofp i16 %ld7  to float
+  %cvt8  = sitofp i16 %ld8  to float
+  %cvt9  = sitofp i16 %ld9  to float
+  %cvt10 = sitofp i16 %ld10 to float
+  %cvt11 = sitofp i16 %ld11 to float
+  %cvt12 = sitofp i16 %ld12 to float
+  %cvt13 = sitofp i16 %ld13 to float
+  %cvt14 = sitofp i16 %ld14 to float
+  %cvt15 = sitofp i16 %ld15 to float
+  store float %cvt0 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0 ), align 64
+  store float %cvt1 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1 ), align 4
+  store float %cvt2 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2 ), align 8
+  store float %cvt3 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3 ), align 4
+  store float %cvt4 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4 ), align 16
+  store float %cvt5 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5 ), align 4
+  store float %cvt6 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6 ), align 8
+  store float %cvt7 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7 ), align 4
+  store float %cvt8 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8 ), align 32
+  store float %cvt9 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 9 ), align 4
+  store float %cvt10, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 10), align 8
+  store float %cvt11, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 11), align 4
+  store float %cvt12, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12), align 16
+  store float %cvt13, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 13), align 4
+  store float %cvt14, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 14), align 8
+  store float %cvt15, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 15), align 4
+  ret void
+}
+
+define void @sitofp_4i8_4f32() #0 {
+; CHECK-LABEL: @sitofp_4i8_4f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* bitcast ([64 x i8]* @src8 to <4 x i8>*), align 64
+; CHECK-NEXT:    [[TMP2:%.*]] = sitofp <4 x i8> [[TMP1]] to <4 x float>
+; CHECK-NEXT:    store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
+; CHECK-NEXT:    ret void
+;
+  %ld0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 0), align 64
+  %ld1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 1), align 1
+  %ld2 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 2), align 2
+  %ld3 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 3), align 1
+  %cvt0 = sitofp i8 %ld0 to float
+  %cvt1 = sitofp i8 %ld1 to float
+  %cvt2 = sitofp i8 %ld2 to float
+  %cvt3 = sitofp i8 %ld3 to float
+  store float %cvt0, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
+  store float %cvt1, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+  store float %cvt2, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8
+  store float %cvt3, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
+  ret void
+}
+
+define void @sitofp_8i8_8f32() #0 {
+; SSE-LABEL: @sitofp_8i8_8f32(
+; SSE-NEXT:    [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* bitcast ([64 x i8]* @src8 to <4 x i8>*), align 64
+; SSE-NEXT:    [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4) to <4 x i8>*), align 4
+; SSE-NEXT:    [[TMP3:%.*]] = sitofp <4 x i8> [[TMP1]] to <4 x float>
+; SSE-NEXT:    [[TMP4:%.*]] = sitofp <4 x i8> [[TMP2]] to <4 x float>
+; SSE-NEXT:    store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
+; SSE-NEXT:    store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16
+; SSE-NEXT:    ret void
+;
+; AVX-LABEL: @sitofp_8i8_8f32(
+; AVX-NEXT:    [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* bitcast ([64 x i8]* @src8 to <8 x i8>*), align 64
+; AVX-NEXT:    [[TMP2:%.*]] = sitofp <8 x i8> [[TMP1]] to <8 x float>
+; AVX-NEXT:    store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64
+; AVX-NEXT:    ret void
+;
+  %ld0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 0), align 64
+  %ld1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 1), align 1
+  %ld2 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 2), align 2
+  %ld3 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 3), align 1
+  %ld4 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4), align 4
+  %ld5 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 5), align 1
+  %ld6 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 6), align 2
+  %ld7 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 7), align 1
+  %cvt0 = sitofp i8 %ld0 to float
+  %cvt1 = sitofp i8 %ld1 to float
+  %cvt2 = sitofp i8 %ld2 to float
+  %cvt3 = sitofp i8 %ld3 to float
+  %cvt4 = sitofp i8 %ld4 to float
+  %cvt5 = sitofp i8 %ld5 to float
+  %cvt6 = sitofp i8 %ld6 to float
+  %cvt7 = sitofp i8 %ld7 to float
+  store float %cvt0, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
+  store float %cvt1, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+  store float %cvt2, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8
+  store float %cvt3, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
+  store float %cvt4, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 16
+  store float %cvt5, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4
+  store float %cvt6, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 8
+  store float %cvt7, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
+  ret void
+}
+
+define void @sitofp_16i8_16f32() #0 {
+; SSE-LABEL: @sitofp_16i8_16f32(
+; SSE-NEXT:    [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* bitcast ([64 x i8]* @src8 to <4 x i8>*), align 64
+; SSE-NEXT:    [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4) to <4 x i8>*), align 4
+; SSE-NEXT:    [[TMP3:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 8) to <4 x i8>*), align 8
+; SSE-NEXT:    [[TMP4:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 12) to <4 x i8>*), align 4
+; SSE-NEXT:    [[TMP5:%.*]] = sitofp <4 x i8> [[TMP1]] to <4 x float>
+; SSE-NEXT:    [[TMP6:%.*]] = sitofp <4 x i8> [[TMP2]] to <4 x float>
+; SSE-NEXT:    [[TMP7:%.*]] = sitofp <4 x i8> [[TMP3]] to <4 x float>
+; SSE-NEXT:    [[TMP8:%.*]] = sitofp <4 x i8> [[TMP4]] to <4 x float>
+; SSE-NEXT:    store <4 x float> [[TMP5]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
+; SSE-NEXT:    store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16
+; SSE-NEXT:    store <4 x float> [[TMP7]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 32
+; SSE-NEXT:    store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 16
+; SSE-NEXT:    ret void
+;
+; AVX256-LABEL: @sitofp_16i8_16f32(
+; AVX256-NEXT:    [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* bitcast ([64 x i8]* @src8 to <8 x i8>*), align 64
+; AVX256-NEXT:    [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 8) to <8 x i8>*), align 8
+; AVX256-NEXT:    [[TMP3:%.*]] = sitofp <8 x i8> [[TMP1]] to <8 x float>
+; AVX256-NEXT:    [[TMP4:%.*]] = sitofp <8 x i8> [[TMP2]] to <8 x float>
+; AVX256-NEXT:    store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64
+; AVX256-NEXT:    store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 32
+; AVX256-NEXT:    ret void
+;
+; AVX512-LABEL: @sitofp_16i8_16f32(
+; AVX512-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @src8 to <16 x i8>*), align 64
+; AVX512-NEXT:    [[TMP2:%.*]] = sitofp <16 x i8> [[TMP1]] to <16 x float>
+; AVX512-NEXT:    store <16 x float> [[TMP2]], <16 x float>* bitcast ([16 x float]* @dst32 to <16 x float>*), align 64
+; AVX512-NEXT:    ret void
+;
+  %ld0  = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 0 ), align 64
+  %ld1  = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 1 ), align 1
+  %ld2  = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 2 ), align 2
+  %ld3  = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 3 ), align 1
+  %ld4  = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4 ), align 4
+  %ld5  = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 5 ), align 1
+  %ld6  = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 6 ), align 2
+  %ld7  = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 7 ), align 1
+  %ld8  = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 8 ), align 8
+  %ld9  = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 9 ), align 1
+  %ld10 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 10), align 2
+  %ld11 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 11), align 1
+  %ld12 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 12), align 4
+  %ld13 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 13), align 1
+  %ld14 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 14), align 2
+  %ld15 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 15), align 1
+  %cvt0  = sitofp i8 %ld0  to float
+  %cvt1  = sitofp i8 %ld1  to float
+  %cvt2  = sitofp i8 %ld2  to float
+  %cvt3  = sitofp i8 %ld3  to float
+  %cvt4  = sitofp i8 %ld4  to float
+  %cvt5  = sitofp i8 %ld5  to float
+  %cvt6  = sitofp i8 %ld6  to float
+  %cvt7  = sitofp i8 %ld7  to float
+  %cvt8  = sitofp i8 %ld8  to float
+  %cvt9  = sitofp i8 %ld9  to float
+  %cvt10 = sitofp i8 %ld10 to float
+  %cvt11 = sitofp i8 %ld11 to float
+  %cvt12 = sitofp i8 %ld12 to float
+  %cvt13 = sitofp i8 %ld13 to float
+  %cvt14 = sitofp i8 %ld14 to float
+  %cvt15 = sitofp i8 %ld15 to float
+  store float %cvt0 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0 ), align 64
+  store float %cvt1 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1 ), align 4
+  store float %cvt2 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2 ), align 8
+  store float %cvt3 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3 ), align 4
+  store float %cvt4 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4 ), align 16
+  store float %cvt5 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5 ), align 4
+  store float %cvt6 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6 ), align 8
+  store float %cvt7 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7 ), align 4
+  store float %cvt8 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8 ), align 32
+  store float %cvt9 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 9 ), align 4
+  store float %cvt10, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 10), align 8
+  store float %cvt11, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 11), align 4
+  store float %cvt12, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12), align 16
+  store float %cvt13, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 13), align 4
+  store float %cvt14, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 14), align 8
+  store float %cvt15, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 15), align 4
+  ret void
+}
+
+;
+; SITOFP BUILDVECTOR
+;
+
+define <4 x double> @sitofp_4xi32_4f64(i32 %a0, i32 %a1, i32 %a2, i32 %a3) #0 {
+; CHECK-LABEL: @sitofp_4xi32_4f64(
+; CHECK-NEXT:    [[CVT0:%.*]] = sitofp i32 [[A0:%.*]] to double
+; CHECK-NEXT:    [[CVT1:%.*]] = sitofp i32 [[A1:%.*]] to double
+; CHECK-NEXT:    [[CVT2:%.*]] = sitofp i32 [[A2:%.*]] to double
+; CHECK-NEXT:    [[CVT3:%.*]] = sitofp i32 [[A3:%.*]] to double
+; CHECK-NEXT:    [[RES0:%.*]] = insertelement <4 x double> undef, double [[CVT0]], i32 0
+; CHECK-NEXT:    [[RES1:%.*]] = insertelement <4 x double> [[RES0]], double [[CVT1]], i32 1
+; CHECK-NEXT:    [[RES2:%.*]] = insertelement <4 x double> [[RES1]], double [[CVT2]], i32 2
+; CHECK-NEXT:    [[RES3:%.*]] = insertelement <4 x double> [[RES2]], double [[CVT3]], i32 3
+; CHECK-NEXT:    ret <4 x double> [[RES3]]
+;
+  %cvt0 = sitofp i32 %a0 to double
+  %cvt1 = sitofp i32 %a1 to double
+  %cvt2 = sitofp i32 %a2 to double
+  %cvt3 = sitofp i32 %a3 to double
+  %res0 = insertelement <4 x double> undef, double %cvt0, i32 0
+  %res1 = insertelement <4 x double> %res0, double %cvt1, i32 1
+  %res2 = insertelement <4 x double> %res1, double %cvt2, i32 2
+  %res3 = insertelement <4 x double> %res2, double %cvt3, i32 3
+  ret <4 x double> %res3
+}
+
+define <4 x float> @sitofp_4xi32_4f32(i32 %a0, i32 %a1, i32 %a2, i32 %a3) #0 {
+; CHECK-LABEL: @sitofp_4xi32_4f32(
+; CHECK-NEXT:    [[CVT0:%.*]] = sitofp i32 [[A0:%.*]] to float
+; CHECK-NEXT:    [[CVT1:%.*]] = sitofp i32 [[A1:%.*]] to float
+; CHECK-NEXT:    [[CVT2:%.*]] = sitofp i32 [[A2:%.*]] to float
+; CHECK-NEXT:    [[CVT3:%.*]] = sitofp i32 [[A3:%.*]] to float
+; CHECK-NEXT:    [[RES0:%.*]] = insertelement <4 x float> undef, float [[CVT0]], i32 0
+; CHECK-NEXT:    [[RES1:%.*]] = insertelement <4 x float> [[RES0]], float [[CVT1]], i32 1
+; CHECK-NEXT:    [[RES2:%.*]] = insertelement <4 x float> [[RES1]], float [[CVT2]], i32 2
+; CHECK-NEXT:    [[RES3:%.*]] = insertelement <4 x float> [[RES2]], float [[CVT3]], i32 3
+; CHECK-NEXT:    ret <4 x float> [[RES3]]
+;
+  %cvt0 = sitofp i32 %a0 to float
+  %cvt1 = sitofp i32 %a1 to float
+  %cvt2 = sitofp i32 %a2 to float
+  %cvt3 = sitofp i32 %a3 to float
+  %res0 = insertelement <4 x float> undef, float %cvt0, i32 0
+  %res1 = insertelement <4 x float> %res0, float %cvt1, i32 1
+  %res2 = insertelement <4 x float> %res1, float %cvt2, i32 2
+  %res3 = insertelement <4 x float> %res2, float %cvt3, i32 3
+  ret <4 x float> %res3
+}
+
+attributes #0 = { nounwind }

Added: llvm/trunk/test/Transforms/SLPVectorizer/X86/slp-throttle.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/slp-throttle.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/slp-throttle.ll (added)
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/slp-throttle.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,37 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 < %s | FileCheck %s
+
+define dso_local void @rftbsub(double* %a) local_unnamed_addr #0 {
+; CHECK-LABEL: @rftbsub(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 2
+; CHECK-NEXT:    [[TMP0:%.*]] = load double, double* [[ARRAYIDX6]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = or i64 2, 1
+; CHECK-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = load double, double* [[ARRAYIDX12]], align 8
+; CHECK-NEXT:    [[ADD16:%.*]] = fadd double [[TMP2]], undef
+; CHECK-NEXT:    [[MUL18:%.*]] = fmul double undef, [[ADD16]]
+; CHECK-NEXT:    [[ADD19:%.*]] = fadd double undef, [[MUL18]]
+; CHECK-NEXT:    [[SUB22:%.*]] = fsub double undef, undef
+; CHECK-NEXT:    [[SUB25:%.*]] = fsub double [[TMP0]], [[ADD19]]
+; CHECK-NEXT:    store double [[SUB25]], double* [[ARRAYIDX6]], align 8
+; CHECK-NEXT:    [[SUB29:%.*]] = fsub double [[TMP2]], [[SUB22]]
+; CHECK-NEXT:    store double [[SUB29]], double* [[ARRAYIDX12]], align 8
+; CHECK-NEXT:    unreachable
+;
+entry:
+  %arrayidx6 = getelementptr inbounds double, double* %a, i64 2
+  %0 = load double, double* %arrayidx6, align 8
+  %1 = or i64 2, 1
+  %arrayidx12 = getelementptr inbounds double, double* %a, i64 %1
+  %2 = load double, double* %arrayidx12, align 8
+  %add16 = fadd double %2, undef
+  %mul18 = fmul double undef, %add16
+  %add19 = fadd double undef, %mul18
+  %sub22 = fsub double undef, undef
+  %sub25 = fsub double %0, %add19
+  store double %sub25, double* %arrayidx6, align 8
+  %sub29 = fsub double %2, %sub22
+  store double %sub29, double* %arrayidx12, align 8
+  unreachable
+}

Added: llvm/trunk/test/Transforms/SLPVectorizer/X86/sqrt.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/sqrt.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/sqrt.ll (added)
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/sqrt.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,274 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -mtriple=x86_64-unknown -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=bdver1 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skylake-avx512 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX512
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+@src64 = common global [8 x double] zeroinitializer, align 64
+@src32 = common global [16 x float] zeroinitializer, align 64
+@dst64 = common global [8 x double] zeroinitializer, align 64
+@dst32 = common global [16 x float] zeroinitializer, align 64
+
+declare float @llvm.sqrt.f32(float)
+declare double @llvm.sqrt.f64(double)
+
+;
+; SQRT
+;
+
+define void @sqrt_2f64() #0 {
+; CHECK-LABEL: @sqrt_2f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = call <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP1]])
+; CHECK-NEXT:    store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8
+; CHECK-NEXT:    ret void
+;
+  %a0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
+  %a1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
+  %sqrt0 = call double @llvm.sqrt.f64(double %a0)
+  %sqrt1 = call double @llvm.sqrt.f64(double %a1)
+  store double %sqrt0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8
+  store double %sqrt1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+  ret void
+}
+
+define void @sqrt_4f64() #0 {
+; SSE-LABEL: @sqrt_4f64(
+; SSE-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8
+; SSE-NEXT:    [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8
+; SSE-NEXT:    [[TMP3:%.*]] = call <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP1]])
+; SSE-NEXT:    [[TMP4:%.*]] = call <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP2]])
+; SSE-NEXT:    store <2 x double> [[TMP3]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8
+; SSE-NEXT:    store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8
+; SSE-NEXT:    ret void
+;
+; AVX-LABEL: @sqrt_4f64(
+; AVX-NEXT:    [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8
+; AVX-NEXT:    [[TMP2:%.*]] = call <4 x double> @llvm.sqrt.v4f64(<4 x double> [[TMP1]])
+; AVX-NEXT:    store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8
+; AVX-NEXT:    ret void
+;
+  %a0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
+  %a1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
+  %a2 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8
+  %a3 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8
+  %sqrt0 = call double @llvm.sqrt.f64(double %a0)
+  %sqrt1 = call double @llvm.sqrt.f64(double %a1)
+  %sqrt2 = call double @llvm.sqrt.f64(double %a2)
+  %sqrt3 = call double @llvm.sqrt.f64(double %a3)
+  store double %sqrt0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8
+  store double %sqrt1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+  store double %sqrt2, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8
+  store double %sqrt3, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
+  ret void
+}
+
+define void @sqrt_8f64() #0 {
+; SSE-LABEL: @sqrt_8f64(
+; SSE-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 4
+; SSE-NEXT:    [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 4
+; SSE-NEXT:    [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <2 x double>*), align 4
+; SSE-NEXT:    [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6) to <2 x double>*), align 4
+; SSE-NEXT:    [[TMP5:%.*]] = call <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP1]])
+; SSE-NEXT:    [[TMP6:%.*]] = call <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP2]])
+; SSE-NEXT:    [[TMP7:%.*]] = call <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP3]])
+; SSE-NEXT:    [[TMP8:%.*]] = call <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP4]])
+; SSE-NEXT:    store <2 x double> [[TMP5]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 4
+; SSE-NEXT:    store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 4
+; SSE-NEXT:    store <2 x double> [[TMP7]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 4
+; SSE-NEXT:    store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 4
+; SSE-NEXT:    ret void
+;
+; AVX256-LABEL: @sqrt_8f64(
+; AVX256-NEXT:    [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 4
+; AVX256-NEXT:    [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 4
+; AVX256-NEXT:    [[TMP3:%.*]] = call <4 x double> @llvm.sqrt.v4f64(<4 x double> [[TMP1]])
+; AVX256-NEXT:    [[TMP4:%.*]] = call <4 x double> @llvm.sqrt.v4f64(<4 x double> [[TMP2]])
+; AVX256-NEXT:    store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 4
+; AVX256-NEXT:    store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 4
+; AVX256-NEXT:    ret void
+;
+; AVX512-LABEL: @sqrt_8f64(
+; AVX512-NEXT:    [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @src64 to <8 x double>*), align 4
+; AVX512-NEXT:    [[TMP2:%.*]] = call <8 x double> @llvm.sqrt.v8f64(<8 x double> [[TMP1]])
+; AVX512-NEXT:    store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 4
+; AVX512-NEXT:    ret void
+;
+  %a0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 4
+  %a1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 4
+  %a2 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 4
+  %a3 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 4
+  %a4 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 4
+  %a5 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 4
+  %a6 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 4
+  %a7 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 4
+  %sqrt0 = call double @llvm.sqrt.f64(double %a0)
+  %sqrt1 = call double @llvm.sqrt.f64(double %a1)
+  %sqrt2 = call double @llvm.sqrt.f64(double %a2)
+  %sqrt3 = call double @llvm.sqrt.f64(double %a3)
+  %sqrt4 = call double @llvm.sqrt.f64(double %a4)
+  %sqrt5 = call double @llvm.sqrt.f64(double %a5)
+  %sqrt6 = call double @llvm.sqrt.f64(double %a6)
+  %sqrt7 = call double @llvm.sqrt.f64(double %a7)
+  store double %sqrt0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 4
+  store double %sqrt1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 4
+  store double %sqrt2, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 4
+  store double %sqrt3, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 4
+  store double %sqrt4, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 4
+  store double %sqrt5, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 4
+  store double %sqrt6, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 4
+  store double %sqrt7, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 4
+  ret void
+}
+
+define void @sqrt_4f32() #0 {
+; CHECK-LABEL: @sqrt_4f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x float> @llvm.sqrt.v4f32(<4 x float> [[TMP1]])
+; CHECK-NEXT:    store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4
+; CHECK-NEXT:    ret void
+;
+  %a0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
+  %a1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
+  %a2 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4
+  %a3 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4
+  %sqrt0 = call float @llvm.sqrt.f32(float %a0)
+  %sqrt1 = call float @llvm.sqrt.f32(float %a1)
+  %sqrt2 = call float @llvm.sqrt.f32(float %a2)
+  %sqrt3 = call float @llvm.sqrt.f32(float %a3)
+  store float %sqrt0, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4
+  store float %sqrt1, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+  store float %sqrt2, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4
+  store float %sqrt3, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
+  ret void
+}
+
+define void @sqrt_8f32() #0 {
+; SSE-LABEL: @sqrt_8f32(
+; SSE-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4
+; SSE-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4
+; SSE-NEXT:    [[TMP3:%.*]] = call <4 x float> @llvm.sqrt.v4f32(<4 x float> [[TMP1]])
+; SSE-NEXT:    [[TMP4:%.*]] = call <4 x float> @llvm.sqrt.v4f32(<4 x float> [[TMP2]])
+; SSE-NEXT:    store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4
+; SSE-NEXT:    store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4
+; SSE-NEXT:    ret void
+;
+; AVX-LABEL: @sqrt_8f32(
+; AVX-NEXT:    [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4
+; AVX-NEXT:    [[TMP2:%.*]] = call <8 x float> @llvm.sqrt.v8f32(<8 x float> [[TMP1]])
+; AVX-NEXT:    store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4
+; AVX-NEXT:    ret void
+;
+  %a0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
+  %a1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
+  %a2 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4
+  %a3 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4
+  %a4 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4
+  %a5 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4
+  %a6 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4
+  %a7 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4
+  %sqrt0 = call float @llvm.sqrt.f32(float %a0)
+  %sqrt1 = call float @llvm.sqrt.f32(float %a1)
+  %sqrt2 = call float @llvm.sqrt.f32(float %a2)
+  %sqrt3 = call float @llvm.sqrt.f32(float %a3)
+  %sqrt4 = call float @llvm.sqrt.f32(float %a4)
+  %sqrt5 = call float @llvm.sqrt.f32(float %a5)
+  %sqrt6 = call float @llvm.sqrt.f32(float %a6)
+  %sqrt7 = call float @llvm.sqrt.f32(float %a7)
+  store float %sqrt0, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4
+  store float %sqrt1, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+  store float %sqrt2, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4
+  store float %sqrt3, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
+  store float %sqrt4, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4
+  store float %sqrt5, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4
+  store float %sqrt6, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4
+  store float %sqrt7, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
+  ret void
+}
+
+define void @sqrt_16f32() #0 {
+; SSE-LABEL: @sqrt_16f32(
+; SSE-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4
+; SSE-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4
+; SSE-NEXT:    [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <4 x float>*), align 4
+; SSE-NEXT:    [[TMP4:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12) to <4 x float>*), align 4
+; SSE-NEXT:    [[TMP5:%.*]] = call <4 x float> @llvm.sqrt.v4f32(<4 x float> [[TMP1]])
+; SSE-NEXT:    [[TMP6:%.*]] = call <4 x float> @llvm.sqrt.v4f32(<4 x float> [[TMP2]])
+; SSE-NEXT:    [[TMP7:%.*]] = call <4 x float> @llvm.sqrt.v4f32(<4 x float> [[TMP3]])
+; SSE-NEXT:    [[TMP8:%.*]] = call <4 x float> @llvm.sqrt.v4f32(<4 x float> [[TMP4]])
+; SSE-NEXT:    store <4 x float> [[TMP5]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4
+; SSE-NEXT:    store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4
+; SSE-NEXT:    store <4 x float> [[TMP7]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 4
+; SSE-NEXT:    store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 4
+; SSE-NEXT:    ret void
+;
+; AVX256-LABEL: @sqrt_16f32(
+; AVX256-NEXT:    [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4
+; AVX256-NEXT:    [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <8 x float>*), align 4
+; AVX256-NEXT:    [[TMP3:%.*]] = call <8 x float> @llvm.sqrt.v8f32(<8 x float> [[TMP1]])
+; AVX256-NEXT:    [[TMP4:%.*]] = call <8 x float> @llvm.sqrt.v8f32(<8 x float> [[TMP2]])
+; AVX256-NEXT:    store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4
+; AVX256-NEXT:    store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 4
+; AVX256-NEXT:    ret void
+;
+; AVX512-LABEL: @sqrt_16f32(
+; AVX512-NEXT:    [[TMP1:%.*]] = load <16 x float>, <16 x float>* bitcast ([16 x float]* @src32 to <16 x float>*), align 4
+; AVX512-NEXT:    [[TMP2:%.*]] = call <16 x float> @llvm.sqrt.v16f32(<16 x float> [[TMP1]])
+; AVX512-NEXT:    store <16 x float> [[TMP2]], <16 x float>* bitcast ([16 x float]* @dst32 to <16 x float>*), align 4
+; AVX512-NEXT:    ret void
+;
+  %a0  = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64  0), align 4
+  %a1  = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64  1), align 4
+  %a2  = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64  2), align 4
+  %a3  = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64  3), align 4
+  %a4  = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64  4), align 4
+  %a5  = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64  5), align 4
+  %a6  = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64  6), align 4
+  %a7  = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64  7), align 4
+  %a8  = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64  8), align 4
+  %a9  = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64  9), align 4
+  %a10 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 10), align 4
+  %a11 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 11), align 4
+  %a12 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12), align 4
+  %a13 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 13), align 4
+  %a14 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 14), align 4
+  %a15 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 15), align 4
+  %sqrt0  = call float @llvm.sqrt.f32(float %a0 )
+  %sqrt1  = call float @llvm.sqrt.f32(float %a1 )
+  %sqrt2  = call float @llvm.sqrt.f32(float %a2 )
+  %sqrt3  = call float @llvm.sqrt.f32(float %a3 )
+  %sqrt4  = call float @llvm.sqrt.f32(float %a4 )
+  %sqrt5  = call float @llvm.sqrt.f32(float %a5 )
+  %sqrt6  = call float @llvm.sqrt.f32(float %a6 )
+  %sqrt7  = call float @llvm.sqrt.f32(float %a7 )
+  %sqrt8  = call float @llvm.sqrt.f32(float %a8 )
+  %sqrt9  = call float @llvm.sqrt.f32(float %a9 )
+  %sqrt10 = call float @llvm.sqrt.f32(float %a10)
+  %sqrt11 = call float @llvm.sqrt.f32(float %a11)
+  %sqrt12 = call float @llvm.sqrt.f32(float %a12)
+  %sqrt13 = call float @llvm.sqrt.f32(float %a13)
+  %sqrt14 = call float @llvm.sqrt.f32(float %a14)
+  %sqrt15 = call float @llvm.sqrt.f32(float %a15)
+  store float %sqrt0 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64  0), align 4
+  store float %sqrt1 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64  1), align 4
+  store float %sqrt2 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64  2), align 4
+  store float %sqrt3 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64  3), align 4
+  store float %sqrt4 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64  4), align 4
+  store float %sqrt5 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64  5), align 4
+  store float %sqrt6 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64  6), align 4
+  store float %sqrt7 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64  7), align 4
+  store float %sqrt8 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64  8), align 4
+  store float %sqrt9 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64  9), align 4
+  store float %sqrt10, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 10), align 4
+  store float %sqrt11, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 11), align 4
+  store float %sqrt12, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12), align 4
+  store float %sqrt13, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 13), align 4
+  store float %sqrt14, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 14), align 4
+  store float %sqrt15, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 15), align 4
+  ret void
+}
+
+attributes #0 = { nounwind }

Added: llvm/trunk/test/Transforms/SLPVectorizer/X86/store-jumbled.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/store-jumbled.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/store-jumbled.ll (added)
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/store-jumbled.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,61 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -S -mtriple=x86_64-unknown -mattr=+avx -slp-vectorizer | FileCheck %s
+
+
+
+define i32 @jumbled-load(i32* noalias nocapture %in, i32* noalias nocapture %inn, i32* noalias nocapture %out) {
+; CHECK-LABEL: @jumbled-load(
+; CHECK-NEXT:    [[IN_ADDR:%.*]] = getelementptr inbounds i32, i32* [[IN:%.*]], i64 0
+; CHECK-NEXT:    [[GEP_1:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 1
+; CHECK-NEXT:    [[GEP_2:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 2
+; CHECK-NEXT:    [[GEP_3:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 3
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[IN_ADDR]] to <4 x i32>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
+; CHECK-NEXT:    [[REORDER_SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 0, i32 2>
+; CHECK-NEXT:    [[INN_ADDR:%.*]] = getelementptr inbounds i32, i32* [[INN:%.*]], i64 0
+; CHECK-NEXT:    [[GEP_4:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 1
+; CHECK-NEXT:    [[GEP_5:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 2
+; CHECK-NEXT:    [[GEP_6:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 3
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[INN_ADDR]] to <4 x i32>*
+; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
+; CHECK-NEXT:    [[REORDER_SHUFFLE1:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 0, i32 2>
+; CHECK-NEXT:    [[TMP5:%.*]] = mul <4 x i32> [[REORDER_SHUFFLE]], [[REORDER_SHUFFLE1]]
+; CHECK-NEXT:    [[GEP_7:%.*]] = getelementptr inbounds i32, i32* [[OUT:%.*]], i64 0
+; CHECK-NEXT:    [[GEP_8:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 1
+; CHECK-NEXT:    [[GEP_9:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 2
+; CHECK-NEXT:    [[GEP_10:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 3
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i32* [[GEP_7]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4
+; CHECK-NEXT:    ret i32 undef
+;
+  %in.addr = getelementptr inbounds i32, i32* %in, i64 0
+  %load.1 = load i32, i32* %in.addr, align 4
+  %gep.1 = getelementptr inbounds i32, i32* %in.addr, i64 1
+  %load.2 = load i32, i32* %gep.1, align 4
+  %gep.2 = getelementptr inbounds i32, i32* %in.addr, i64 2
+  %load.3 = load i32, i32* %gep.2, align 4
+  %gep.3 = getelementptr inbounds i32, i32* %in.addr, i64 3
+  %load.4 = load i32, i32* %gep.3, align 4
+  %inn.addr = getelementptr inbounds i32, i32* %inn, i64 0
+  %load.5 = load i32, i32* %inn.addr, align 4
+  %gep.4 = getelementptr inbounds i32, i32* %inn.addr, i64 1
+  %load.6 = load i32, i32* %gep.4, align 4
+  %gep.5 = getelementptr inbounds i32, i32* %inn.addr, i64 2
+  %load.7 = load i32, i32* %gep.5, align 4
+  %gep.6 = getelementptr inbounds i32, i32* %inn.addr, i64 3
+  %load.8 = load i32, i32* %gep.6, align 4
+  %mul.1 = mul i32 %load.1, %load.5
+  %mul.2 = mul i32 %load.2, %load.6
+  %mul.3 = mul i32 %load.3, %load.7
+  %mul.4 = mul i32 %load.4, %load.8
+  %gep.7 = getelementptr inbounds i32, i32* %out, i64 0
+  %gep.8 = getelementptr inbounds i32, i32* %out, i64 1
+  %gep.9 = getelementptr inbounds i32, i32* %out, i64 2
+  %gep.10 = getelementptr inbounds i32, i32* %out, i64 3
+  store i32 %mul.1, i32* %gep.9, align 4
+  store i32 %mul.2, i32* %gep.7, align 4
+  store i32 %mul.3, i32* %gep.10, align 4
+  store i32 %mul.4, i32* %gep.8, align 4
+
+  ret i32 undef
+}

Added: llvm/trunk/test/Transforms/SLPVectorizer/X86/stores_vectorize.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/stores_vectorize.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/stores_vectorize.ll (added)
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/stores_vectorize.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,314 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -slp-vectorizer -slp-vectorize-hor -slp-vectorize-hor-store -S < %s -mtriple=x86_64-apple-macosx -mcpu=corei7-avx -mattr=+avx2 | FileCheck %s
+
+;void Distance(float *p1, int p2, unsigned long p3[], float p4[]) {
+;  long a = p3[0] = 5;
+;  p1 += p2;
+;  p4[3] += p1[a];
+;  p3[0] >>= 5;
+;  p3[1] >>= 5;
+;  p3[2] >>= 5;
+;  p3[3] >>= 5;
+;  p1 += p2;
+;  p4[0] += p1[p3[0] & a];
+;}
+
+define void @_Z8DistanceIlLi5EEvPfiPmS0_(float* %p1, i32 %p2, i64* %p3, float* %p4) {
+; CHECK-LABEL: @_Z8DistanceIlLi5EEvPfiPmS0_(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    store i64 5, i64* [[P3:%.*]], align 8
+; CHECK-NEXT:    [[IDX_EXT:%.*]] = sext i32 [[P2:%.*]] to i64
+; CHECK-NEXT:    [[ADD_PTR:%.*]] = getelementptr inbounds float, float* [[P1:%.*]], i64 [[IDX_EXT]]
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds float, float* [[ADD_PTR]], i64 5
+; CHECK-NEXT:    [[TMP0:%.*]] = load float, float* [[ARRAYIDX1]], align 4
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[P4:%.*]], i64 3
+; CHECK-NEXT:    [[TMP1:%.*]] = load float, float* [[ARRAYIDX2]], align 4
+; CHECK-NEXT:    [[ADD:%.*]] = fadd float [[TMP0]], [[TMP1]]
+; CHECK-NEXT:    store float [[ADD]], float* [[ARRAYIDX2]], align 4
+; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i64, i64* [[P3]], i64 1
+; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds i64, i64* [[P3]], i64 2
+; CHECK-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds i64, i64* [[P3]], i64 3
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i64* [[P3]] to <4 x i64>*
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* [[TMP2]], align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = lshr <4 x i64> [[TMP3]], <i64 5, i64 5, i64 5, i64 5>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i64* [[P3]] to <4 x i64>*
+; CHECK-NEXT:    store <4 x i64> [[TMP4]], <4 x i64>* [[TMP5]], align 8
+; CHECK-NEXT:    [[ADD_PTR11:%.*]] = getelementptr inbounds float, float* [[ADD_PTR]], i64 [[IDX_EXT]]
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP4]], i32 0
+; CHECK-NEXT:    [[AND:%.*]] = and i64 [[TMP6]], 5
+; CHECK-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr inbounds float, float* [[ADD_PTR11]], i64 [[AND]]
+; CHECK-NEXT:    [[TMP7:%.*]] = load float, float* [[ARRAYIDX13]], align 4
+; CHECK-NEXT:    [[TMP8:%.*]] = load float, float* [[P4]], align 4
+; CHECK-NEXT:    [[ADD15:%.*]] = fadd float [[TMP7]], [[TMP8]]
+; CHECK-NEXT:    store float [[ADD15]], float* [[P4]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  store i64 5, i64* %p3, align 8
+  %idx.ext = sext i32 %p2 to i64
+  %add.ptr = getelementptr inbounds float, float* %p1, i64 %idx.ext
+  %arrayidx1 = getelementptr inbounds float, float* %add.ptr, i64 5
+  %0 = load float, float* %arrayidx1, align 4
+  %arrayidx2 = getelementptr inbounds float, float* %p4, i64 3
+  %1 = load float, float* %arrayidx2, align 4
+  %add = fadd float %0, %1
+  store float %add, float* %arrayidx2, align 4
+  %2 = load i64, i64* %p3, align 8
+  %shr = lshr i64 %2, 5
+  store i64 %shr, i64* %p3, align 8
+  %arrayidx4 = getelementptr inbounds i64, i64* %p3, i64 1
+  %3 = load i64, i64* %arrayidx4, align 8
+  %shr5 = lshr i64 %3, 5
+  store i64 %shr5, i64* %arrayidx4, align 8
+  %arrayidx6 = getelementptr inbounds i64, i64* %p3, i64 2
+  %4 = load i64, i64* %arrayidx6, align 8
+  %shr7 = lshr i64 %4, 5
+  store i64 %shr7, i64* %arrayidx6, align 8
+  %arrayidx8 = getelementptr inbounds i64, i64* %p3, i64 3
+  %5 = load i64, i64* %arrayidx8, align 8
+  %shr9 = lshr i64 %5, 5
+  store i64 %shr9, i64* %arrayidx8, align 8
+  %add.ptr11 = getelementptr inbounds float, float* %add.ptr, i64 %idx.ext
+  %and = and i64 %shr, 5
+  %arrayidx13 = getelementptr inbounds float, float* %add.ptr11, i64 %and
+  %6 = load float, float* %arrayidx13, align 4
+  %7 = load float, float* %p4, align 4
+  %add15 = fadd float %6, %7
+  store float %add15, float* %p4, align 4
+  ret void
+}
+
+define void @store_reverse(i64* %p3) {
+; CHECK-LABEL: @store_reverse(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, i64* [[P3:%.*]], i64 8
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, i64* [[P3]], i64 7
+; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i64, i64* [[P3]], i64 1
+; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i64, i64* [[P3]], i64 9
+; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds i64, i64* [[P3]], i64 6
+; CHECK-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds i64, i64* [[P3]], i64 2
+; CHECK-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds i64, i64* [[P3]], i64 10
+; CHECK-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds i64, i64* [[P3]], i64 5
+; CHECK-NEXT:    [[ARRAYIDX11:%.*]] = getelementptr inbounds i64, i64* [[P3]], i64 3
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i64* [[P3]] to <4 x i64>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* [[TMP0]], align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds i64, i64* [[P3]], i64 11
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i64* [[ARRAYIDX1]] to <4 x i64>*
+; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* [[TMP3]], align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[TMP6:%.*]] = shl <4 x i64> [[TMP2]], [[TMP5]]
+; CHECK-NEXT:    [[ARRAYIDX14:%.*]] = getelementptr inbounds i64, i64* [[P3]], i64 4
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i64* [[ARRAYIDX14]] to <4 x i64>*
+; CHECK-NEXT:    store <4 x i64> [[TMP6]], <4 x i64>* [[TMP7]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %0 = load i64, i64* %p3, align 8
+  %arrayidx1 = getelementptr inbounds i64, i64* %p3, i64 8
+  %1 = load i64, i64* %arrayidx1, align 8
+  %shl = shl i64 %0, %1
+  %arrayidx2 = getelementptr inbounds i64, i64* %p3, i64 7
+  store i64 %shl, i64* %arrayidx2, align 8
+  %arrayidx3 = getelementptr inbounds i64, i64* %p3, i64 1
+  %2 = load i64, i64* %arrayidx3, align 8
+  %arrayidx4 = getelementptr inbounds i64, i64* %p3, i64 9
+  %3 = load i64, i64* %arrayidx4, align 8
+  %shl5 = shl i64 %2, %3
+  %arrayidx6 = getelementptr inbounds i64, i64* %p3, i64 6
+  store i64 %shl5, i64* %arrayidx6, align 8
+  %arrayidx7 = getelementptr inbounds i64, i64* %p3, i64 2
+  %4 = load i64, i64* %arrayidx7, align 8
+  %arrayidx8 = getelementptr inbounds i64, i64* %p3, i64 10
+  %5 = load i64, i64* %arrayidx8, align 8
+  %shl9 = shl i64 %4, %5
+  %arrayidx10 = getelementptr inbounds i64, i64* %p3, i64 5
+  store i64 %shl9, i64* %arrayidx10, align 8
+  %arrayidx11 = getelementptr inbounds i64, i64* %p3, i64 3
+  %6 = load i64, i64* %arrayidx11, align 8
+  %arrayidx12 = getelementptr inbounds i64, i64* %p3, i64 11
+  %7 = load i64, i64* %arrayidx12, align 8
+  %shl13 = shl i64 %6, %7
+  %arrayidx14 = getelementptr inbounds i64, i64* %p3, i64 4
+  store i64 %shl13, i64* %arrayidx14, align 8
+  ret void
+}
+
+define void @store15(float* %p1, i32 %p2, i64* %p3, float* %p4) {
+; CHECK-LABEL: @store15(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    store i64 5, i64* [[P3:%.*]], align 8
+; CHECK-NEXT:    [[IDX_EXT:%.*]] = sext i32 [[P2:%.*]] to i64
+; CHECK-NEXT:    [[ADD_PTR:%.*]] = getelementptr inbounds float, float* [[P1:%.*]], i64 [[IDX_EXT]]
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds float, float* [[ADD_PTR]], i64 5
+; CHECK-NEXT:    [[TMP0:%.*]] = load float, float* [[ARRAYIDX1]], align 4
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[P4:%.*]], i64 3
+; CHECK-NEXT:    [[TMP1:%.*]] = load float, float* [[ARRAYIDX2]], align 4
+; CHECK-NEXT:    [[ADD:%.*]] = fadd float [[TMP0]], [[TMP1]]
+; CHECK-NEXT:    store float [[ADD]], float* [[ARRAYIDX2]], align 4
+; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i64, i64* [[P3]], i64 1
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i64* [[P3]] to <2 x i64>*
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[TMP2]], align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = lshr <2 x i64> [[TMP3]], <i64 5, i64 5>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i64* [[P3]] to <2 x i64>*
+; CHECK-NEXT:    store <2 x i64> [[TMP4]], <2 x i64>* [[TMP5]], align 8
+; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds i64, i64* [[P3]], i64 2
+; CHECK-NEXT:    [[TMP6:%.*]] = load i64, i64* [[ARRAYIDX6]], align 8
+; CHECK-NEXT:    [[SHR7:%.*]] = lshr i64 [[TMP6]], 5
+; CHECK-NEXT:    store i64 [[SHR7]], i64* [[ARRAYIDX6]], align 8
+; CHECK-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds i64, i64* [[P3]], i64 3
+; CHECK-NEXT:    [[TMP7:%.*]] = load i64, i64* [[ARRAYIDX8]], align 8
+; CHECK-NEXT:    [[SHR9:%.*]] = lshr i64 [[TMP7]], 5
+; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds i64, i64* [[P3]], i64 5
+; CHECK-NEXT:    store i64 5, i64* [[ARRAYIDX9]], align 8
+; CHECK-NEXT:    store i64 5, i64* [[ARRAYIDX9]], align 8
+; CHECK-NEXT:    store i64 5, i64* [[ARRAYIDX9]], align 8
+; CHECK-NEXT:    store i64 5, i64* [[ARRAYIDX9]], align 8
+; CHECK-NEXT:    store i64 5, i64* [[ARRAYIDX9]], align 8
+; CHECK-NEXT:    store i64 5, i64* [[ARRAYIDX9]], align 8
+; CHECK-NEXT:    store i64 5, i64* [[ARRAYIDX9]], align 8
+; CHECK-NEXT:    store i64 5, i64* [[ARRAYIDX9]], align 8
+; CHECK-NEXT:    store i64 5, i64* [[ARRAYIDX9]], align 8
+; CHECK-NEXT:    store i64 5, i64* [[ARRAYIDX9]], align 8
+; CHECK-NEXT:    store i64 5, i64* [[ARRAYIDX9]], align 8
+; CHECK-NEXT:    store i64 5, i64* [[ARRAYIDX9]], align 8
+; CHECK-NEXT:    store i64 5, i64* [[ARRAYIDX9]], align 8
+; CHECK-NEXT:    store i64 5, i64* [[ARRAYIDX9]], align 8
+; CHECK-NEXT:    store i64 5, i64* [[ARRAYIDX9]], align 8
+; CHECK-NEXT:    store i64 [[SHR9]], i64* [[ARRAYIDX8]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  store i64 5, i64* %p3, align 8
+  %idx.ext = sext i32 %p2 to i64
+  %add.ptr = getelementptr inbounds float, float* %p1, i64 %idx.ext
+  %arrayidx1 = getelementptr inbounds float, float* %add.ptr, i64 5
+  %0 = load float, float* %arrayidx1, align 4
+  %arrayidx2 = getelementptr inbounds float, float* %p4, i64 3
+  %1 = load float, float* %arrayidx2, align 4
+  %add = fadd float %0, %1
+  store float %add, float* %arrayidx2, align 4
+  %2 = load i64, i64* %p3, align 8
+  %shr = lshr i64 %2, 5
+  store i64 %shr, i64* %p3, align 8
+  %arrayidx4 = getelementptr inbounds i64, i64* %p3, i64 1
+  %3 = load i64, i64* %arrayidx4, align 8
+  %shr5 = lshr i64 %3, 5
+  store i64 %shr5, i64* %arrayidx4, align 8
+  %arrayidx6 = getelementptr inbounds i64, i64* %p3, i64 2
+  %4 = load i64, i64* %arrayidx6, align 8
+  %shr7 = lshr i64 %4, 5
+  store i64 %shr7, i64* %arrayidx6, align 8
+  %arrayidx8 = getelementptr inbounds i64, i64* %p3, i64 3
+  %5 = load i64, i64* %arrayidx8, align 8
+  %shr9 = lshr i64 %5, 5
+  %arrayidx9 = getelementptr inbounds i64, i64* %p3, i64 5
+  store i64 5, i64* %arrayidx9, align 8
+  store i64 5, i64* %arrayidx9, align 8
+  store i64 5, i64* %arrayidx9, align 8
+  store i64 5, i64* %arrayidx9, align 8
+  store i64 5, i64* %arrayidx9, align 8
+  store i64 5, i64* %arrayidx9, align 8
+  store i64 5, i64* %arrayidx9, align 8
+  store i64 5, i64* %arrayidx9, align 8
+  store i64 5, i64* %arrayidx9, align 8
+  store i64 5, i64* %arrayidx9, align 8
+  store i64 5, i64* %arrayidx9, align 8
+  store i64 5, i64* %arrayidx9, align 8
+  store i64 5, i64* %arrayidx9, align 8
+  store i64 5, i64* %arrayidx9, align 8
+  store i64 5, i64* %arrayidx9, align 8
+  store i64 %shr9, i64* %arrayidx8, align 8
+  ret void
+}
+
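+; Same as store15, but with 16 repeated stores to the same element; they must
+; likewise stay scalar.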
+define void @store16(float* %p1, i32 %p2, i64* %p3, float* %p4) {
+; CHECK-LABEL: @store16(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    store i64 5, i64* [[P3:%.*]], align 8
+; CHECK-NEXT:    [[IDX_EXT:%.*]] = sext i32 [[P2:%.*]] to i64
+; CHECK-NEXT:    [[ADD_PTR:%.*]] = getelementptr inbounds float, float* [[P1:%.*]], i64 [[IDX_EXT]]
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds float, float* [[ADD_PTR]], i64 5
+; CHECK-NEXT:    [[TMP0:%.*]] = load float, float* [[ARRAYIDX1]], align 4
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[P4:%.*]], i64 3
+; CHECK-NEXT:    [[TMP1:%.*]] = load float, float* [[ARRAYIDX2]], align 4
+; CHECK-NEXT:    [[ADD:%.*]] = fadd float [[TMP0]], [[TMP1]]
+; CHECK-NEXT:    store float [[ADD]], float* [[ARRAYIDX2]], align 4
+; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i64, i64* [[P3]], i64 1
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i64* [[P3]] to <2 x i64>*
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[TMP2]], align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = lshr <2 x i64> [[TMP3]], <i64 5, i64 5>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i64* [[P3]] to <2 x i64>*
+; CHECK-NEXT:    store <2 x i64> [[TMP4]], <2 x i64>* [[TMP5]], align 8
+; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds i64, i64* [[P3]], i64 2
+; CHECK-NEXT:    [[TMP6:%.*]] = load i64, i64* [[ARRAYIDX6]], align 8
+; CHECK-NEXT:    [[SHR7:%.*]] = lshr i64 [[TMP6]], 5
+; CHECK-NEXT:    store i64 [[SHR7]], i64* [[ARRAYIDX6]], align 8
+; CHECK-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds i64, i64* [[P3]], i64 3
+; CHECK-NEXT:    [[TMP7:%.*]] = load i64, i64* [[ARRAYIDX8]], align 8
+; CHECK-NEXT:    [[SHR9:%.*]] = lshr i64 [[TMP7]], 5
+; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds i64, i64* [[P3]], i64 5
+; CHECK-NEXT:    store i64 5, i64* [[ARRAYIDX9]], align 8
+; CHECK-NEXT:    store i64 5, i64* [[ARRAYIDX9]], align 8
+; CHECK-NEXT:    store i64 5, i64* [[ARRAYIDX9]], align 8
+; CHECK-NEXT:    store i64 5, i64* [[ARRAYIDX9]], align 8
+; CHECK-NEXT:    store i64 5, i64* [[ARRAYIDX9]], align 8
+; CHECK-NEXT:    store i64 5, i64* [[ARRAYIDX9]], align 8
+; CHECK-NEXT:    store i64 5, i64* [[ARRAYIDX9]], align 8
+; CHECK-NEXT:    store i64 5, i64* [[ARRAYIDX9]], align 8
+; CHECK-NEXT:    store i64 5, i64* [[ARRAYIDX9]], align 8
+; CHECK-NEXT:    store i64 5, i64* [[ARRAYIDX9]], align 8
+; CHECK-NEXT:    store i64 5, i64* [[ARRAYIDX9]], align 8
+; CHECK-NEXT:    store i64 5, i64* [[ARRAYIDX9]], align 8
+; CHECK-NEXT:    store i64 5, i64* [[ARRAYIDX9]], align 8
+; CHECK-NEXT:    store i64 5, i64* [[ARRAYIDX9]], align 8
+; CHECK-NEXT:    store i64 5, i64* [[ARRAYIDX9]], align 8
+; CHECK-NEXT:    store i64 5, i64* [[ARRAYIDX9]], align 8
+; CHECK-NEXT:    store i64 [[SHR9]], i64* [[ARRAYIDX8]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  store i64 5, i64* %p3, align 8
+  %idx.ext = sext i32 %p2 to i64
+  %add.ptr = getelementptr inbounds float, float* %p1, i64 %idx.ext
+  %arrayidx1 = getelementptr inbounds float, float* %add.ptr, i64 5
+  %0 = load float, float* %arrayidx1, align 4
+  %arrayidx2 = getelementptr inbounds float, float* %p4, i64 3
+  %1 = load float, float* %arrayidx2, align 4
+  %add = fadd float %0, %1
+  store float %add, float* %arrayidx2, align 4
+  %2 = load i64, i64* %p3, align 8
+  %shr = lshr i64 %2, 5
+  store i64 %shr, i64* %p3, align 8
+  %arrayidx4 = getelementptr inbounds i64, i64* %p3, i64 1
+  %3 = load i64, i64* %arrayidx4, align 8
+  %shr5 = lshr i64 %3, 5
+  store i64 %shr5, i64* %arrayidx4, align 8
+  %arrayidx6 = getelementptr inbounds i64, i64* %p3, i64 2
+  %4 = load i64, i64* %arrayidx6, align 8
+  %shr7 = lshr i64 %4, 5
+  store i64 %shr7, i64* %arrayidx6, align 8
+  %arrayidx8 = getelementptr inbounds i64, i64* %p3, i64 3
+  %5 = load i64, i64* %arrayidx8, align 8
+  %shr9 = lshr i64 %5, 5
+  %arrayidx9 = getelementptr inbounds i64, i64* %p3, i64 5
+  store i64 5, i64* %arrayidx9, align 8
+  store i64 5, i64* %arrayidx9, align 8
+  store i64 5, i64* %arrayidx9, align 8
+  store i64 5, i64* %arrayidx9, align 8
+  store i64 5, i64* %arrayidx9, align 8
+  store i64 5, i64* %arrayidx9, align 8
+  store i64 5, i64* %arrayidx9, align 8
+  store i64 5, i64* %arrayidx9, align 8
+  store i64 5, i64* %arrayidx9, align 8
+  store i64 5, i64* %arrayidx9, align 8
+  store i64 5, i64* %arrayidx9, align 8
+  store i64 5, i64* %arrayidx9, align 8
+  store i64 5, i64* %arrayidx9, align 8
+  store i64 5, i64* %arrayidx9, align 8
+  store i64 5, i64* %arrayidx9, align 8
+  store i64 5, i64* %arrayidx9, align 8
+  store i64 %shr9, i64* %arrayidx8, align 8
+  ret void
+}
+

Added: llvm/trunk/test/Transforms/SLPVectorizer/X86/tiny-tree.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/tiny-tree.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/tiny-tree.ll (added)
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/tiny-tree.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,268 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -basicaa -slp-vectorizer -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 | FileCheck %s
+
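+; Check that tiny trees (short load/store chains) are vectorized when they are
+; fully vectorizable, and left scalar otherwise.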
+define void @tiny_tree_fully_vectorizable(double* noalias nocapture %dst, double* noalias nocapture readonly %src, i64 %count) #0 {
+; CHECK-LABEL: @tiny_tree_fully_vectorizable(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP12:%.*]] = icmp eq i64 [[COUNT:%.*]], 0
+; CHECK-NEXT:    br i1 [[CMP12]], label [[FOR_END:%.*]], label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[I_015:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[DST_ADDR_014:%.*]] = phi double* [ [[ADD_PTR4:%.*]], [[FOR_BODY]] ], [ [[DST:%.*]], [[ENTRY]] ]
+; CHECK-NEXT:    [[SRC_ADDR_013:%.*]] = phi double* [ [[ADD_PTR:%.*]], [[FOR_BODY]] ], [ [[SRC:%.*]], [[ENTRY]] ]
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds double, double* [[SRC_ADDR_013]], i64 1
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast double* [[SRC_ADDR_013]] to <2 x double>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8
+; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[DST_ADDR_014]], i64 1
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast double* [[DST_ADDR_014]] to <2 x double>*
+; CHECK-NEXT:    store <2 x double> [[TMP1]], <2 x double>* [[TMP2]], align 8
+; CHECK-NEXT:    [[ADD_PTR]] = getelementptr inbounds double, double* [[SRC_ADDR_013]], i64 [[I_015]]
+; CHECK-NEXT:    [[ADD_PTR4]] = getelementptr inbounds double, double* [[DST_ADDR_014]], i64 [[I_015]]
+; CHECK-NEXT:    [[INC]] = add i64 [[I_015]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INC]], [[COUNT]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]]
+; CHECK:       for.end:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %cmp12 = icmp eq i64 %count, 0
+  br i1 %cmp12, label %for.end, label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.015 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
+  %dst.addr.014 = phi double* [ %add.ptr4, %for.body ], [ %dst, %entry ]
+  %src.addr.013 = phi double* [ %add.ptr, %for.body ], [ %src, %entry ]
+  %0 = load double, double* %src.addr.013, align 8
+  store double %0, double* %dst.addr.014, align 8
+  %arrayidx2 = getelementptr inbounds double, double* %src.addr.013, i64 1
+  %1 = load double, double* %arrayidx2, align 8
+  %arrayidx3 = getelementptr inbounds double, double* %dst.addr.014, i64 1
+  store double %1, double* %arrayidx3, align 8
+  %add.ptr = getelementptr inbounds double, double* %src.addr.013, i64 %i.015
+  %add.ptr4 = getelementptr inbounds double, double* %dst.addr.014, i64 %i.015
+  %inc = add i64 %i.015, 1
+  %exitcond = icmp eq i64 %inc, %count
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+define void @tiny_tree_fully_vectorizable2(float* noalias nocapture %dst, float* noalias nocapture readonly %src, i64 %count) #0 {
+; CHECK-LABEL: @tiny_tree_fully_vectorizable2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP20:%.*]] = icmp eq i64 [[COUNT:%.*]], 0
+; CHECK-NEXT:    br i1 [[CMP20]], label [[FOR_END:%.*]], label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[I_023:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[DST_ADDR_022:%.*]] = phi float* [ [[ADD_PTR8:%.*]], [[FOR_BODY]] ], [ [[DST:%.*]], [[ENTRY]] ]
+; CHECK-NEXT:    [[SRC_ADDR_021:%.*]] = phi float* [ [[ADD_PTR:%.*]], [[FOR_BODY]] ], [ [[SRC:%.*]], [[ENTRY]] ]
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[SRC_ADDR_021]], i64 1
+; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[DST_ADDR_022]], i64 1
+; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[SRC_ADDR_021]], i64 2
+; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds float, float* [[DST_ADDR_022]], i64 2
+; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds float, float* [[SRC_ADDR_021]], i64 3
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast float* [[SRC_ADDR_021]] to <4 x float>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
+; CHECK-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds float, float* [[DST_ADDR_022]], i64 3
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast float* [[DST_ADDR_022]] to <4 x float>*
+; CHECK-NEXT:    store <4 x float> [[TMP1]], <4 x float>* [[TMP2]], align 4
+; CHECK-NEXT:    [[ADD_PTR]] = getelementptr inbounds float, float* [[SRC_ADDR_021]], i64 [[I_023]]
+; CHECK-NEXT:    [[ADD_PTR8]] = getelementptr inbounds float, float* [[DST_ADDR_022]], i64 [[I_023]]
+; CHECK-NEXT:    [[INC]] = add i64 [[I_023]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INC]], [[COUNT]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]]
+; CHECK:       for.end:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %cmp20 = icmp eq i64 %count, 0
+  br i1 %cmp20, label %for.end, label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.023 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
+  %dst.addr.022 = phi float* [ %add.ptr8, %for.body ], [ %dst, %entry ]
+  %src.addr.021 = phi float* [ %add.ptr, %for.body ], [ %src, %entry ]
+  %0 = load float, float* %src.addr.021, align 4
+  store float %0, float* %dst.addr.022, align 4
+  %arrayidx2 = getelementptr inbounds float, float* %src.addr.021, i64 1
+  %1 = load float, float* %arrayidx2, align 4
+  %arrayidx3 = getelementptr inbounds float, float* %dst.addr.022, i64 1
+  store float %1, float* %arrayidx3, align 4
+  %arrayidx4 = getelementptr inbounds float, float* %src.addr.021, i64 2
+  %2 = load float, float* %arrayidx4, align 4
+  %arrayidx5 = getelementptr inbounds float, float* %dst.addr.022, i64 2
+  store float %2, float* %arrayidx5, align 4
+  %arrayidx6 = getelementptr inbounds float, float* %src.addr.021, i64 3
+  %3 = load float, float* %arrayidx6, align 4
+  %arrayidx7 = getelementptr inbounds float, float* %dst.addr.022, i64 3
+  store float %3, float* %arrayidx7, align 4
+  %add.ptr = getelementptr inbounds float, float* %src.addr.021, i64 %i.023
+  %add.ptr8 = getelementptr inbounds float, float* %dst.addr.022, i64 %i.023
+  %inc = add i64 %i.023, 1
+  %exitcond = icmp eq i64 %inc, %count
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+; We do not vectorize tiny trees that are not fully vectorizable: the source
+; loads in the two functions below are not consecutive, so the chains stay scalar.
+
+define void @tiny_tree_not_fully_vectorizable(double* noalias nocapture %dst, double* noalias nocapture readonly %src, i64 %count) #0 {
+; CHECK-LABEL: @tiny_tree_not_fully_vectorizable(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP12:%.*]] = icmp eq i64 [[COUNT:%.*]], 0
+; CHECK-NEXT:    br i1 [[CMP12]], label [[FOR_END:%.*]], label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[I_015:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[DST_ADDR_014:%.*]] = phi double* [ [[ADD_PTR4:%.*]], [[FOR_BODY]] ], [ [[DST:%.*]], [[ENTRY]] ]
+; CHECK-NEXT:    [[SRC_ADDR_013:%.*]] = phi double* [ [[ADD_PTR:%.*]], [[FOR_BODY]] ], [ [[SRC:%.*]], [[ENTRY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = load double, double* [[SRC_ADDR_013]], align 8
+; CHECK-NEXT:    store double [[TMP0]], double* [[DST_ADDR_014]], align 8
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds double, double* [[SRC_ADDR_013]], i64 2
+; CHECK-NEXT:    [[TMP1:%.*]] = load double, double* [[ARRAYIDX2]], align 8
+; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[DST_ADDR_014]], i64 1
+; CHECK-NEXT:    store double [[TMP1]], double* [[ARRAYIDX3]], align 8
+; CHECK-NEXT:    [[ADD_PTR]] = getelementptr inbounds double, double* [[SRC_ADDR_013]], i64 [[I_015]]
+; CHECK-NEXT:    [[ADD_PTR4]] = getelementptr inbounds double, double* [[DST_ADDR_014]], i64 [[I_015]]
+; CHECK-NEXT:    [[INC]] = add i64 [[I_015]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INC]], [[COUNT]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]]
+; CHECK:       for.end:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %cmp12 = icmp eq i64 %count, 0
+  br i1 %cmp12, label %for.end, label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.015 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
+  %dst.addr.014 = phi double* [ %add.ptr4, %for.body ], [ %dst, %entry ]
+  %src.addr.013 = phi double* [ %add.ptr, %for.body ], [ %src, %entry ]
+  %0 = load double, double* %src.addr.013, align 8
+  store double %0, double* %dst.addr.014, align 8
+  %arrayidx2 = getelementptr inbounds double, double* %src.addr.013, i64 2
+  %1 = load double, double* %arrayidx2, align 8
+  %arrayidx3 = getelementptr inbounds double, double* %dst.addr.014, i64 1
+  store double %1, double* %arrayidx3, align 8
+  %add.ptr = getelementptr inbounds double, double* %src.addr.013, i64 %i.015
+  %add.ptr4 = getelementptr inbounds double, double* %dst.addr.014, i64 %i.015
+  %inc = add i64 %i.015, 1
+  %exitcond = icmp eq i64 %inc, %count
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+define void @tiny_tree_not_fully_vectorizable2(float* noalias nocapture %dst, float* noalias nocapture readonly %src, i64 %count) #0 {
+; CHECK-LABEL: @tiny_tree_not_fully_vectorizable2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP20:%.*]] = icmp eq i64 [[COUNT:%.*]], 0
+; CHECK-NEXT:    br i1 [[CMP20]], label [[FOR_END:%.*]], label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[I_023:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[DST_ADDR_022:%.*]] = phi float* [ [[ADD_PTR8:%.*]], [[FOR_BODY]] ], [ [[DST:%.*]], [[ENTRY]] ]
+; CHECK-NEXT:    [[SRC_ADDR_021:%.*]] = phi float* [ [[ADD_PTR:%.*]], [[FOR_BODY]] ], [ [[SRC:%.*]], [[ENTRY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = load float, float* [[SRC_ADDR_021]], align 4
+; CHECK-NEXT:    store float [[TMP0]], float* [[DST_ADDR_022]], align 4
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[SRC_ADDR_021]], i64 4
+; CHECK-NEXT:    [[TMP1:%.*]] = load float, float* [[ARRAYIDX2]], align 4
+; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[DST_ADDR_022]], i64 1
+; CHECK-NEXT:    store float [[TMP1]], float* [[ARRAYIDX3]], align 4
+; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[SRC_ADDR_021]], i64 2
+; CHECK-NEXT:    [[TMP2:%.*]] = load float, float* [[ARRAYIDX4]], align 4
+; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds float, float* [[DST_ADDR_022]], i64 2
+; CHECK-NEXT:    store float [[TMP2]], float* [[ARRAYIDX5]], align 4
+; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds float, float* [[SRC_ADDR_021]], i64 3
+; CHECK-NEXT:    [[TMP3:%.*]] = load float, float* [[ARRAYIDX6]], align 4
+; CHECK-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds float, float* [[DST_ADDR_022]], i64 3
+; CHECK-NEXT:    store float [[TMP3]], float* [[ARRAYIDX7]], align 4
+; CHECK-NEXT:    [[ADD_PTR]] = getelementptr inbounds float, float* [[SRC_ADDR_021]], i64 [[I_023]]
+; CHECK-NEXT:    [[ADD_PTR8]] = getelementptr inbounds float, float* [[DST_ADDR_022]], i64 [[I_023]]
+; CHECK-NEXT:    [[INC]] = add i64 [[I_023]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INC]], [[COUNT]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]]
+; CHECK:       for.end:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %cmp20 = icmp eq i64 %count, 0
+  br i1 %cmp20, label %for.end, label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.023 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
+  %dst.addr.022 = phi float* [ %add.ptr8, %for.body ], [ %dst, %entry ]
+  %src.addr.021 = phi float* [ %add.ptr, %for.body ], [ %src, %entry ]
+  %0 = load float, float* %src.addr.021, align 4
+  store float %0, float* %dst.addr.022, align 4
+  %arrayidx2 = getelementptr inbounds float, float* %src.addr.021, i64 4
+  %1 = load float, float* %arrayidx2, align 4
+  %arrayidx3 = getelementptr inbounds float, float* %dst.addr.022, i64 1
+  store float %1, float* %arrayidx3, align 4
+  %arrayidx4 = getelementptr inbounds float, float* %src.addr.021, i64 2
+  %2 = load float, float* %arrayidx4, align 4
+  %arrayidx5 = getelementptr inbounds float, float* %dst.addr.022, i64 2
+  store float %2, float* %arrayidx5, align 4
+  %arrayidx6 = getelementptr inbounds float, float* %src.addr.021, i64 3
+  %3 = load float, float* %arrayidx6, align 4
+  %arrayidx7 = getelementptr inbounds float, float* %dst.addr.022, i64 3
+  store float %3, float* %arrayidx7, align 4
+  %add.ptr = getelementptr inbounds float, float* %src.addr.021, i64 %i.023
+  %add.ptr8 = getelementptr inbounds float, float* %dst.addr.022, i64 %i.023
+  %inc = add i64 %i.023, 1
+  %exitcond = icmp eq i64 %inc, %count
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
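+; A scalar splatted into four consecutive elements is vectorized: the value is
+; built up with insertelement and written with a single <4 x float> store.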
+define void @store_splat(float*, float) {
+; CHECK-LABEL: @store_splat(
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, float* [[TMP0:%.*]], i64 0
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP0]], i64 1
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP0]], i64 2
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP0]], i64 3
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x float> undef, float [[TMP1:%.*]], i32 0
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <4 x float> [[TMP7]], float [[TMP1]], i32 1
+; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <4 x float> [[TMP8]], float [[TMP1]], i32 2
+; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <4 x float> [[TMP9]], float [[TMP1]], i32 3
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast float* [[TMP3]] to <4 x float>*
+; CHECK-NEXT:    store <4 x float> [[TMP10]], <4 x float>* [[TMP11]], align 4
+; CHECK-NEXT:    ret void
+;
+  %3 = getelementptr inbounds float, float* %0, i64 0
+  store float %1, float* %3, align 4
+  %4 = getelementptr inbounds float, float* %0, i64 1
+  store float %1, float* %4, align 4
+  %5 = getelementptr inbounds float, float* %0, i64 2
+  store float %1, float* %5, align 4
+  %6 = getelementptr inbounds float, float* %0, i64 3
+  store float %1, float* %6, align 4
+  ret void
+}
+
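+; Four constant stores to consecutive elements are merged into a single store
+; of the constant vector <i32 10, i32 30, i32 20, i32 40>.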
+define void @store_const(i32* %a) {
+; CHECK-LABEL: @store_const(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[PTR0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 0
+; CHECK-NEXT:    [[PTR1:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 1
+; CHECK-NEXT:    [[PTR2:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 2
+; CHECK-NEXT:    [[PTR3:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 3
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[PTR0]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> <i32 10, i32 30, i32 20, i32 40>, <4 x i32>* [[TMP0]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %ptr0 = getelementptr inbounds i32, i32* %a, i64 0
+  store i32 10, i32* %ptr0, align 4
+  %ptr1 = getelementptr inbounds i32, i32* %a, i64 1
+  store i32 30, i32* %ptr1, align 4
+  %ptr2 = getelementptr inbounds i32, i32* %a, i64 2
+  store i32 20, i32* %ptr2, align 4
+  %ptr3 = getelementptr inbounds i32, i32* %a, i64 3
+  store i32 40, i32* %ptr3, align 4
+  ret void
+}

Added: llvm/trunk/test/Transforms/SLPVectorizer/X86/uitofp.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/uitofp.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/uitofp.ll (added)
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/uitofp.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,1164 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -mtriple=x86_64-unknown -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256 --check-prefix=AVX256NODQ
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=bdver1 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256 --check-prefix=AVX256NODQ
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256 --check-prefix=AVX256NODQ
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skylake-avx512 -mattr=-prefer-256-bit -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX512
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skylake-avx512 -mattr=+prefer-256-bit -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256 --check-prefix=AVX256DQ
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+@src64 = common global [8 x i64] zeroinitializer, align 64
+@src32 = common global [16 x i32] zeroinitializer, align 64
+@src16 = common global [32 x i16] zeroinitializer, align 64
+@src8  = common global [64 x i8] zeroinitializer, align 64
+
+@dst64 = common global [8 x double] zeroinitializer, align 64
+@dst32 = common global [16 x float] zeroinitializer, align 64
+
+;
+; UITOFP to vXf64
+;
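+; The widest profitable vector differs per subtarget, so the same scalar code
+; is vectorized 2-, 4-, or 8-wide (see the SSE/AVX256/AVX512 check prefixes).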
+
+define void @uitofp_2i64_2f64() #0 {
+; CHECK-LABEL: @uitofp_2i64_2f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @src64 to <2 x i64>*), align 64
+; CHECK-NEXT:    [[TMP2:%.*]] = uitofp <2 x i64> [[TMP1]] to <2 x double>
+; CHECK-NEXT:    store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64
+; CHECK-NEXT:    ret void
+;
+  %ld0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
+  %ld1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
+  %cvt0 = uitofp i64 %ld0 to double
+  %cvt1 = uitofp i64 %ld1 to double
+  store double %cvt0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
+  store double %cvt1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+  ret void
+}
+
+define void @uitofp_4i64_4f64() #0 {
+; SSE-LABEL: @uitofp_4i64_4f64(
+; SSE-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @src64 to <2 x i64>*), align 64
+; SSE-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2) to <2 x i64>*), align 16
+; SSE-NEXT:    [[TMP3:%.*]] = uitofp <2 x i64> [[TMP1]] to <2 x double>
+; SSE-NEXT:    [[TMP4:%.*]] = uitofp <2 x i64> [[TMP2]] to <2 x double>
+; SSE-NEXT:    store <2 x double> [[TMP3]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64
+; SSE-NEXT:    store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16
+; SSE-NEXT:    ret void
+;
+; AVX-LABEL: @uitofp_4i64_4f64(
+; AVX-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64
+; AVX-NEXT:    [[TMP2:%.*]] = uitofp <4 x i64> [[TMP1]] to <4 x double>
+; AVX-NEXT:    store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64
+; AVX-NEXT:    ret void
+;
+  %ld0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
+  %ld1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
+  %ld2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16
+  %ld3 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8
+  %cvt0 = uitofp i64 %ld0 to double
+  %cvt1 = uitofp i64 %ld1 to double
+  %cvt2 = uitofp i64 %ld2 to double
+  %cvt3 = uitofp i64 %ld3 to double
+  store double %cvt0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
+  store double %cvt1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+  store double %cvt2, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16
+  store double %cvt3, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
+  ret void
+}
+
+define void @uitofp_8i64_8f64() #0 {
+; SSE-LABEL: @uitofp_8i64_8f64(
+; SSE-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @src64 to <2 x i64>*), align 64
+; SSE-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2) to <2 x i64>*), align 16
+; SSE-NEXT:    [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4) to <2 x i64>*), align 32
+; SSE-NEXT:    [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 6) to <2 x i64>*), align 16
+; SSE-NEXT:    [[TMP5:%.*]] = uitofp <2 x i64> [[TMP1]] to <2 x double>
+; SSE-NEXT:    [[TMP6:%.*]] = uitofp <2 x i64> [[TMP2]] to <2 x double>
+; SSE-NEXT:    [[TMP7:%.*]] = uitofp <2 x i64> [[TMP3]] to <2 x double>
+; SSE-NEXT:    [[TMP8:%.*]] = uitofp <2 x i64> [[TMP4]] to <2 x double>
+; SSE-NEXT:    store <2 x double> [[TMP5]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64
+; SSE-NEXT:    store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16
+; SSE-NEXT:    store <2 x double> [[TMP7]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 32
+; SSE-NEXT:    store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 16
+; SSE-NEXT:    ret void
+;
+; AVX256-LABEL: @uitofp_8i64_8f64(
+; AVX256-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64
+; AVX256-NEXT:    [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4) to <4 x i64>*), align 32
+; AVX256-NEXT:    [[TMP3:%.*]] = uitofp <4 x i64> [[TMP1]] to <4 x double>
+; AVX256-NEXT:    [[TMP4:%.*]] = uitofp <4 x i64> [[TMP2]] to <4 x double>
+; AVX256-NEXT:    store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64
+; AVX256-NEXT:    store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 32
+; AVX256-NEXT:    ret void
+;
+; AVX512-LABEL: @uitofp_8i64_8f64(
+; AVX512-NEXT:    [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @src64 to <8 x i64>*), align 64
+; AVX512-NEXT:    [[TMP2:%.*]] = uitofp <8 x i64> [[TMP1]] to <8 x double>
+; AVX512-NEXT:    store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 64
+; AVX512-NEXT:    ret void
+;
+  %ld0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
+  %ld1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
+  %ld2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16
+  %ld3 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8
+  %ld4 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4), align 32
+  %ld5 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 5), align 8
+  %ld6 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 6), align 16
+  %ld7 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 7), align 8
+  %cvt0 = uitofp i64 %ld0 to double
+  %cvt1 = uitofp i64 %ld1 to double
+  %cvt2 = uitofp i64 %ld2 to double
+  %cvt3 = uitofp i64 %ld3 to double
+  %cvt4 = uitofp i64 %ld4 to double
+  %cvt5 = uitofp i64 %ld5 to double
+  %cvt6 = uitofp i64 %ld6 to double
+  %cvt7 = uitofp i64 %ld7 to double
+  store double %cvt0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
+  store double %cvt1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+  store double %cvt2, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16
+  store double %cvt3, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
+  store double %cvt4, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 32
+  store double %cvt5, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8
+  store double %cvt6, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 16
+  store double %cvt7, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8
+  ret void
+}
+
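+; 2 x i32 -> 2 x f64 stays scalar on SSE and AVX256NODQ but is vectorized on
+; AVX512 and AVX256DQ.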
+define void @uitofp_2i32_2f64() #0 {
+; SSE-LABEL: @uitofp_2i32_2f64(
+; SSE-NEXT:    [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 0), align 64
+; SSE-NEXT:    [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 1), align 4
+; SSE-NEXT:    [[CVT0:%.*]] = uitofp i32 [[LD0]] to double
+; SSE-NEXT:    [[CVT1:%.*]] = uitofp i32 [[LD1]] to double
+; SSE-NEXT:    store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
+; SSE-NEXT:    store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+; SSE-NEXT:    ret void
+;
+; AVX256NODQ-LABEL: @uitofp_2i32_2f64(
+; AVX256NODQ-NEXT:    [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 0), align 64
+; AVX256NODQ-NEXT:    [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 1), align 4
+; AVX256NODQ-NEXT:    [[CVT0:%.*]] = uitofp i32 [[LD0]] to double
+; AVX256NODQ-NEXT:    [[CVT1:%.*]] = uitofp i32 [[LD1]] to double
+; AVX256NODQ-NEXT:    store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
+; AVX256NODQ-NEXT:    store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+; AVX256NODQ-NEXT:    ret void
+;
+; AVX512-LABEL: @uitofp_2i32_2f64(
+; AVX512-NEXT:    [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* bitcast ([16 x i32]* @src32 to <2 x i32>*), align 64
+; AVX512-NEXT:    [[TMP2:%.*]] = uitofp <2 x i32> [[TMP1]] to <2 x double>
+; AVX512-NEXT:    store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64
+; AVX512-NEXT:    ret void
+;
+; AVX256DQ-LABEL: @uitofp_2i32_2f64(
+; AVX256DQ-NEXT:    [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* bitcast ([16 x i32]* @src32 to <2 x i32>*), align 64
+; AVX256DQ-NEXT:    [[TMP2:%.*]] = uitofp <2 x i32> [[TMP1]] to <2 x double>
+; AVX256DQ-NEXT:    store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64
+; AVX256DQ-NEXT:    ret void
+;
+  %ld0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 0), align 64
+  %ld1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 1), align 4
+  %cvt0 = uitofp i32 %ld0 to double
+  %cvt1 = uitofp i32 %ld1 to double
+  store double %cvt0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
+  store double %cvt1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+  ret void
+}
+
+define void @uitofp_4i32_4f64() #0 {
+; SSE-LABEL: @uitofp_4i32_4f64(
+; SSE-NEXT:    [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 0), align 64
+; SSE-NEXT:    [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 1), align 4
+; SSE-NEXT:    [[LD2:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 2), align 8
+; SSE-NEXT:    [[LD3:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 3), align 4
+; SSE-NEXT:    [[CVT0:%.*]] = uitofp i32 [[LD0]] to double
+; SSE-NEXT:    [[CVT1:%.*]] = uitofp i32 [[LD1]] to double
+; SSE-NEXT:    [[CVT2:%.*]] = uitofp i32 [[LD2]] to double
+; SSE-NEXT:    [[CVT3:%.*]] = uitofp i32 [[LD3]] to double
+; SSE-NEXT:    store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
+; SSE-NEXT:    store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+; SSE-NEXT:    store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16
+; SSE-NEXT:    store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
+; SSE-NEXT:    ret void
+;
+; AVX-LABEL: @uitofp_4i32_4f64(
+; AVX-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @src32 to <4 x i32>*), align 64
+; AVX-NEXT:    [[TMP2:%.*]] = uitofp <4 x i32> [[TMP1]] to <4 x double>
+; AVX-NEXT:    store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64
+; AVX-NEXT:    ret void
+;
+  %ld0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 0), align 64
+  %ld1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 1), align 4
+  %ld2 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 2), align 8
+  %ld3 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 3), align 4
+  %cvt0 = uitofp i32 %ld0 to double
+  %cvt1 = uitofp i32 %ld1 to double
+  %cvt2 = uitofp i32 %ld2 to double
+  %cvt3 = uitofp i32 %ld3 to double
+  store double %cvt0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
+  store double %cvt1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+  store double %cvt2, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16
+  store double %cvt3, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
+  ret void
+}
+
+define void @uitofp_8i32_8f64() #0 {
+; SSE-LABEL: @uitofp_8i32_8f64(
+; SSE-NEXT:    [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 0), align 64
+; SSE-NEXT:    [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 1), align 4
+; SSE-NEXT:    [[LD2:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 2), align 8
+; SSE-NEXT:    [[LD3:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 3), align 4
+; SSE-NEXT:    [[LD4:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4), align 16
+; SSE-NEXT:    [[LD5:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 5), align 4
+; SSE-NEXT:    [[LD6:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 6), align 8
+; SSE-NEXT:    [[LD7:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 7), align 4
+; SSE-NEXT:    [[CVT0:%.*]] = uitofp i32 [[LD0]] to double
+; SSE-NEXT:    [[CVT1:%.*]] = uitofp i32 [[LD1]] to double
+; SSE-NEXT:    [[CVT2:%.*]] = uitofp i32 [[LD2]] to double
+; SSE-NEXT:    [[CVT3:%.*]] = uitofp i32 [[LD3]] to double
+; SSE-NEXT:    [[CVT4:%.*]] = uitofp i32 [[LD4]] to double
+; SSE-NEXT:    [[CVT5:%.*]] = uitofp i32 [[LD5]] to double
+; SSE-NEXT:    [[CVT6:%.*]] = uitofp i32 [[LD6]] to double
+; SSE-NEXT:    [[CVT7:%.*]] = uitofp i32 [[LD7]] to double
+; SSE-NEXT:    store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
+; SSE-NEXT:    store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+; SSE-NEXT:    store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16
+; SSE-NEXT:    store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
+; SSE-NEXT:    store double [[CVT4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 32
+; SSE-NEXT:    store double [[CVT5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8
+; SSE-NEXT:    store double [[CVT6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 16
+; SSE-NEXT:    store double [[CVT7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8
+; SSE-NEXT:    ret void
+;
+; AVX256-LABEL: @uitofp_8i32_8f64(
+; AVX256-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @src32 to <4 x i32>*), align 64
+; AVX256-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 16
+; AVX256-NEXT:    [[TMP3:%.*]] = uitofp <4 x i32> [[TMP1]] to <4 x double>
+; AVX256-NEXT:    [[TMP4:%.*]] = uitofp <4 x i32> [[TMP2]] to <4 x double>
+; AVX256-NEXT:    store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64
+; AVX256-NEXT:    store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 32
+; AVX256-NEXT:    ret void
+;
+; AVX512-LABEL: @uitofp_8i32_8f64(
+; AVX512-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @src32 to <8 x i32>*), align 64
+; AVX512-NEXT:    [[TMP2:%.*]] = uitofp <8 x i32> [[TMP1]] to <8 x double>
+; AVX512-NEXT:    store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 64
+; AVX512-NEXT:    ret void
+;
+  %ld0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 0), align 64
+  %ld1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 1), align 4
+  %ld2 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 2), align 8
+  %ld3 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 3), align 4
+  %ld4 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4), align 16
+  %ld5 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 5), align 4
+  %ld6 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 6), align 8
+  %ld7 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 7), align 4
+  %cvt0 = uitofp i32 %ld0 to double
+  %cvt1 = uitofp i32 %ld1 to double
+  %cvt2 = uitofp i32 %ld2 to double
+  %cvt3 = uitofp i32 %ld3 to double
+  %cvt4 = uitofp i32 %ld4 to double
+  %cvt5 = uitofp i32 %ld5 to double
+  %cvt6 = uitofp i32 %ld6 to double
+  %cvt7 = uitofp i32 %ld7 to double
+  store double %cvt0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
+  store double %cvt1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+  store double %cvt2, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16
+  store double %cvt3, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
+  store double %cvt4, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 32
+  store double %cvt5, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8
+  store double %cvt6, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 16
+  store double %cvt7, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8
+  ret void
+}
+
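+; 2 x i16 -> 2 x f64 is left scalar on every tested subtarget.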
+define void @uitofp_2i16_2f64() #0 {
+; CHECK-LABEL: @uitofp_2i16_2f64(
+; CHECK-NEXT:    [[LD0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64
+; CHECK-NEXT:    [[LD1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2
+; CHECK-NEXT:    [[CVT0:%.*]] = uitofp i16 [[LD0]] to double
+; CHECK-NEXT:    [[CVT1:%.*]] = uitofp i16 [[LD1]] to double
+; CHECK-NEXT:    store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
+; CHECK-NEXT:    store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+; CHECK-NEXT:    ret void
+;
+  %ld0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64
+  %ld1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2
+  %cvt0 = uitofp i16 %ld0 to double
+  %cvt1 = uitofp i16 %ld1 to double
+  store double %cvt0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
+  store double %cvt1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+  ret void
+}
+
+define void @uitofp_4i16_4f64() #0 {
+; SSE-LABEL: @uitofp_4i16_4f64(
+; SSE-NEXT:    [[LD0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64
+; SSE-NEXT:    [[LD1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2
+; SSE-NEXT:    [[LD2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2), align 4
+; SSE-NEXT:    [[LD3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 3), align 2
+; SSE-NEXT:    [[CVT0:%.*]] = uitofp i16 [[LD0]] to double
+; SSE-NEXT:    [[CVT1:%.*]] = uitofp i16 [[LD1]] to double
+; SSE-NEXT:    [[CVT2:%.*]] = uitofp i16 [[LD2]] to double
+; SSE-NEXT:    [[CVT3:%.*]] = uitofp i16 [[LD3]] to double
+; SSE-NEXT:    store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
+; SSE-NEXT:    store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+; SSE-NEXT:    store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16
+; SSE-NEXT:    store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
+; SSE-NEXT:    ret void
+;
+; AVX-LABEL: @uitofp_4i16_4f64(
+; AVX-NEXT:    [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 64
+; AVX-NEXT:    [[TMP2:%.*]] = uitofp <4 x i16> [[TMP1]] to <4 x double>
+; AVX-NEXT:    store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64
+; AVX-NEXT:    ret void
+;
+  %ld0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64
+  %ld1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2
+  %ld2 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2), align 4
+  %ld3 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 3), align 2
+  %cvt0 = uitofp i16 %ld0 to double
+  %cvt1 = uitofp i16 %ld1 to double
+  %cvt2 = uitofp i16 %ld2 to double
+  %cvt3 = uitofp i16 %ld3 to double
+  store double %cvt0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
+  store double %cvt1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+  store double %cvt2, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16
+  store double %cvt3, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
+  ret void
+}
+
+define void @uitofp_8i16_8f64() #0 {
+; SSE-LABEL: @uitofp_8i16_8f64(
+; SSE-NEXT:    [[LD0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64
+; SSE-NEXT:    [[LD1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2
+; SSE-NEXT:    [[LD2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2), align 4
+; SSE-NEXT:    [[LD3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 3), align 2
+; SSE-NEXT:    [[LD4:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4), align 8
+; SSE-NEXT:    [[LD5:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 5), align 2
+; SSE-NEXT:    [[LD6:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 6), align 4
+; SSE-NEXT:    [[LD7:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 7), align 2
+; SSE-NEXT:    [[CVT0:%.*]] = uitofp i16 [[LD0]] to double
+; SSE-NEXT:    [[CVT1:%.*]] = uitofp i16 [[LD1]] to double
+; SSE-NEXT:    [[CVT2:%.*]] = uitofp i16 [[LD2]] to double
+; SSE-NEXT:    [[CVT3:%.*]] = uitofp i16 [[LD3]] to double
+; SSE-NEXT:    [[CVT4:%.*]] = uitofp i16 [[LD4]] to double
+; SSE-NEXT:    [[CVT5:%.*]] = uitofp i16 [[LD5]] to double
+; SSE-NEXT:    [[CVT6:%.*]] = uitofp i16 [[LD6]] to double
+; SSE-NEXT:    [[CVT7:%.*]] = uitofp i16 [[LD7]] to double
+; SSE-NEXT:    store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
+; SSE-NEXT:    store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+; SSE-NEXT:    store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16
+; SSE-NEXT:    store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
+; SSE-NEXT:    store double [[CVT4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 32
+; SSE-NEXT:    store double [[CVT5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8
+; SSE-NEXT:    store double [[CVT6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 16
+; SSE-NEXT:    store double [[CVT7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8
+; SSE-NEXT:    ret void
+;
+; AVX256-LABEL: @uitofp_8i16_8f64(
+; AVX256-NEXT:    [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 64
+; AVX256-NEXT:    [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <4 x i16>*), align 8
+; AVX256-NEXT:    [[TMP3:%.*]] = uitofp <4 x i16> [[TMP1]] to <4 x double>
+; AVX256-NEXT:    [[TMP4:%.*]] = uitofp <4 x i16> [[TMP2]] to <4 x double>
+; AVX256-NEXT:    store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64
+; AVX256-NEXT:    store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 32
+; AVX256-NEXT:    ret void
+;
+; AVX512-LABEL: @uitofp_8i16_8f64(
+; AVX512-NEXT:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @src16 to <8 x i16>*), align 64
+; AVX512-NEXT:    [[TMP2:%.*]] = uitofp <8 x i16> [[TMP1]] to <8 x double>
+; AVX512-NEXT:    store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 64
+; AVX512-NEXT:    ret void
+;
+  %ld0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64
+  %ld1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2
+  %ld2 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2), align 4
+  %ld3 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 3), align 2
+  %ld4 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4), align 8
+  %ld5 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 5), align 2
+  %ld6 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 6), align 4
+  %ld7 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 7), align 2
+  %cvt0 = uitofp i16 %ld0 to double
+  %cvt1 = uitofp i16 %ld1 to double
+  %cvt2 = uitofp i16 %ld2 to double
+  %cvt3 = uitofp i16 %ld3 to double
+  %cvt4 = uitofp i16 %ld4 to double
+  %cvt5 = uitofp i16 %ld5 to double
+  %cvt6 = uitofp i16 %ld6 to double
+  %cvt7 = uitofp i16 %ld7 to double
+  store double %cvt0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
+  store double %cvt1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+  store double %cvt2, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16
+  store double %cvt3, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
+  store double %cvt4, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 32
+  store double %cvt5, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8
+  store double %cvt6, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 16
+  store double %cvt7, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8
+  ret void
+}
+
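+; As with i32, the 2 x i8 -> 2 x f64 case stays scalar on SSE and AVX256NODQ
+; but is vectorized on AVX512 and AVX256DQ.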
+define void @uitofp_2i8_2f64() #0 {
+; SSE-LABEL: @uitofp_2i8_2f64(
+; SSE-NEXT:    [[LD0:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 0), align 64
+; SSE-NEXT:    [[LD1:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 1), align 1
+; SSE-NEXT:    [[CVT0:%.*]] = uitofp i8 [[LD0]] to double
+; SSE-NEXT:    [[CVT1:%.*]] = uitofp i8 [[LD1]] to double
+; SSE-NEXT:    store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
+; SSE-NEXT:    store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+; SSE-NEXT:    ret void
+;
+; AVX256NODQ-LABEL: @uitofp_2i8_2f64(
+; AVX256NODQ-NEXT:    [[LD0:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 0), align 64
+; AVX256NODQ-NEXT:    [[LD1:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 1), align 1
+; AVX256NODQ-NEXT:    [[CVT0:%.*]] = uitofp i8 [[LD0]] to double
+; AVX256NODQ-NEXT:    [[CVT1:%.*]] = uitofp i8 [[LD1]] to double
+; AVX256NODQ-NEXT:    store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
+; AVX256NODQ-NEXT:    store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+; AVX256NODQ-NEXT:    ret void
+;
+; AVX512-LABEL: @uitofp_2i8_2f64(
+; AVX512-NEXT:    [[TMP1:%.*]] = load <2 x i8>, <2 x i8>* bitcast ([64 x i8]* @src8 to <2 x i8>*), align 64
+; AVX512-NEXT:    [[TMP2:%.*]] = uitofp <2 x i8> [[TMP1]] to <2 x double>
+; AVX512-NEXT:    store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64
+; AVX512-NEXT:    ret void
+;
+; AVX256DQ-LABEL: @uitofp_2i8_2f64(
+; AVX256DQ-NEXT:    [[TMP1:%.*]] = load <2 x i8>, <2 x i8>* bitcast ([64 x i8]* @src8 to <2 x i8>*), align 64
+; AVX256DQ-NEXT:    [[TMP2:%.*]] = uitofp <2 x i8> [[TMP1]] to <2 x double>
+; AVX256DQ-NEXT:    store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64
+; AVX256DQ-NEXT:    ret void
+;
+  %ld0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 0), align 64
+  %ld1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 1), align 1
+  %cvt0 = uitofp i8 %ld0 to double
+  %cvt1 = uitofp i8 %ld1 to double
+  store double %cvt0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
+  store double %cvt1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+  ret void
+}
+
+define void @uitofp_4i8_4f64() #0 {
+; SSE-LABEL: @uitofp_4i8_4f64(
+; SSE-NEXT:    [[LD0:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 0), align 64
+; SSE-NEXT:    [[LD1:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 1), align 1
+; SSE-NEXT:    [[LD2:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 2), align 2
+; SSE-NEXT:    [[LD3:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 3), align 1
+; SSE-NEXT:    [[CVT0:%.*]] = uitofp i8 [[LD0]] to double
+; SSE-NEXT:    [[CVT1:%.*]] = uitofp i8 [[LD1]] to double
+; SSE-NEXT:    [[CVT2:%.*]] = uitofp i8 [[LD2]] to double
+; SSE-NEXT:    [[CVT3:%.*]] = uitofp i8 [[LD3]] to double
+; SSE-NEXT:    store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
+; SSE-NEXT:    store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+; SSE-NEXT:    store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16
+; SSE-NEXT:    store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
+; SSE-NEXT:    ret void
+;
+; AVX-LABEL: @uitofp_4i8_4f64(
+; AVX-NEXT:    [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* bitcast ([64 x i8]* @src8 to <4 x i8>*), align 64
+; AVX-NEXT:    [[TMP2:%.*]] = uitofp <4 x i8> [[TMP1]] to <4 x double>
+; AVX-NEXT:    store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64
+; AVX-NEXT:    ret void
+;
+  %ld0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 0), align 64
+  %ld1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 1), align 1
+  %ld2 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 2), align 2
+  %ld3 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 3), align 1
+  %cvt0 = uitofp i8 %ld0 to double
+  %cvt1 = uitofp i8 %ld1 to double
+  %cvt2 = uitofp i8 %ld2 to double
+  %cvt3 = uitofp i8 %ld3 to double
+  store double %cvt0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
+  store double %cvt1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+  store double %cvt2, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16
+  store double %cvt3, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
+  ret void
+}
+
+define void @uitofp_8i8_8f64() #0 {
+; SSE-LABEL: @uitofp_8i8_8f64(
+; SSE-NEXT:    [[LD0:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 0), align 64
+; SSE-NEXT:    [[LD1:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 1), align 1
+; SSE-NEXT:    [[LD2:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 2), align 2
+; SSE-NEXT:    [[LD3:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 3), align 1
+; SSE-NEXT:    [[LD4:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4), align 4
+; SSE-NEXT:    [[LD5:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 5), align 1
+; SSE-NEXT:    [[LD6:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 6), align 2
+; SSE-NEXT:    [[LD7:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 7), align 1
+; SSE-NEXT:    [[CVT0:%.*]] = uitofp i8 [[LD0]] to double
+; SSE-NEXT:    [[CVT1:%.*]] = uitofp i8 [[LD1]] to double
+; SSE-NEXT:    [[CVT2:%.*]] = uitofp i8 [[LD2]] to double
+; SSE-NEXT:    [[CVT3:%.*]] = uitofp i8 [[LD3]] to double
+; SSE-NEXT:    [[CVT4:%.*]] = uitofp i8 [[LD4]] to double
+; SSE-NEXT:    [[CVT5:%.*]] = uitofp i8 [[LD5]] to double
+; SSE-NEXT:    [[CVT6:%.*]] = uitofp i8 [[LD6]] to double
+; SSE-NEXT:    [[CVT7:%.*]] = uitofp i8 [[LD7]] to double
+; SSE-NEXT:    store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
+; SSE-NEXT:    store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+; SSE-NEXT:    store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16
+; SSE-NEXT:    store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
+; SSE-NEXT:    store double [[CVT4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 32
+; SSE-NEXT:    store double [[CVT5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8
+; SSE-NEXT:    store double [[CVT6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 16
+; SSE-NEXT:    store double [[CVT7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8
+; SSE-NEXT:    ret void
+;
+; AVX256-LABEL: @uitofp_8i8_8f64(
+; AVX256-NEXT:    [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* bitcast ([64 x i8]* @src8 to <4 x i8>*), align 64
+; AVX256-NEXT:    [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4) to <4 x i8>*), align 4
+; AVX256-NEXT:    [[TMP3:%.*]] = uitofp <4 x i8> [[TMP1]] to <4 x double>
+; AVX256-NEXT:    [[TMP4:%.*]] = uitofp <4 x i8> [[TMP2]] to <4 x double>
+; AVX256-NEXT:    store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64
+; AVX256-NEXT:    store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 32
+; AVX256-NEXT:    ret void
+;
+; AVX512-LABEL: @uitofp_8i8_8f64(
+; AVX512-NEXT:    [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* bitcast ([64 x i8]* @src8 to <8 x i8>*), align 64
+; AVX512-NEXT:    [[TMP2:%.*]] = uitofp <8 x i8> [[TMP1]] to <8 x double>
+; AVX512-NEXT:    store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 64
+; AVX512-NEXT:    ret void
+;
+  %ld0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 0), align 64
+  %ld1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 1), align 1
+  %ld2 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 2), align 2
+  %ld3 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 3), align 1
+  %ld4 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4), align 4
+  %ld5 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 5), align 1
+  %ld6 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 6), align 2
+  %ld7 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 7), align 1
+  %cvt0 = uitofp i8 %ld0 to double
+  %cvt1 = uitofp i8 %ld1 to double
+  %cvt2 = uitofp i8 %ld2 to double
+  %cvt3 = uitofp i8 %ld3 to double
+  %cvt4 = uitofp i8 %ld4 to double
+  %cvt5 = uitofp i8 %ld5 to double
+  %cvt6 = uitofp i8 %ld6 to double
+  %cvt7 = uitofp i8 %ld7 to double
+  store double %cvt0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
+  store double %cvt1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+  store double %cvt2, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16
+  store double %cvt3, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
+  store double %cvt4, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 32
+  store double %cvt5, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8
+  store double %cvt6, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 16
+  store double %cvt7, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8
+  ret void
+}
+
+;
+; UITOFP to vXf32
+;
+
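+; i64 -> f32 conversions: no tested target vectorizes the 2-element case, and
+; only AVX512 and AVX256DQ vectorize the 4- and 8-element cases, presumably
+; because packed unsigned i64 -> f32 conversion needs the AVX512DQ forms.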
+define void @uitofp_2i64_2f32() #0 {
+; CHECK-LABEL: @uitofp_2i64_2f32(
+; CHECK-NEXT:    [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
+; CHECK-NEXT:    [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
+; CHECK-NEXT:    [[CVT0:%.*]] = uitofp i64 [[LD0]] to float
+; CHECK-NEXT:    [[CVT1:%.*]] = uitofp i64 [[LD1]] to float
+; CHECK-NEXT:    store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
+; CHECK-NEXT:    store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+; CHECK-NEXT:    ret void
+;
+  %ld0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
+  %ld1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
+  %cvt0 = uitofp i64 %ld0 to float
+  %cvt1 = uitofp i64 %ld1 to float
+  store float %cvt0, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
+  store float %cvt1, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+  ret void
+}
+
+define void @uitofp_4i64_4f32() #0 {
+; SSE-LABEL: @uitofp_4i64_4f32(
+; SSE-NEXT:    [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
+; SSE-NEXT:    [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
+; SSE-NEXT:    [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16
+; SSE-NEXT:    [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8
+; SSE-NEXT:    [[CVT0:%.*]] = uitofp i64 [[LD0]] to float
+; SSE-NEXT:    [[CVT1:%.*]] = uitofp i64 [[LD1]] to float
+; SSE-NEXT:    [[CVT2:%.*]] = uitofp i64 [[LD2]] to float
+; SSE-NEXT:    [[CVT3:%.*]] = uitofp i64 [[LD3]] to float
+; SSE-NEXT:    store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
+; SSE-NEXT:    store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+; SSE-NEXT:    store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8
+; SSE-NEXT:    store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
+; SSE-NEXT:    ret void
+;
+; AVX256NODQ-LABEL: @uitofp_4i64_4f32(
+; AVX256NODQ-NEXT:    [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
+; AVX256NODQ-NEXT:    [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
+; AVX256NODQ-NEXT:    [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16
+; AVX256NODQ-NEXT:    [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8
+; AVX256NODQ-NEXT:    [[CVT0:%.*]] = uitofp i64 [[LD0]] to float
+; AVX256NODQ-NEXT:    [[CVT1:%.*]] = uitofp i64 [[LD1]] to float
+; AVX256NODQ-NEXT:    [[CVT2:%.*]] = uitofp i64 [[LD2]] to float
+; AVX256NODQ-NEXT:    [[CVT3:%.*]] = uitofp i64 [[LD3]] to float
+; AVX256NODQ-NEXT:    store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
+; AVX256NODQ-NEXT:    store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+; AVX256NODQ-NEXT:    store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8
+; AVX256NODQ-NEXT:    store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
+; AVX256NODQ-NEXT:    ret void
+;
+; AVX512-LABEL: @uitofp_4i64_4f32(
+; AVX512-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64
+; AVX512-NEXT:    [[TMP2:%.*]] = uitofp <4 x i64> [[TMP1]] to <4 x float>
+; AVX512-NEXT:    store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
+; AVX512-NEXT:    ret void
+;
+; AVX256DQ-LABEL: @uitofp_4i64_4f32(
+; AVX256DQ-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64
+; AVX256DQ-NEXT:    [[TMP2:%.*]] = uitofp <4 x i64> [[TMP1]] to <4 x float>
+; AVX256DQ-NEXT:    store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
+; AVX256DQ-NEXT:    ret void
+;
+  %ld0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
+  %ld1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
+  %ld2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16
+  %ld3 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8
+  %cvt0 = uitofp i64 %ld0 to float
+  %cvt1 = uitofp i64 %ld1 to float
+  %cvt2 = uitofp i64 %ld2 to float
+  %cvt3 = uitofp i64 %ld3 to float
+  store float %cvt0, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
+  store float %cvt1, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+  store float %cvt2, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8
+  store float %cvt3, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
+  ret void
+}
+
+define void @uitofp_8i64_8f32() #0 {
+; SSE-LABEL: @uitofp_8i64_8f32(
+; SSE-NEXT:    [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
+; SSE-NEXT:    [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
+; SSE-NEXT:    [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16
+; SSE-NEXT:    [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8
+; SSE-NEXT:    [[LD4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4), align 32
+; SSE-NEXT:    [[LD5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 5), align 8
+; SSE-NEXT:    [[LD6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 6), align 16
+; SSE-NEXT:    [[LD7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 7), align 8
+; SSE-NEXT:    [[CVT0:%.*]] = uitofp i64 [[LD0]] to float
+; SSE-NEXT:    [[CVT1:%.*]] = uitofp i64 [[LD1]] to float
+; SSE-NEXT:    [[CVT2:%.*]] = uitofp i64 [[LD2]] to float
+; SSE-NEXT:    [[CVT3:%.*]] = uitofp i64 [[LD3]] to float
+; SSE-NEXT:    [[CVT4:%.*]] = uitofp i64 [[LD4]] to float
+; SSE-NEXT:    [[CVT5:%.*]] = uitofp i64 [[LD5]] to float
+; SSE-NEXT:    [[CVT6:%.*]] = uitofp i64 [[LD6]] to float
+; SSE-NEXT:    [[CVT7:%.*]] = uitofp i64 [[LD7]] to float
+; SSE-NEXT:    store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
+; SSE-NEXT:    store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+; SSE-NEXT:    store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8
+; SSE-NEXT:    store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
+; SSE-NEXT:    store float [[CVT4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 16
+; SSE-NEXT:    store float [[CVT5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4
+; SSE-NEXT:    store float [[CVT6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 8
+; SSE-NEXT:    store float [[CVT7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
+; SSE-NEXT:    ret void
+;
+; AVX256NODQ-LABEL: @uitofp_8i64_8f32(
+; AVX256NODQ-NEXT:    [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
+; AVX256NODQ-NEXT:    [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
+; AVX256NODQ-NEXT:    [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16
+; AVX256NODQ-NEXT:    [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8
+; AVX256NODQ-NEXT:    [[LD4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4), align 32
+; AVX256NODQ-NEXT:    [[LD5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 5), align 8
+; AVX256NODQ-NEXT:    [[LD6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 6), align 16
+; AVX256NODQ-NEXT:    [[LD7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 7), align 8
+; AVX256NODQ-NEXT:    [[CVT0:%.*]] = uitofp i64 [[LD0]] to float
+; AVX256NODQ-NEXT:    [[CVT1:%.*]] = uitofp i64 [[LD1]] to float
+; AVX256NODQ-NEXT:    [[CVT2:%.*]] = uitofp i64 [[LD2]] to float
+; AVX256NODQ-NEXT:    [[CVT3:%.*]] = uitofp i64 [[LD3]] to float
+; AVX256NODQ-NEXT:    [[CVT4:%.*]] = uitofp i64 [[LD4]] to float
+; AVX256NODQ-NEXT:    [[CVT5:%.*]] = uitofp i64 [[LD5]] to float
+; AVX256NODQ-NEXT:    [[CVT6:%.*]] = uitofp i64 [[LD6]] to float
+; AVX256NODQ-NEXT:    [[CVT7:%.*]] = uitofp i64 [[LD7]] to float
+; AVX256NODQ-NEXT:    store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
+; AVX256NODQ-NEXT:    store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+; AVX256NODQ-NEXT:    store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8
+; AVX256NODQ-NEXT:    store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
+; AVX256NODQ-NEXT:    store float [[CVT4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 16
+; AVX256NODQ-NEXT:    store float [[CVT5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4
+; AVX256NODQ-NEXT:    store float [[CVT6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 8
+; AVX256NODQ-NEXT:    store float [[CVT7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
+; AVX256NODQ-NEXT:    ret void
+;
+; AVX512-LABEL: @uitofp_8i64_8f32(
+; AVX512-NEXT:    [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @src64 to <8 x i64>*), align 64
+; AVX512-NEXT:    [[TMP2:%.*]] = uitofp <8 x i64> [[TMP1]] to <8 x float>
+; AVX512-NEXT:    store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64
+; AVX512-NEXT:    ret void
+;
+; AVX256DQ-LABEL: @uitofp_8i64_8f32(
+; AVX256DQ-NEXT:    [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @src64 to <8 x i64>*), align 64
+; AVX256DQ-NEXT:    [[TMP2:%.*]] = uitofp <8 x i64> [[TMP1]] to <8 x float>
+; AVX256DQ-NEXT:    store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64
+; AVX256DQ-NEXT:    ret void
+;
+  %ld0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
+  %ld1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
+  %ld2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16
+  %ld3 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8
+  %ld4 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4), align 32
+  %ld5 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 5), align 8
+  %ld6 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 6), align 16
+  %ld7 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 7), align 8
+  %cvt0 = uitofp i64 %ld0 to float
+  %cvt1 = uitofp i64 %ld1 to float
+  %cvt2 = uitofp i64 %ld2 to float
+  %cvt3 = uitofp i64 %ld3 to float
+  %cvt4 = uitofp i64 %ld4 to float
+  %cvt5 = uitofp i64 %ld5 to float
+  %cvt6 = uitofp i64 %ld6 to float
+  %cvt7 = uitofp i64 %ld7 to float
+  store float %cvt0, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
+  store float %cvt1, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+  store float %cvt2, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8
+  store float %cvt3, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
+  store float %cvt4, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 16
+  store float %cvt5, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4
+  store float %cvt6, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 8
+  store float %cvt7, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
+  ret void
+}
+
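+; i32 -> f32 conversions vectorize on every tested target, with the vector
+; width tracking the register width: <4 x i32> chunks on SSE, <8 x i32> on
+; AVX256, and a single <16 x i32> conversion on AVX512.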
+define void @uitofp_4i32_4f32() #0 {
+; CHECK-LABEL: @uitofp_4i32_4f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @src32 to <4 x i32>*), align 64
+; CHECK-NEXT:    [[TMP2:%.*]] = uitofp <4 x i32> [[TMP1]] to <4 x float>
+; CHECK-NEXT:    store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
+; CHECK-NEXT:    ret void
+;
+  %ld0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 0), align 64
+  %ld1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 1), align 4
+  %ld2 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 2), align 8
+  %ld3 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 3), align 4
+  %cvt0 = uitofp i32 %ld0 to float
+  %cvt1 = uitofp i32 %ld1 to float
+  %cvt2 = uitofp i32 %ld2 to float
+  %cvt3 = uitofp i32 %ld3 to float
+  store float %cvt0, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
+  store float %cvt1, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+  store float %cvt2, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8
+  store float %cvt3, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
+  ret void
+}
+
+define void @uitofp_8i32_8f32() #0 {
+; SSE-LABEL: @uitofp_8i32_8f32(
+; SSE-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @src32 to <4 x i32>*), align 64
+; SSE-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 16
+; SSE-NEXT:    [[TMP3:%.*]] = uitofp <4 x i32> [[TMP1]] to <4 x float>
+; SSE-NEXT:    [[TMP4:%.*]] = uitofp <4 x i32> [[TMP2]] to <4 x float>
+; SSE-NEXT:    store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
+; SSE-NEXT:    store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16
+; SSE-NEXT:    ret void
+;
+; AVX-LABEL: @uitofp_8i32_8f32(
+; AVX-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @src32 to <8 x i32>*), align 64
+; AVX-NEXT:    [[TMP2:%.*]] = uitofp <8 x i32> [[TMP1]] to <8 x float>
+; AVX-NEXT:    store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64
+; AVX-NEXT:    ret void
+;
+  %ld0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 0), align 64
+  %ld1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 1), align 4
+  %ld2 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 2), align 8
+  %ld3 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 3), align 4
+  %ld4 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4), align 16
+  %ld5 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 5), align 4
+  %ld6 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 6), align 8
+  %ld7 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 7), align 4
+  %cvt0 = uitofp i32 %ld0 to float
+  %cvt1 = uitofp i32 %ld1 to float
+  %cvt2 = uitofp i32 %ld2 to float
+  %cvt3 = uitofp i32 %ld3 to float
+  %cvt4 = uitofp i32 %ld4 to float
+  %cvt5 = uitofp i32 %ld5 to float
+  %cvt6 = uitofp i32 %ld6 to float
+  %cvt7 = uitofp i32 %ld7 to float
+  store float %cvt0, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
+  store float %cvt1, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+  store float %cvt2, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8
+  store float %cvt3, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
+  store float %cvt4, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 16
+  store float %cvt5, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4
+  store float %cvt6, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 8
+  store float %cvt7, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
+  ret void
+}
+
+define void @uitofp_16i32_16f32() #0 {
+; SSE-LABEL: @uitofp_16i32_16f32(
+; SSE-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @src32 to <4 x i32>*), align 64
+; SSE-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 16
+; SSE-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 8) to <4 x i32>*), align 32
+; SSE-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 12) to <4 x i32>*), align 16
+; SSE-NEXT:    [[TMP5:%.*]] = uitofp <4 x i32> [[TMP1]] to <4 x float>
+; SSE-NEXT:    [[TMP6:%.*]] = uitofp <4 x i32> [[TMP2]] to <4 x float>
+; SSE-NEXT:    [[TMP7:%.*]] = uitofp <4 x i32> [[TMP3]] to <4 x float>
+; SSE-NEXT:    [[TMP8:%.*]] = uitofp <4 x i32> [[TMP4]] to <4 x float>
+; SSE-NEXT:    store <4 x float> [[TMP5]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
+; SSE-NEXT:    store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16
+; SSE-NEXT:    store <4 x float> [[TMP7]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 32
+; SSE-NEXT:    store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 16
+; SSE-NEXT:    ret void
+;
+; AVX256-LABEL: @uitofp_16i32_16f32(
+; AVX256-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @src32 to <8 x i32>*), align 64
+; AVX256-NEXT:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 8) to <8 x i32>*), align 32
+; AVX256-NEXT:    [[TMP3:%.*]] = uitofp <8 x i32> [[TMP1]] to <8 x float>
+; AVX256-NEXT:    [[TMP4:%.*]] = uitofp <8 x i32> [[TMP2]] to <8 x float>
+; AVX256-NEXT:    store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64
+; AVX256-NEXT:    store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 32
+; AVX256-NEXT:    ret void
+;
+; AVX512-LABEL: @uitofp_16i32_16f32(
+; AVX512-NEXT:    [[TMP1:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([16 x i32]* @src32 to <16 x i32>*), align 64
+; AVX512-NEXT:    [[TMP2:%.*]] = uitofp <16 x i32> [[TMP1]] to <16 x float>
+; AVX512-NEXT:    store <16 x float> [[TMP2]], <16 x float>* bitcast ([16 x float]* @dst32 to <16 x float>*), align 64
+; AVX512-NEXT:    ret void
+;
+  %ld0  = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 0 ), align 64
+  %ld1  = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 1 ), align 4
+  %ld2  = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 2 ), align 8
+  %ld3  = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 3 ), align 4
+  %ld4  = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4 ), align 16
+  %ld5  = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 5 ), align 4
+  %ld6  = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 6 ), align 8
+  %ld7  = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 7 ), align 4
+  %ld8  = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 8 ), align 32
+  %ld9  = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 9 ), align 4
+  %ld10 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 10), align 8
+  %ld11 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 11), align 4
+  %ld12 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 12), align 16
+  %ld13 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 13), align 4
+  %ld14 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 14), align 8
+  %ld15 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 15), align 4
+  %cvt0  = uitofp i32 %ld0  to float
+  %cvt1  = uitofp i32 %ld1  to float
+  %cvt2  = uitofp i32 %ld2  to float
+  %cvt3  = uitofp i32 %ld3  to float
+  %cvt4  = uitofp i32 %ld4  to float
+  %cvt5  = uitofp i32 %ld5  to float
+  %cvt6  = uitofp i32 %ld6  to float
+  %cvt7  = uitofp i32 %ld7  to float
+  %cvt8  = uitofp i32 %ld8  to float
+  %cvt9  = uitofp i32 %ld9  to float
+  %cvt10 = uitofp i32 %ld10 to float
+  %cvt11 = uitofp i32 %ld11 to float
+  %cvt12 = uitofp i32 %ld12 to float
+  %cvt13 = uitofp i32 %ld13 to float
+  %cvt14 = uitofp i32 %ld14 to float
+  %cvt15 = uitofp i32 %ld15 to float
+  store float %cvt0 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0 ), align 64
+  store float %cvt1 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1 ), align 4
+  store float %cvt2 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2 ), align 8
+  store float %cvt3 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3 ), align 4
+  store float %cvt4 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4 ), align 16
+  store float %cvt5 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5 ), align 4
+  store float %cvt6 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6 ), align 8
+  store float %cvt7 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7 ), align 4
+  store float %cvt8 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8 ), align 32
+  store float %cvt9 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 9 ), align 4
+  store float %cvt10, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 10), align 8
+  store float %cvt11, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 11), align 4
+  store float %cvt12, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12), align 16
+  store float %cvt13, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 13), align 4
+  store float %cvt14, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 14), align 8
+  store float %cvt15, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 15), align 4
+  ret void
+}
+
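+; i16 -> f32 conversions follow the same pattern: <4 x i16> chunks on SSE,
+; <8 x i16> on AVX256, and a single <16 x i16> conversion on AVX512.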
+define void @uitofp_4i16_4f32() #0 {
+; CHECK-LABEL: @uitofp_4i16_4f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 64
+; CHECK-NEXT:    [[TMP2:%.*]] = uitofp <4 x i16> [[TMP1]] to <4 x float>
+; CHECK-NEXT:    store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
+; CHECK-NEXT:    ret void
+;
+  %ld0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64
+  %ld1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2
+  %ld2 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2), align 4
+  %ld3 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 3), align 2
+  %cvt0 = uitofp i16 %ld0 to float
+  %cvt1 = uitofp i16 %ld1 to float
+  %cvt2 = uitofp i16 %ld2 to float
+  %cvt3 = uitofp i16 %ld3 to float
+  store float %cvt0, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
+  store float %cvt1, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+  store float %cvt2, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8
+  store float %cvt3, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
+  ret void
+}
+
+define void @uitofp_8i16_8f32() #0 {
+; SSE-LABEL: @uitofp_8i16_8f32(
+; SSE-NEXT:    [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 64
+; SSE-NEXT:    [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <4 x i16>*), align 8
+; SSE-NEXT:    [[TMP3:%.*]] = uitofp <4 x i16> [[TMP1]] to <4 x float>
+; SSE-NEXT:    [[TMP4:%.*]] = uitofp <4 x i16> [[TMP2]] to <4 x float>
+; SSE-NEXT:    store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
+; SSE-NEXT:    store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16
+; SSE-NEXT:    ret void
+;
+; AVX-LABEL: @uitofp_8i16_8f32(
+; AVX-NEXT:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @src16 to <8 x i16>*), align 64
+; AVX-NEXT:    [[TMP2:%.*]] = uitofp <8 x i16> [[TMP1]] to <8 x float>
+; AVX-NEXT:    store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64
+; AVX-NEXT:    ret void
+;
+  %ld0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64
+  %ld1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2
+  %ld2 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2), align 4
+  %ld3 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 3), align 2
+  %ld4 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4), align 8
+  %ld5 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 5), align 2
+  %ld6 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 6), align 4
+  %ld7 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 7), align 2
+  %cvt0 = uitofp i16 %ld0 to float
+  %cvt1 = uitofp i16 %ld1 to float
+  %cvt2 = uitofp i16 %ld2 to float
+  %cvt3 = uitofp i16 %ld3 to float
+  %cvt4 = uitofp i16 %ld4 to float
+  %cvt5 = uitofp i16 %ld5 to float
+  %cvt6 = uitofp i16 %ld6 to float
+  %cvt7 = uitofp i16 %ld7 to float
+  store float %cvt0, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
+  store float %cvt1, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+  store float %cvt2, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8
+  store float %cvt3, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
+  store float %cvt4, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 16
+  store float %cvt5, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4
+  store float %cvt6, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 8
+  store float %cvt7, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
+  ret void
+}
+
+define void @uitofp_16i16_16f32() #0 {
+; SSE-LABEL: @uitofp_16i16_16f32(
+; SSE-NEXT:    [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 64
+; SSE-NEXT:    [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <4 x i16>*), align 8
+; SSE-NEXT:    [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 8) to <4 x i16>*), align 16
+; SSE-NEXT:    [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 12) to <4 x i16>*), align 8
+; SSE-NEXT:    [[TMP5:%.*]] = uitofp <4 x i16> [[TMP1]] to <4 x float>
+; SSE-NEXT:    [[TMP6:%.*]] = uitofp <4 x i16> [[TMP2]] to <4 x float>
+; SSE-NEXT:    [[TMP7:%.*]] = uitofp <4 x i16> [[TMP3]] to <4 x float>
+; SSE-NEXT:    [[TMP8:%.*]] = uitofp <4 x i16> [[TMP4]] to <4 x float>
+; SSE-NEXT:    store <4 x float> [[TMP5]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
+; SSE-NEXT:    store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16
+; SSE-NEXT:    store <4 x float> [[TMP7]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 32
+; SSE-NEXT:    store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 16
+; SSE-NEXT:    ret void
+;
+; AVX256-LABEL: @uitofp_16i16_16f32(
+; AVX256-NEXT:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @src16 to <8 x i16>*), align 64
+; AVX256-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 8) to <8 x i16>*), align 16
+; AVX256-NEXT:    [[TMP3:%.*]] = uitofp <8 x i16> [[TMP1]] to <8 x float>
+; AVX256-NEXT:    [[TMP4:%.*]] = uitofp <8 x i16> [[TMP2]] to <8 x float>
+; AVX256-NEXT:    store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64
+; AVX256-NEXT:    store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 32
+; AVX256-NEXT:    ret void
+;
+; AVX512-LABEL: @uitofp_16i16_16f32(
+; AVX512-NEXT:    [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @src16 to <16 x i16>*), align 64
+; AVX512-NEXT:    [[TMP2:%.*]] = uitofp <16 x i16> [[TMP1]] to <16 x float>
+; AVX512-NEXT:    store <16 x float> [[TMP2]], <16 x float>* bitcast ([16 x float]* @dst32 to <16 x float>*), align 64
+; AVX512-NEXT:    ret void
+;
+  %ld0  = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0 ), align 64
+  %ld1  = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1 ), align 2
+  %ld2  = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2 ), align 4
+  %ld3  = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 3 ), align 2
+  %ld4  = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4 ), align 8
+  %ld5  = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 5 ), align 2
+  %ld6  = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 6 ), align 4
+  %ld7  = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 7 ), align 2
+  %ld8  = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 8 ), align 16
+  %ld9  = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 9 ), align 2
+  %ld10 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 10), align 4
+  %ld11 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 11), align 2
+  %ld12 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 12), align 8
+  %ld13 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 13), align 2
+  %ld14 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 14), align 4
+  %ld15 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 15), align 2
+  %cvt0  = uitofp i16 %ld0  to float
+  %cvt1  = uitofp i16 %ld1  to float
+  %cvt2  = uitofp i16 %ld2  to float
+  %cvt3  = uitofp i16 %ld3  to float
+  %cvt4  = uitofp i16 %ld4  to float
+  %cvt5  = uitofp i16 %ld5  to float
+  %cvt6  = uitofp i16 %ld6  to float
+  %cvt7  = uitofp i16 %ld7  to float
+  %cvt8  = uitofp i16 %ld8  to float
+  %cvt9  = uitofp i16 %ld9  to float
+  %cvt10 = uitofp i16 %ld10 to float
+  %cvt11 = uitofp i16 %ld11 to float
+  %cvt12 = uitofp i16 %ld12 to float
+  %cvt13 = uitofp i16 %ld13 to float
+  %cvt14 = uitofp i16 %ld14 to float
+  %cvt15 = uitofp i16 %ld15 to float
+  store float %cvt0 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0 ), align 64
+  store float %cvt1 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1 ), align 4
+  store float %cvt2 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2 ), align 8
+  store float %cvt3 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3 ), align 4
+  store float %cvt4 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4 ), align 16
+  store float %cvt5 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5 ), align 4
+  store float %cvt6 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6 ), align 8
+  store float %cvt7 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7 ), align 4
+  store float %cvt8 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8 ), align 32
+  store float %cvt9 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 9 ), align 4
+  store float %cvt10, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 10), align 8
+  store float %cvt11, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 11), align 4
+  store float %cvt12, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12), align 16
+  store float %cvt13, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 13), align 4
+  store float %cvt14, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 14), align 8
+  store float %cvt15, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 15), align 4
+  ret void
+}
+
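+; i8 -> f32 conversions also vectorize on all targets, again chunked to match
+; the target's register width.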
+define void @uitofp_4i8_4f32() #0 {
+; CHECK-LABEL: @uitofp_4i8_4f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* bitcast ([64 x i8]* @src8 to <4 x i8>*), align 64
+; CHECK-NEXT:    [[TMP2:%.*]] = uitofp <4 x i8> [[TMP1]] to <4 x float>
+; CHECK-NEXT:    store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
+; CHECK-NEXT:    ret void
+;
+  %ld0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 0), align 64
+  %ld1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 1), align 1
+  %ld2 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 2), align 2
+  %ld3 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 3), align 1
+  %cvt0 = uitofp i8 %ld0 to float
+  %cvt1 = uitofp i8 %ld1 to float
+  %cvt2 = uitofp i8 %ld2 to float
+  %cvt3 = uitofp i8 %ld3 to float
+  store float %cvt0, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
+  store float %cvt1, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+  store float %cvt2, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8
+  store float %cvt3, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
+  ret void
+}
+
+define void @uitofp_8i8_8f32() #0 {
+; SSE-LABEL: @uitofp_8i8_8f32(
+; SSE-NEXT:    [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* bitcast ([64 x i8]* @src8 to <4 x i8>*), align 64
+; SSE-NEXT:    [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4) to <4 x i8>*), align 4
+; SSE-NEXT:    [[TMP3:%.*]] = uitofp <4 x i8> [[TMP1]] to <4 x float>
+; SSE-NEXT:    [[TMP4:%.*]] = uitofp <4 x i8> [[TMP2]] to <4 x float>
+; SSE-NEXT:    store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
+; SSE-NEXT:    store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16
+; SSE-NEXT:    ret void
+;
+; AVX-LABEL: @uitofp_8i8_8f32(
+; AVX-NEXT:    [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* bitcast ([64 x i8]* @src8 to <8 x i8>*), align 64
+; AVX-NEXT:    [[TMP2:%.*]] = uitofp <8 x i8> [[TMP1]] to <8 x float>
+; AVX-NEXT:    store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64
+; AVX-NEXT:    ret void
+;
+  %ld0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 0), align 64
+  %ld1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 1), align 1
+  %ld2 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 2), align 2
+  %ld3 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 3), align 1
+  %ld4 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4), align 4
+  %ld5 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 5), align 1
+  %ld6 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 6), align 2
+  %ld7 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 7), align 1
+  %cvt0 = uitofp i8 %ld0 to float
+  %cvt1 = uitofp i8 %ld1 to float
+  %cvt2 = uitofp i8 %ld2 to float
+  %cvt3 = uitofp i8 %ld3 to float
+  %cvt4 = uitofp i8 %ld4 to float
+  %cvt5 = uitofp i8 %ld5 to float
+  %cvt6 = uitofp i8 %ld6 to float
+  %cvt7 = uitofp i8 %ld7 to float
+  store float %cvt0, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
+  store float %cvt1, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+  store float %cvt2, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8
+  store float %cvt3, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
+  store float %cvt4, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 16
+  store float %cvt5, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4
+  store float %cvt6, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 8
+  store float %cvt7, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
+  ret void
+}
+
+define void @uitofp_16i8_16f32() #0 {
+; SSE-LABEL: @uitofp_16i8_16f32(
+; SSE-NEXT:    [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* bitcast ([64 x i8]* @src8 to <4 x i8>*), align 64
+; SSE-NEXT:    [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4) to <4 x i8>*), align 4
+; SSE-NEXT:    [[TMP3:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 8) to <4 x i8>*), align 8
+; SSE-NEXT:    [[TMP4:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 12) to <4 x i8>*), align 4
+; SSE-NEXT:    [[TMP5:%.*]] = uitofp <4 x i8> [[TMP1]] to <4 x float>
+; SSE-NEXT:    [[TMP6:%.*]] = uitofp <4 x i8> [[TMP2]] to <4 x float>
+; SSE-NEXT:    [[TMP7:%.*]] = uitofp <4 x i8> [[TMP3]] to <4 x float>
+; SSE-NEXT:    [[TMP8:%.*]] = uitofp <4 x i8> [[TMP4]] to <4 x float>
+; SSE-NEXT:    store <4 x float> [[TMP5]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
+; SSE-NEXT:    store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16
+; SSE-NEXT:    store <4 x float> [[TMP7]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 32
+; SSE-NEXT:    store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 16
+; SSE-NEXT:    ret void
+;
+; AVX256-LABEL: @uitofp_16i8_16f32(
+; AVX256-NEXT:    [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* bitcast ([64 x i8]* @src8 to <8 x i8>*), align 64
+; AVX256-NEXT:    [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 8) to <8 x i8>*), align 8
+; AVX256-NEXT:    [[TMP3:%.*]] = uitofp <8 x i8> [[TMP1]] to <8 x float>
+; AVX256-NEXT:    [[TMP4:%.*]] = uitofp <8 x i8> [[TMP2]] to <8 x float>
+; AVX256-NEXT:    store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64
+; AVX256-NEXT:    store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 32
+; AVX256-NEXT:    ret void
+;
+; AVX512-LABEL: @uitofp_16i8_16f32(
+; AVX512-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @src8 to <16 x i8>*), align 64
+; AVX512-NEXT:    [[TMP2:%.*]] = uitofp <16 x i8> [[TMP1]] to <16 x float>
+; AVX512-NEXT:    store <16 x float> [[TMP2]], <16 x float>* bitcast ([16 x float]* @dst32 to <16 x float>*), align 64
+; AVX512-NEXT:    ret void
+;
+  %ld0  = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 0 ), align 64
+  %ld1  = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 1 ), align 1
+  %ld2  = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 2 ), align 2
+  %ld3  = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 3 ), align 1
+  %ld4  = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4 ), align 4
+  %ld5  = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 5 ), align 1
+  %ld6  = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 6 ), align 2
+  %ld7  = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 7 ), align 1
+  %ld8  = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 8 ), align 8
+  %ld9  = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 9 ), align 1
+  %ld10 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 10), align 2
+  %ld11 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 11), align 1
+  %ld12 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 12), align 4
+  %ld13 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 13), align 1
+  %ld14 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 14), align 2
+  %ld15 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 15), align 1
+  %cvt0  = uitofp i8 %ld0  to float
+  %cvt1  = uitofp i8 %ld1  to float
+  %cvt2  = uitofp i8 %ld2  to float
+  %cvt3  = uitofp i8 %ld3  to float
+  %cvt4  = uitofp i8 %ld4  to float
+  %cvt5  = uitofp i8 %ld5  to float
+  %cvt6  = uitofp i8 %ld6  to float
+  %cvt7  = uitofp i8 %ld7  to float
+  %cvt8  = uitofp i8 %ld8  to float
+  %cvt9  = uitofp i8 %ld9  to float
+  %cvt10 = uitofp i8 %ld10 to float
+  %cvt11 = uitofp i8 %ld11 to float
+  %cvt12 = uitofp i8 %ld12 to float
+  %cvt13 = uitofp i8 %ld13 to float
+  %cvt14 = uitofp i8 %ld14 to float
+  %cvt15 = uitofp i8 %ld15 to float
+  store float %cvt0 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0 ), align 64
+  store float %cvt1 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1 ), align 4
+  store float %cvt2 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2 ), align 8
+  store float %cvt3 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3 ), align 4
+  store float %cvt4 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4 ), align 16
+  store float %cvt5 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5 ), align 4
+  store float %cvt6 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6 ), align 8
+  store float %cvt7 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7 ), align 4
+  store float %cvt8 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8 ), align 32
+  store float %cvt9 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 9 ), align 4
+  store float %cvt10, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 10), align 8
+  store float %cvt11, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 11), align 4
+  store float %cvt12, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12), align 16
+  store float %cvt13, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 13), align 4
+  store float %cvt14, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 14), align 8
+  store float %cvt15, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 15), align 4
+  ret void
+}
+
+attributes #0 = { nounwind }

Added: llvm/trunk/test/Transforms/SLPVectorizer/X86/undef_vect.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/undef_vect.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/undef_vect.ll (added)
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/undef_vect.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,86 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -slp-vectorizer -slp-vectorize-hor -slp-vectorize-hor-store -S < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 | FileCheck %s
+
+%"struct.std::h.0.4.8.12.16.20.24.28.248.0.1.2.3.76" = type { i32, i32 }
+
+define void @_Z2azv() local_unnamed_addr {
+; CHECK-LABEL: @_Z2azv(
+; CHECK-NEXT:  for.body.lr.ph:
+; CHECK-NEXT:    [[DOTSROA_CAST_4:%.*]] = getelementptr inbounds %"struct.std::h.0.4.8.12.16.20.24.28.248.0.1.2.3.76", %"struct.std::h.0.4.8.12.16.20.24.28.248.0.1.2.3.76"* undef, i64 4, i32 0
+; CHECK-NEXT:    [[DOTSROA_RAW_IDX_4:%.*]] = getelementptr inbounds %"struct.std::h.0.4.8.12.16.20.24.28.248.0.1.2.3.76", %"struct.std::h.0.4.8.12.16.20.24.28.248.0.1.2.3.76"* undef, i64 4, i32 1
+; CHECK-NEXT:    [[DOTSROA_CAST_5:%.*]] = getelementptr inbounds %"struct.std::h.0.4.8.12.16.20.24.28.248.0.1.2.3.76", %"struct.std::h.0.4.8.12.16.20.24.28.248.0.1.2.3.76"* undef, i64 5, i32 0
+; CHECK-NEXT:    [[DOTSROA_RAW_IDX_5:%.*]] = getelementptr inbounds %"struct.std::h.0.4.8.12.16.20.24.28.248.0.1.2.3.76", %"struct.std::h.0.4.8.12.16.20.24.28.248.0.1.2.3.76"* undef, i64 5, i32 1
+; CHECK-NEXT:    [[DOTSROA_CAST_6:%.*]] = getelementptr inbounds %"struct.std::h.0.4.8.12.16.20.24.28.248.0.1.2.3.76", %"struct.std::h.0.4.8.12.16.20.24.28.248.0.1.2.3.76"* undef, i64 6, i32 0
+; CHECK-NEXT:    [[DOTSROA_RAW_IDX_6:%.*]] = getelementptr inbounds %"struct.std::h.0.4.8.12.16.20.24.28.248.0.1.2.3.76", %"struct.std::h.0.4.8.12.16.20.24.28.248.0.1.2.3.76"* undef, i64 6, i32 1
+; CHECK-NEXT:    [[DOTSROA_CAST_7:%.*]] = getelementptr inbounds %"struct.std::h.0.4.8.12.16.20.24.28.248.0.1.2.3.76", %"struct.std::h.0.4.8.12.16.20.24.28.248.0.1.2.3.76"* undef, i64 7, i32 0
+; CHECK-NEXT:    [[DOTSROA_RAW_IDX_7:%.*]] = getelementptr inbounds %"struct.std::h.0.4.8.12.16.20.24.28.248.0.1.2.3.76", %"struct.std::h.0.4.8.12.16.20.24.28.248.0.1.2.3.76"* undef, i64 7, i32 1
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[DOTSROA_CAST_4]] to <8 x i32>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4
+; CHECK-NEXT:    [[CMP_I1_4:%.*]] = icmp slt i32 undef, undef
+; CHECK-NEXT:    [[DOTSROA_SPECULATED_4:%.*]] = select i1 [[CMP_I1_4]], i32 undef, i32 undef
+; CHECK-NEXT:    [[CMP_I1_5:%.*]] = icmp slt i32 [[DOTSROA_SPECULATED_4]], undef
+; CHECK-NEXT:    [[DOTSROA_SPECULATED_5:%.*]] = select i1 [[CMP_I1_5]], i32 undef, i32 [[DOTSROA_SPECULATED_4]]
+; CHECK-NEXT:    [[CMP_I1_6:%.*]] = icmp slt i32 [[DOTSROA_SPECULATED_5]], undef
+; CHECK-NEXT:    [[DOTSROA_SPECULATED_6:%.*]] = select i1 [[CMP_I1_6]], i32 undef, i32 [[DOTSROA_SPECULATED_5]]
+; CHECK-NEXT:    [[CMP_I1_7:%.*]] = icmp slt i32 [[DOTSROA_SPECULATED_6]], undef
+; CHECK-NEXT:    [[DOTSROA_SPECULATED_7:%.*]] = select i1 [[CMP_I1_7]], i32 undef, i32 [[DOTSROA_SPECULATED_6]]
+; CHECK-NEXT:    [[CMP_I1_8:%.*]] = icmp slt i32 undef, undef
+; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[RDX_MINMAX_CMP:%.*]] = icmp sgt <8 x i32> [[TMP1]], [[RDX_SHUF]]
+; CHECK-NEXT:    [[RDX_MINMAX_SELECT:%.*]] = select <8 x i1> [[RDX_MINMAX_CMP]], <8 x i32> [[TMP1]], <8 x i32> [[RDX_SHUF]]
+; CHECK-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[RDX_MINMAX_SELECT]], <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[RDX_MINMAX_CMP2:%.*]] = icmp sgt <8 x i32> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
+; CHECK-NEXT:    [[RDX_MINMAX_SELECT3:%.*]] = select <8 x i1> [[RDX_MINMAX_CMP2]], <8 x i32> [[RDX_MINMAX_SELECT]], <8 x i32> [[RDX_SHUF1]]
+; CHECK-NEXT:    [[RDX_SHUF4:%.*]] = shufflevector <8 x i32> [[RDX_MINMAX_SELECT3]], <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[RDX_MINMAX_CMP5:%.*]] = icmp sgt <8 x i32> [[RDX_MINMAX_SELECT3]], [[RDX_SHUF4]]
+; CHECK-NEXT:    [[RDX_MINMAX_SELECT6:%.*]] = select <8 x i1> [[RDX_MINMAX_CMP5]], <8 x i32> [[RDX_MINMAX_SELECT3]], <8 x i32> [[RDX_SHUF4]]
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <8 x i32> [[RDX_MINMAX_SELECT6]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp sgt i32 [[TMP2]], undef
+; CHECK-NEXT:    [[OP_EXTRA:%.*]] = select i1 [[TMP3]], i32 [[TMP2]], i32 undef
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp sgt i32 [[OP_EXTRA]], undef
+; CHECK-NEXT:    [[OP_EXTRA7:%.*]] = select i1 [[TMP4]], i32 [[OP_EXTRA]], i32 undef
+; CHECK-NEXT:    [[DOTSROA_SPECULATED_8:%.*]] = select i1 [[CMP_I1_8]], i32 undef, i32 undef
+; CHECK-NEXT:    [[DOTSROA_SPECULATED_9:%.*]] = select i1 undef, i32 undef, i32 [[OP_EXTRA7]]
+; CHECK-NEXT:    [[CMP_I1_10:%.*]] = icmp slt i32 [[DOTSROA_SPECULATED_9]], undef
+; CHECK-NEXT:    ret void
+;
+for.body.lr.ph:
+  %.sroa_cast.4 = getelementptr inbounds %"struct.std::h.0.4.8.12.16.20.24.28.248.0.1.2.3.76", %"struct.std::h.0.4.8.12.16.20.24.28.248.0.1.2.3.76"* undef, i64 4, i32 0
+  %retval.sroa.0.0.copyload.i5.4 = load i32, i32* %.sroa_cast.4, align 4
+  %.sroa_raw_idx.4 = getelementptr inbounds %"struct.std::h.0.4.8.12.16.20.24.28.248.0.1.2.3.76", %"struct.std::h.0.4.8.12.16.20.24.28.248.0.1.2.3.76"* undef, i64 4, i32 1
+  %retval.sroa.0.0.copyload.i7.4 = load i32, i32* %.sroa_raw_idx.4, align 4
+  %cmp.i2.4 = icmp slt i32 %retval.sroa.0.0.copyload.i5.4, %retval.sroa.0.0.copyload.i7.4
+  %0 = select i1 %cmp.i2.4, i32 %retval.sroa.0.0.copyload.i7.4, i32 %retval.sroa.0.0.copyload.i5.4
+  %cmp.i1.4 = icmp slt i32 undef, %0
+  %.sroa.speculated.4 = select i1 %cmp.i1.4, i32 %0, i32 undef
+  %.sroa_cast.5 = getelementptr inbounds %"struct.std::h.0.4.8.12.16.20.24.28.248.0.1.2.3.76", %"struct.std::h.0.4.8.12.16.20.24.28.248.0.1.2.3.76"* undef, i64 5, i32 0
+  %retval.sroa.0.0.copyload.i5.5 = load i32, i32* %.sroa_cast.5, align 4
+  %.sroa_raw_idx.5 = getelementptr inbounds %"struct.std::h.0.4.8.12.16.20.24.28.248.0.1.2.3.76", %"struct.std::h.0.4.8.12.16.20.24.28.248.0.1.2.3.76"* undef, i64 5, i32 1
+  %retval.sroa.0.0.copyload.i7.5 = load i32, i32* %.sroa_raw_idx.5, align 4
+  %cmp.i2.5 = icmp slt i32 %retval.sroa.0.0.copyload.i5.5, %retval.sroa.0.0.copyload.i7.5
+  %1 = select i1 %cmp.i2.5, i32 %retval.sroa.0.0.copyload.i7.5, i32 %retval.sroa.0.0.copyload.i5.5
+  %cmp.i1.5 = icmp slt i32 %.sroa.speculated.4, %1
+  %.sroa.speculated.5 = select i1 %cmp.i1.5, i32 %1, i32 %.sroa.speculated.4
+  %.sroa_cast.6 = getelementptr inbounds %"struct.std::h.0.4.8.12.16.20.24.28.248.0.1.2.3.76", %"struct.std::h.0.4.8.12.16.20.24.28.248.0.1.2.3.76"* undef, i64 6, i32 0
+  %retval.sroa.0.0.copyload.i5.6 = load i32, i32* %.sroa_cast.6, align 4
+  %.sroa_raw_idx.6 = getelementptr inbounds %"struct.std::h.0.4.8.12.16.20.24.28.248.0.1.2.3.76", %"struct.std::h.0.4.8.12.16.20.24.28.248.0.1.2.3.76"* undef, i64 6, i32 1
+  %retval.sroa.0.0.copyload.i7.6 = load i32, i32* %.sroa_raw_idx.6, align 4
+  %cmp.i2.6 = icmp slt i32 %retval.sroa.0.0.copyload.i5.6, %retval.sroa.0.0.copyload.i7.6
+  %2 = select i1 %cmp.i2.6, i32 %retval.sroa.0.0.copyload.i7.6, i32 %retval.sroa.0.0.copyload.i5.6
+  %cmp.i1.6 = icmp slt i32 %.sroa.speculated.5, %2
+  %.sroa.speculated.6 = select i1 %cmp.i1.6, i32 %2, i32 %.sroa.speculated.5
+  %.sroa_cast.7 = getelementptr inbounds %"struct.std::h.0.4.8.12.16.20.24.28.248.0.1.2.3.76", %"struct.std::h.0.4.8.12.16.20.24.28.248.0.1.2.3.76"* undef, i64 7, i32 0
+  %retval.sroa.0.0.copyload.i5.7 = load i32, i32* %.sroa_cast.7, align 4
+  %.sroa_raw_idx.7 = getelementptr inbounds %"struct.std::h.0.4.8.12.16.20.24.28.248.0.1.2.3.76", %"struct.std::h.0.4.8.12.16.20.24.28.248.0.1.2.3.76"* undef, i64 7, i32 1
+  %retval.sroa.0.0.copyload.i7.7 = load i32, i32* %.sroa_raw_idx.7, align 4
+  %cmp.i2.7 = icmp slt i32 %retval.sroa.0.0.copyload.i5.7, %retval.sroa.0.0.copyload.i7.7
+  %3 = select i1 %cmp.i2.7, i32 %retval.sroa.0.0.copyload.i7.7, i32 %retval.sroa.0.0.copyload.i5.7
+  %cmp.i1.7 = icmp slt i32 %.sroa.speculated.6, %3
+  %.sroa.speculated.7 = select i1 %cmp.i1.7, i32 %3, i32 %.sroa.speculated.6
+  %cmp.i1.8 = icmp slt i32 %.sroa.speculated.7, undef
+  %.sroa.speculated.8 = select i1 %cmp.i1.8, i32 undef, i32 %.sroa.speculated.7
+  %.sroa.speculated.9 = select i1 undef, i32 undef, i32 %.sroa.speculated.8
+  %cmp.i1.10 = icmp slt i32 %.sroa.speculated.9, undef
+  ret void
+}
+

Added: llvm/trunk/test/Transforms/SLPVectorizer/X86/unreachable.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/unreachable.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/unreachable.ll (added)
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/unreachable.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,69 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -basicaa -slp-vectorizer -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 | FileCheck %s
+
+; Check that the SLPVectorizer does not crash when handling
+; unreachable blocks with unschedulable instructions.
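+; (Illustrative note: such a block can contain an instruction that uses its
+; own result, e.g. the "%bad = fadd float %bad, 0.000000e+00" below, which
+; can never be scheduled but is legal in dead code.)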
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.9.0"
+
+define void @foo(i32* nocapture %x) #0 {
+; CHECK-LABEL: @foo(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[BB2:%.*]]
+; CHECK:       bb1:
+; CHECK-NEXT:    [[T3:%.*]] = getelementptr inbounds i32, i32* [[X:%.*]], i64 4
+; CHECK-NEXT:    [[T4:%.*]] = load i32, i32* [[T3]], align 4
+; CHECK-NEXT:    [[T5:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 5
+; CHECK-NEXT:    [[T6:%.*]] = load i32, i32* [[T5]], align 4
+; CHECK-NEXT:    [[BAD:%.*]] = fadd float [[BAD]], 0.000000e+00
+; CHECK-NEXT:    [[T7:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 6
+; CHECK-NEXT:    [[T8:%.*]] = load i32, i32* [[T7]], align 4
+; CHECK-NEXT:    [[T9:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 7
+; CHECK-NEXT:    [[T10:%.*]] = load i32, i32* [[T9]], align 4
+; CHECK-NEXT:    br label [[BB2]]
+; CHECK:       bb2:
+; CHECK-NEXT:    [[T1_0:%.*]] = phi i32 [ [[T4]], [[BB1:%.*]] ], [ 2, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[T2_0:%.*]] = phi i32 [ [[T6]], [[BB1]] ], [ 2, [[ENTRY]] ]
+; CHECK-NEXT:    [[T3_0:%.*]] = phi i32 [ [[T8]], [[BB1]] ], [ 2, [[ENTRY]] ]
+; CHECK-NEXT:    [[T4_0:%.*]] = phi i32 [ [[T10]], [[BB1]] ], [ 2, [[ENTRY]] ]
+; CHECK-NEXT:    store i32 [[T1_0]], i32* [[X]], align 4
+; CHECK-NEXT:    [[T12:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 1
+; CHECK-NEXT:    store i32 [[T2_0]], i32* [[T12]], align 4
+; CHECK-NEXT:    [[T13:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 2
+; CHECK-NEXT:    store i32 [[T3_0]], i32* [[T13]], align 4
+; CHECK-NEXT:    [[T14:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 3
+; CHECK-NEXT:    store i32 [[T4_0]], i32* [[T14]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %bb2
+
+bb1:                                    ; an unreachable block
+  %t3 = getelementptr inbounds i32, i32* %x, i64 4
+  %t4 = load i32, i32* %t3, align 4
+  %t5 = getelementptr inbounds i32, i32* %x, i64 5
+  %t6 = load i32, i32* %t5, align 4
+  %bad = fadd float %bad, 0.000000e+00  ; <- an instruction with a self-dependency,
+  ;    which is legal only in unreachable code
+  %t7 = getelementptr inbounds i32, i32* %x, i64 6
+  %t8 = load i32, i32* %t7, align 4
+  %t9 = getelementptr inbounds i32, i32* %x, i64 7
+  %t10 = load i32, i32* %t9, align 4
+  br label %bb2
+
+bb2:
+  %t1.0 = phi i32 [ %t4, %bb1 ], [ 2, %entry ]
+  %t2.0 = phi i32 [ %t6, %bb1 ], [ 2, %entry ]
+  %t3.0 = phi i32 [ %t8, %bb1 ], [ 2, %entry ]
+  %t4.0 = phi i32 [ %t10, %bb1 ], [ 2, %entry ]
+  store i32 %t1.0, i32* %x, align 4
+  %t12 = getelementptr inbounds i32, i32* %x, i64 1
+  store i32 %t2.0, i32* %t12, align 4
+  %t13 = getelementptr inbounds i32, i32* %x, i64 2
+  store i32 %t3.0, i32* %t13, align 4
+  %t14 = getelementptr inbounds i32, i32* %x, i64 3
+  store i32 %t4.0, i32* %t14, align 4
+  ret void
+}
+

Added: llvm/trunk/test/Transforms/SLPVectorizer/X86/value-bug.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/value-bug.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/value-bug.ll (added)
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/value-bug.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,108 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -slp-vectorizer < %s -S -mtriple="x86_64-grtev3-linux-gnu" -mcpu=corei7-avx | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+; We used to crash on this example because we were building a constant
+; expression during vectorization and the vectorizer expects instructions
+; as elements of the vectorized tree.
+; PR19621
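+; (Illustrative note: constant folding an operation whose operands are all
+; constants yields a Constant rather than an Instruction, so the tree
+; construction must not assume every tree element is an Instruction.)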
+
+define void @test() {
+; CHECK-LABEL: @test(
+; CHECK-NEXT:  bb279:
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x float> undef, float undef, i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x float> [[TMP0]], float undef, i32 1
+; CHECK-NEXT:    br label [[BB283:%.*]]
+; CHECK:       bb283:
+; CHECK-NEXT:    [[TMP2:%.*]] = phi <2 x float> [ undef, [[BB279:%.*]] ], [ [[TMP13:%.*]], [[EXIT:%.*]] ]
+; CHECK-NEXT:    [[TMP3:%.*]] = phi <2 x float> [ undef, [[BB279]] ], [ [[TMP1]], [[EXIT]] ]
+; CHECK-NEXT:    br label [[BB284:%.*]]
+; CHECK:       bb284:
+; CHECK-NEXT:    [[TMP4:%.*]] = fpext <2 x float> [[TMP2]] to <2 x double>
+; CHECK-NEXT:    [[TMP5:%.*]] = fsub <2 x double> [[TMP4]], undef
+; CHECK-NEXT:    [[TMP6:%.*]] = fsub <2 x double> [[TMP5]], undef
+; CHECK-NEXT:    br label [[BB21_I:%.*]]
+; CHECK:       bb21.i:
+; CHECK-NEXT:    br i1 undef, label [[BB22_I:%.*]], label [[EXIT]]
+; CHECK:       bb22.i:
+; CHECK-NEXT:    [[TMP7:%.*]] = fadd <2 x double> undef, [[TMP6]]
+; CHECK-NEXT:    br label [[BB32_I:%.*]]
+; CHECK:       bb32.i:
+; CHECK-NEXT:    [[TMP8:%.*]] = phi <2 x double> [ [[TMP7]], [[BB22_I]] ], [ zeroinitializer, [[BB32_I]] ]
+; CHECK-NEXT:    br i1 undef, label [[BB32_I]], label [[BB21_I]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[TMP9:%.*]] = fpext <2 x float> [[TMP3]] to <2 x double>
+; CHECK-NEXT:    [[TMP10:%.*]] = fmul <2 x double> [[TMP9]], <double undef, double 0.000000e+00>
+; CHECK-NEXT:    [[TMP11:%.*]] = fadd <2 x double> undef, [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = fadd <2 x double> [[TMP11]], undef
+; CHECK-NEXT:    [[TMP13]] = fptrunc <2 x double> [[TMP12]] to <2 x float>
+; CHECK-NEXT:    br label [[BB283]]
+;
+bb279:
+  br label %bb283
+
+bb283:
+  %Av.sroa.8.0 = phi float [ undef, %bb279 ], [ %tmp315, %exit ]
+  %Av.sroa.5.0 = phi float [ undef, %bb279 ], [ %tmp319, %exit ]
+  %Av.sroa.3.0 = phi float [ undef, %bb279 ], [ %tmp307, %exit ]
+  %Av.sroa.0.0 = phi float [ undef, %bb279 ], [ %tmp317, %exit ]
+  br label %bb284
+
+bb284:
+  %tmp7.i = fpext float %Av.sroa.3.0 to double
+  %tmp8.i = fsub double %tmp7.i, undef
+  %tmp9.i = fsub double %tmp8.i, undef
+  %tmp17.i = fpext float %Av.sroa.8.0 to double
+  %tmp19.i = fsub double %tmp17.i, undef
+  %tmp20.i = fsub double %tmp19.i, undef
+  br label %bb21.i
+
+bb21.i:
+  br i1 undef, label %bb22.i, label %exit
+
+bb22.i:
+  %tmp24.i = fadd double undef, %tmp9.i
+  %tmp26.i = fadd double undef, %tmp20.i
+  br label %bb32.i
+
+bb32.i:
+  %xs.0.i = phi double [ %tmp24.i, %bb22.i ], [ 0.000000e+00, %bb32.i ]
+  %ys.0.i = phi double [ %tmp26.i, %bb22.i ], [ 0.000000e+00, %bb32.i ]
+  br i1 undef, label %bb32.i, label %bb21.i
+
+exit:
+  %tmp303 = fpext float %Av.sroa.0.0 to double
+  %tmp304 = fmul double %tmp303, undef
+  %tmp305 = fadd double undef, %tmp304
+  %tmp306 = fadd double %tmp305, undef
+  %tmp307 = fptrunc double %tmp306 to float
+  %tmp311 = fpext float %Av.sroa.5.0 to double
+  %tmp312 = fmul double %tmp311, 0.000000e+00
+  %tmp313 = fadd double undef, %tmp312
+  %tmp314 = fadd double %tmp313, undef
+  %tmp315 = fptrunc double %tmp314 to float
+  %tmp317 = fptrunc double undef to float
+  %tmp319 = fptrunc double undef to float
+  br label %bb283
+}
+
+; Make sure that we properly handle constant-folded vectorized trees. The
+; vectorizer starts at the pair (%t2, %t3) and will constant fold the tree.
+; The code that handles insertelement instructions must handle this.
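+; (Worked example: %t0 = 1.0 + 0.0 = 1.0 and %t1 = 1.0 + 1.0 = 2.0, so
+; %t2 folds to 1.0 (inserted at lane 1) and %t3 folds to 2.0 (inserted at
+; lane 0), which is exactly what the CHECK lines below expect.)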
+define <4 x double> @constant_folding() {
+; CHECK-LABEL: @constant_folding(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[I1:%.*]] = insertelement <4 x double> undef, double 1.000000e+00, i32 1
+; CHECK-NEXT:    [[I2:%.*]] = insertelement <4 x double> [[I1]], double 2.000000e+00, i32 0
+; CHECK-NEXT:    ret <4 x double> [[I2]]
+;
+entry:
+  %t0 = fadd double 1.000000e+00 , 0.000000e+00
+  %t1 = fadd double 1.000000e+00 , 1.000000e+00
+  %t2 = fmul double %t0, 1.000000e+00
+  %i1 = insertelement <4 x double> undef, double %t2, i32 1
+  %t3 = fmul double %t1, 1.000000e+00
+  %i2 = insertelement <4 x double> %i1, double %t3, i32 0
+  ret <4 x double> %i2
+}

Added: llvm/trunk/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll (added)
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,984 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 < %s | FileCheck %s
+
+define void @add0(i32* noalias %dst, i32* noalias %src) {
+; CHECK-LABEL: @add0(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 1
+; CHECK-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1
+; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2
+; CHECK-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2
+; CHECK-NEXT:    [[INCDEC_PTR5:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3
+; CHECK-NEXT:    [[INCDEC_PTR7:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[SRC]] to <4 x i32>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = add nsw <4 x i32> [[TMP1]], <i32 1, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[DST]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[TMP2]], <4 x i32>* [[TMP3]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %incdec.ptr = getelementptr inbounds i32, i32* %src, i64 1
+  %0 = load i32, i32* %src, align 4
+  %add = add nsw i32 %0, 1
+  %incdec.ptr1 = getelementptr inbounds i32, i32* %dst, i64 1
+  store i32 %add, i32* %dst, align 4
+  %incdec.ptr2 = getelementptr inbounds i32, i32* %src, i64 2
+  %1 = load i32, i32* %incdec.ptr, align 4
+  %add3 = add nsw i32 %1, 1
+  %incdec.ptr4 = getelementptr inbounds i32, i32* %dst, i64 2
+  store i32 %add3, i32* %incdec.ptr1, align 4
+  %incdec.ptr5 = getelementptr inbounds i32, i32* %src, i64 3
+  %2 = load i32, i32* %incdec.ptr2, align 4
+  %add6 = add nsw i32 %2, 2
+  %incdec.ptr7 = getelementptr inbounds i32, i32* %dst, i64 3
+  store i32 %add6, i32* %incdec.ptr4, align 4
+  %3 = load i32, i32* %incdec.ptr5, align 4
+  %add9 = add nsw i32 %3, 3
+  store i32 %add9, i32* %incdec.ptr7, align 4
+  ret void
+}
+
+define void @add1(i32* noalias %dst, i32* noalias %src) {
+; CHECK-LABEL: @add1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 1
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[SRC]], align 4
+; CHECK-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1
+; CHECK-NEXT:    store i32 [[TMP0]], i32* [[DST]], align 4
+; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4
+; CHECK-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP1]], 1
+; CHECK-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2
+; CHECK-NEXT:    store i32 [[ADD3]], i32* [[INCDEC_PTR1]], align 4
+; CHECK-NEXT:    [[INCDEC_PTR5:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4
+; CHECK-NEXT:    [[ADD6:%.*]] = add nsw i32 [[TMP2]], 2
+; CHECK-NEXT:    [[INCDEC_PTR7:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3
+; CHECK-NEXT:    store i32 [[ADD6]], i32* [[INCDEC_PTR4]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR5]], align 4
+; CHECK-NEXT:    [[ADD9:%.*]] = add nsw i32 [[TMP3]], 3
+; CHECK-NEXT:    store i32 [[ADD9]], i32* [[INCDEC_PTR7]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %incdec.ptr = getelementptr inbounds i32, i32* %src, i64 1
+  %0 = load i32, i32* %src, align 4
+  %incdec.ptr1 = getelementptr inbounds i32, i32* %dst, i64 1
+  store i32 %0, i32* %dst, align 4
+  %incdec.ptr2 = getelementptr inbounds i32, i32* %src, i64 2
+  %1 = load i32, i32* %incdec.ptr, align 4
+  %add3 = add nsw i32 %1, 1
+  %incdec.ptr4 = getelementptr inbounds i32, i32* %dst, i64 2
+  store i32 %add3, i32* %incdec.ptr1, align 4
+  %incdec.ptr5 = getelementptr inbounds i32, i32* %src, i64 3
+  %2 = load i32, i32* %incdec.ptr2, align 4
+  %add6 = add nsw i32 %2, 2
+  %incdec.ptr7 = getelementptr inbounds i32, i32* %dst, i64 3
+  store i32 %add6, i32* %incdec.ptr4, align 4
+  %3 = load i32, i32* %incdec.ptr5, align 4
+  %add9 = add nsw i32 %3, 3
+  store i32 %add9, i32* %incdec.ptr7, align 4
+  ret void
+}
+
+define void @sub0(i32* noalias %dst, i32* noalias %src) {
+; CHECK-LABEL: @sub0(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 1
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[SRC]], align 4
+; CHECK-NEXT:    [[SUB:%.*]] = add nsw i32 [[TMP0]], -1
+; CHECK-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1
+; CHECK-NEXT:    store i32 [[SUB]], i32* [[DST]], align 4
+; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4
+; CHECK-NEXT:    [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2
+; CHECK-NEXT:    store i32 [[TMP1]], i32* [[INCDEC_PTR1]], align 4
+; CHECK-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4
+; CHECK-NEXT:    [[SUB5:%.*]] = add nsw i32 [[TMP2]], -2
+; CHECK-NEXT:    [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3
+; CHECK-NEXT:    store i32 [[SUB5]], i32* [[INCDEC_PTR3]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR4]], align 4
+; CHECK-NEXT:    [[SUB8:%.*]] = add nsw i32 [[TMP3]], -3
+; CHECK-NEXT:    store i32 [[SUB8]], i32* [[INCDEC_PTR6]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %incdec.ptr = getelementptr inbounds i32, i32* %src, i64 1
+  %0 = load i32, i32* %src, align 4
+  %sub = add nsw i32 %0, -1
+  %incdec.ptr1 = getelementptr inbounds i32, i32* %dst, i64 1
+  store i32 %sub, i32* %dst, align 4
+  %incdec.ptr2 = getelementptr inbounds i32, i32* %src, i64 2
+  %1 = load i32, i32* %incdec.ptr, align 4
+  %incdec.ptr3 = getelementptr inbounds i32, i32* %dst, i64 2
+  store i32 %1, i32* %incdec.ptr1, align 4
+  %incdec.ptr4 = getelementptr inbounds i32, i32* %src, i64 3
+  %2 = load i32, i32* %incdec.ptr2, align 4
+  %sub5 = add nsw i32 %2, -2
+  %incdec.ptr6 = getelementptr inbounds i32, i32* %dst, i64 3
+  store i32 %sub5, i32* %incdec.ptr3, align 4
+  %3 = load i32, i32* %incdec.ptr4, align 4
+  %sub8 = add nsw i32 %3, -3
+  store i32 %sub8, i32* %incdec.ptr6, align 4
+  ret void
+}
+
+define void @sub1(i32* noalias %dst, i32* noalias %src) {
+; CHECK-LABEL: @sub1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 1
+; CHECK-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1
+; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2
+; CHECK-NEXT:    [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2
+; CHECK-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3
+; CHECK-NEXT:    [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[SRC]] to <4 x i32>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = add nsw <4 x i32> [[TMP1]], <i32 4, i32 -1, i32 -2, i32 -3>
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[DST]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[TMP2]], <4 x i32>* [[TMP3]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %incdec.ptr = getelementptr inbounds i32, i32* %src, i64 1
+  %0 = load i32, i32* %src, align 4
+  %add = add nsw i32 %0, 4
+  %incdec.ptr1 = getelementptr inbounds i32, i32* %dst, i64 1
+  store i32 %add, i32* %dst, align 4
+  %incdec.ptr2 = getelementptr inbounds i32, i32* %src, i64 2
+  %1 = load i32, i32* %incdec.ptr, align 4
+  %sub = add nsw i32 %1, -1
+  %incdec.ptr3 = getelementptr inbounds i32, i32* %dst, i64 2
+  store i32 %sub, i32* %incdec.ptr1, align 4
+  %incdec.ptr4 = getelementptr inbounds i32, i32* %src, i64 3
+  %2 = load i32, i32* %incdec.ptr2, align 4
+  %sub5 = add nsw i32 %2, -2
+  %incdec.ptr6 = getelementptr inbounds i32, i32* %dst, i64 3
+  store i32 %sub5, i32* %incdec.ptr3, align 4
+  %3 = load i32, i32* %incdec.ptr4, align 4
+  %sub8 = add nsw i32 %3, -3
+  store i32 %sub8, i32* %incdec.ptr6, align 4
+  ret void
+}
+
+define void @sub2(i32* noalias %dst, i32* noalias %src) {
+; CHECK-LABEL: @sub2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 1
+; CHECK-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1
+; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2
+; CHECK-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2
+; CHECK-NEXT:    [[INCDEC_PTR5:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3
+; CHECK-NEXT:    [[INCDEC_PTR7:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[SRC]] to <4 x i32>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = add nsw <4 x i32> [[TMP1]], <i32 -1, i32 -1, i32 -2, i32 -3>
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[DST]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[TMP2]], <4 x i32>* [[TMP3]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %incdec.ptr = getelementptr inbounds i32, i32* %src, i64 1
+  %0 = load i32, i32* %src, align 4
+  %sub = add nsw i32 %0, -1
+  %incdec.ptr1 = getelementptr inbounds i32, i32* %dst, i64 1
+  store i32 %sub, i32* %dst, align 4
+  %incdec.ptr2 = getelementptr inbounds i32, i32* %src, i64 2
+  %1 = load i32, i32* %incdec.ptr, align 4
+  %sub3 = add nsw i32 %1, -1
+  %incdec.ptr4 = getelementptr inbounds i32, i32* %dst, i64 2
+  store i32 %sub3, i32* %incdec.ptr1, align 4
+  %incdec.ptr5 = getelementptr inbounds i32, i32* %src, i64 3
+  %2 = load i32, i32* %incdec.ptr2, align 4
+  %sub6 = add nsw i32 %2, -2
+  %incdec.ptr7 = getelementptr inbounds i32, i32* %dst, i64 3
+  store i32 %sub6, i32* %incdec.ptr4, align 4
+  %3 = load i32, i32* %incdec.ptr5, align 4
+  %sub9 = add nsw i32 %3, -3
+  store i32 %sub9, i32* %incdec.ptr7, align 4
+  ret void
+}
+
+define void @addsub0(i32* noalias %dst, i32* noalias %src) {
+; CHECK-LABEL: @addsub0(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 1
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[SRC]], align 4
+; CHECK-NEXT:    [[SUB:%.*]] = add nsw i32 [[TMP0]], -1
+; CHECK-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1
+; CHECK-NEXT:    store i32 [[SUB]], i32* [[DST]], align 4
+; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4
+; CHECK-NEXT:    [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2
+; CHECK-NEXT:    store i32 [[TMP1]], i32* [[INCDEC_PTR1]], align 4
+; CHECK-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4
+; CHECK-NEXT:    [[SUB5:%.*]] = add nsw i32 [[TMP2]], -2
+; CHECK-NEXT:    [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3
+; CHECK-NEXT:    store i32 [[SUB5]], i32* [[INCDEC_PTR3]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR4]], align 4
+; CHECK-NEXT:    [[SUB8:%.*]] = sub nsw i32 [[TMP3]], -3
+; CHECK-NEXT:    store i32 [[SUB8]], i32* [[INCDEC_PTR6]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %incdec.ptr = getelementptr inbounds i32, i32* %src, i64 1
+  %0 = load i32, i32* %src, align 4
+  %sub = add nsw i32 %0, -1
+  %incdec.ptr1 = getelementptr inbounds i32, i32* %dst, i64 1
+  store i32 %sub, i32* %dst, align 4
+  %incdec.ptr2 = getelementptr inbounds i32, i32* %src, i64 2
+  %1 = load i32, i32* %incdec.ptr, align 4
+  %incdec.ptr3 = getelementptr inbounds i32, i32* %dst, i64 2
+  store i32 %1, i32* %incdec.ptr1, align 4
+  %incdec.ptr4 = getelementptr inbounds i32, i32* %src, i64 3
+  %2 = load i32, i32* %incdec.ptr2, align 4
+  %sub5 = add nsw i32 %2, -2
+  %incdec.ptr6 = getelementptr inbounds i32, i32* %dst, i64 3
+  store i32 %sub5, i32* %incdec.ptr3, align 4
+  %3 = load i32, i32* %incdec.ptr4, align 4
+  %sub8 = sub nsw i32 %3, -3
+  store i32 %sub8, i32* %incdec.ptr6, align 4
+  ret void
+}
+
+define void @addsub1(i32* noalias %dst, i32* noalias %src) {
+; CHECK-LABEL: @addsub1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 1
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[SRC]], align 4
+; CHECK-NEXT:    [[SUB:%.*]] = add nsw i32 [[TMP0]], -1
+; CHECK-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1
+; CHECK-NEXT:    store i32 [[SUB]], i32* [[DST]], align 4
+; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4
+; CHECK-NEXT:    [[SUB1:%.*]] = sub nsw i32 [[TMP1]], -1
+; CHECK-NEXT:    [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2
+; CHECK-NEXT:    store i32 [[SUB1]], i32* [[INCDEC_PTR1]], align 4
+; CHECK-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4
+; CHECK-NEXT:    [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3
+; CHECK-NEXT:    store i32 [[TMP2]], i32* [[INCDEC_PTR3]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR4]], align 4
+; CHECK-NEXT:    [[SUB8:%.*]] = sub nsw i32 [[TMP3]], -3
+; CHECK-NEXT:    store i32 [[SUB8]], i32* [[INCDEC_PTR6]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %incdec.ptr = getelementptr inbounds i32, i32* %src, i64 1
+  %0 = load i32, i32* %src, align 4
+  %sub = add nsw i32 %0, -1
+  %incdec.ptr1 = getelementptr inbounds i32, i32* %dst, i64 1
+  store i32 %sub, i32* %dst, align 4
+  %incdec.ptr2 = getelementptr inbounds i32, i32* %src, i64 2
+  %1 = load i32, i32* %incdec.ptr, align 4
+  %sub1 = sub nsw i32 %1, -1
+  %incdec.ptr3 = getelementptr inbounds i32, i32* %dst, i64 2
+  store i32 %sub1, i32* %incdec.ptr1, align 4
+  %incdec.ptr4 = getelementptr inbounds i32, i32* %src, i64 3
+  %2 = load i32, i32* %incdec.ptr2, align 4
+  %incdec.ptr6 = getelementptr inbounds i32, i32* %dst, i64 3
+  store i32 %2, i32* %incdec.ptr3, align 4
+  %3 = load i32, i32* %incdec.ptr4, align 4
+  %sub8 = sub nsw i32 %3, -3
+  store i32 %sub8, i32* %incdec.ptr6, align 4
+  ret void
+}
+
+define void @mul(i32* noalias %dst, i32* noalias %src) {
+; CHECK-LABEL: @mul(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 1
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[SRC]], align 4
+; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP0]], 257
+; CHECK-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1
+; CHECK-NEXT:    store i32 [[MUL]], i32* [[DST]], align 4
+; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4
+; CHECK-NEXT:    [[MUL3:%.*]] = mul nsw i32 [[TMP1]], -3
+; CHECK-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2
+; CHECK-NEXT:    store i32 [[MUL3]], i32* [[INCDEC_PTR1]], align 4
+; CHECK-NEXT:    [[INCDEC_PTR5:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4
+; CHECK-NEXT:    [[INCDEC_PTR7:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3
+; CHECK-NEXT:    store i32 [[TMP2]], i32* [[INCDEC_PTR4]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR5]], align 4
+; CHECK-NEXT:    [[MUL9:%.*]] = mul nsw i32 [[TMP3]], -9
+; CHECK-NEXT:    store i32 [[MUL9]], i32* [[INCDEC_PTR7]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %incdec.ptr = getelementptr inbounds i32, i32* %src, i64 1
+  %0 = load i32, i32* %src, align 4
+  %mul = mul nsw i32 %0, 257
+  %incdec.ptr1 = getelementptr inbounds i32, i32* %dst, i64 1
+  store i32 %mul, i32* %dst, align 4
+  %incdec.ptr2 = getelementptr inbounds i32, i32* %src, i64 2
+  %1 = load i32, i32* %incdec.ptr, align 4
+  %mul3 = mul nsw i32 %1, -3
+  %incdec.ptr4 = getelementptr inbounds i32, i32* %dst, i64 2
+  store i32 %mul3, i32* %incdec.ptr1, align 4
+  %incdec.ptr5 = getelementptr inbounds i32, i32* %src, i64 3
+  %2 = load i32, i32* %incdec.ptr2, align 4
+  %incdec.ptr7 = getelementptr inbounds i32, i32* %dst, i64 3
+  store i32 %2, i32* %incdec.ptr4, align 4
+  %3 = load i32, i32* %incdec.ptr5, align 4
+  %mul9 = mul nsw i32 %3, -9
+  store i32 %mul9, i32* %incdec.ptr7, align 4
+  ret void
+}
+
+define void @shl0(i32* noalias %dst, i32* noalias %src) {
+; CHECK-LABEL: @shl0(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 1
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[SRC]], align 4
+; CHECK-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1
+; CHECK-NEXT:    store i32 [[TMP0]], i32* [[DST]], align 4
+; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4
+; CHECK-NEXT:    [[SHL:%.*]] = shl i32 [[TMP1]], 1
+; CHECK-NEXT:    [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2
+; CHECK-NEXT:    store i32 [[SHL]], i32* [[INCDEC_PTR1]], align 4
+; CHECK-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4
+; CHECK-NEXT:    [[SHL5:%.*]] = shl i32 [[TMP2]], 2
+; CHECK-NEXT:    [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3
+; CHECK-NEXT:    store i32 [[SHL5]], i32* [[INCDEC_PTR3]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR4]], align 4
+; CHECK-NEXT:    [[SHL8:%.*]] = shl i32 [[TMP3]], 3
+; CHECK-NEXT:    store i32 [[SHL8]], i32* [[INCDEC_PTR6]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %incdec.ptr = getelementptr inbounds i32, i32* %src, i64 1
+  %0 = load i32, i32* %src, align 4
+  %incdec.ptr1 = getelementptr inbounds i32, i32* %dst, i64 1
+  store i32 %0, i32* %dst, align 4
+  %incdec.ptr2 = getelementptr inbounds i32, i32* %src, i64 2
+  %1 = load i32, i32* %incdec.ptr, align 4
+  %shl = shl i32 %1, 1
+  %incdec.ptr3 = getelementptr inbounds i32, i32* %dst, i64 2
+  store i32 %shl, i32* %incdec.ptr1, align 4
+  %incdec.ptr4 = getelementptr inbounds i32, i32* %src, i64 3
+  %2 = load i32, i32* %incdec.ptr2, align 4
+  %shl5 = shl i32 %2, 2
+  %incdec.ptr6 = getelementptr inbounds i32, i32* %dst, i64 3
+  store i32 %shl5, i32* %incdec.ptr3, align 4
+  %3 = load i32, i32* %incdec.ptr4, align 4
+  %shl8 = shl i32 %3, 3
+  store i32 %shl8, i32* %incdec.ptr6, align 4
+  ret void
+}
+
+define void @shl1(i32* noalias %dst, i32* noalias %src) {
+; CHECK-LABEL: @shl1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 1
+; CHECK-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1
+; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2
+; CHECK-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2
+; CHECK-NEXT:    [[INCDEC_PTR5:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3
+; CHECK-NEXT:    [[INCDEC_PTR7:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[SRC]] to <4 x i32>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = shl <4 x i32> [[TMP1]], <i32 7, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[DST]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[TMP2]], <4 x i32>* [[TMP3]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %incdec.ptr = getelementptr inbounds i32, i32* %src, i64 1
+  %0 = load i32, i32* %src, align 4
+  %shl = shl i32 %0, 7
+  %incdec.ptr1 = getelementptr inbounds i32, i32* %dst, i64 1
+  store i32 %shl, i32* %dst, align 4
+  %incdec.ptr2 = getelementptr inbounds i32, i32* %src, i64 2
+  %1 = load i32, i32* %incdec.ptr, align 4
+  %shl3 = shl i32 %1, 1
+  %incdec.ptr4 = getelementptr inbounds i32, i32* %dst, i64 2
+  store i32 %shl3, i32* %incdec.ptr1, align 4
+  %incdec.ptr5 = getelementptr inbounds i32, i32* %src, i64 3
+  %2 = load i32, i32* %incdec.ptr2, align 4
+  %shl6 = shl i32 %2, 2
+  %incdec.ptr7 = getelementptr inbounds i32, i32* %dst, i64 3
+  store i32 %shl6, i32* %incdec.ptr4, align 4
+  %3 = load i32, i32* %incdec.ptr5, align 4
+  %shl9 = shl i32 %3, 3
+  store i32 %shl9, i32* %incdec.ptr7, align 4
+  ret void
+}
+
+define void @add0f(float* noalias %dst, float* noalias %src) {
+; CHECK-LABEL: @add0f(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1
+; CHECK-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1
+; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2
+; CHECK-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2
+; CHECK-NEXT:    [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3
+; CHECK-NEXT:    [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast float* [[SRC]] to <4 x float>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = fadd fast <4 x float> [[TMP1]], <float 1.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float* [[DST]] to <4 x float>*
+; CHECK-NEXT:    store <4 x float> [[TMP2]], <4 x float>* [[TMP3]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %incdec.ptr = getelementptr inbounds float, float* %src, i64 1
+  %0 = load float, float* %src, align 4
+  %add = fadd fast float %0, 1.000000e+00
+  %incdec.ptr1 = getelementptr inbounds float, float* %dst, i64 1
+  store float %add, float* %dst, align 4
+  %incdec.ptr2 = getelementptr inbounds float, float* %src, i64 2
+  %1 = load float, float* %incdec.ptr, align 4
+  %add3 = fadd fast float %1, 1.000000e+00
+  %incdec.ptr4 = getelementptr inbounds float, float* %dst, i64 2
+  store float %add3, float* %incdec.ptr1, align 4
+  %incdec.ptr5 = getelementptr inbounds float, float* %src, i64 3
+  %2 = load float, float* %incdec.ptr2, align 4
+  %add6 = fadd fast float %2, 2.000000e+00
+  %incdec.ptr7 = getelementptr inbounds float, float* %dst, i64 3
+  store float %add6, float* %incdec.ptr4, align 4
+  %3 = load float, float* %incdec.ptr5, align 4
+  %add9 = fadd fast float %3, 3.000000e+00
+  store float %add9, float* %incdec.ptr7, align 4
+  ret void
+}
+
+define void @add1f(float* noalias %dst, float* noalias %src) {
+; CHECK-LABEL: @add1f(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1
+; CHECK-NEXT:    [[TMP0:%.*]] = load float, float* [[SRC]], align 4
+; CHECK-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1
+; CHECK-NEXT:    store float [[TMP0]], float* [[DST]], align 4
+; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2
+; CHECK-NEXT:    [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4
+; CHECK-NEXT:    [[ADD3:%.*]] = fadd fast float [[TMP1]], 1.000000e+00
+; CHECK-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2
+; CHECK-NEXT:    store float [[ADD3]], float* [[INCDEC_PTR1]], align 4
+; CHECK-NEXT:    [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3
+; CHECK-NEXT:    [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4
+; CHECK-NEXT:    [[ADD6:%.*]] = fadd fast float [[TMP2]], 2.000000e+00
+; CHECK-NEXT:    [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3
+; CHECK-NEXT:    store float [[ADD6]], float* [[INCDEC_PTR4]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load float, float* [[INCDEC_PTR5]], align 4
+; CHECK-NEXT:    [[ADD9:%.*]] = fadd fast float [[TMP3]], 3.000000e+00
+; CHECK-NEXT:    store float [[ADD9]], float* [[INCDEC_PTR7]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %incdec.ptr = getelementptr inbounds float, float* %src, i64 1
+  %0 = load float, float* %src, align 4
+  %incdec.ptr1 = getelementptr inbounds float, float* %dst, i64 1
+  store float %0, float* %dst, align 4
+  %incdec.ptr2 = getelementptr inbounds float, float* %src, i64 2
+  %1 = load float, float* %incdec.ptr, align 4
+  %add3 = fadd fast float %1, 1.000000e+00
+  %incdec.ptr4 = getelementptr inbounds float, float* %dst, i64 2
+  store float %add3, float* %incdec.ptr1, align 4
+  %incdec.ptr5 = getelementptr inbounds float, float* %src, i64 3
+  %2 = load float, float* %incdec.ptr2, align 4
+  %add6 = fadd fast float %2, 2.000000e+00
+  %incdec.ptr7 = getelementptr inbounds float, float* %dst, i64 3
+  store float %add6, float* %incdec.ptr4, align 4
+  %3 = load float, float* %incdec.ptr5, align 4
+  %add9 = fadd fast float %3, 3.000000e+00
+  store float %add9, float* %incdec.ptr7, align 4
+  ret void
+}
+
+define void @sub0f(float* noalias %dst, float* noalias %src) {
+; CHECK-LABEL: @sub0f(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1
+; CHECK-NEXT:    [[TMP0:%.*]] = load float, float* [[SRC]], align 4
+; CHECK-NEXT:    [[ADD:%.*]] = fadd fast float [[TMP0]], -1.000000e+00
+; CHECK-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1
+; CHECK-NEXT:    store float [[ADD]], float* [[DST]], align 4
+; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2
+; CHECK-NEXT:    [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4
+; CHECK-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2
+; CHECK-NEXT:    store float [[TMP1]], float* [[INCDEC_PTR1]], align 4
+; CHECK-NEXT:    [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3
+; CHECK-NEXT:    [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4
+; CHECK-NEXT:    [[ADD6:%.*]] = fadd fast float [[TMP2]], -2.000000e+00
+; CHECK-NEXT:    [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3
+; CHECK-NEXT:    store float [[ADD6]], float* [[INCDEC_PTR4]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load float, float* [[INCDEC_PTR5]], align 4
+; CHECK-NEXT:    [[ADD9:%.*]] = fadd fast float [[TMP3]], -3.000000e+00
+; CHECK-NEXT:    store float [[ADD9]], float* [[INCDEC_PTR7]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %incdec.ptr = getelementptr inbounds float, float* %src, i64 1
+  %0 = load float, float* %src, align 4
+  %add = fadd fast float %0, -1.000000e+00
+  %incdec.ptr1 = getelementptr inbounds float, float* %dst, i64 1
+  store float %add, float* %dst, align 4
+  %incdec.ptr2 = getelementptr inbounds float, float* %src, i64 2
+  %1 = load float, float* %incdec.ptr, align 4
+  %incdec.ptr4 = getelementptr inbounds float, float* %dst, i64 2
+  store float %1, float* %incdec.ptr1, align 4
+  %incdec.ptr5 = getelementptr inbounds float, float* %src, i64 3
+  %2 = load float, float* %incdec.ptr2, align 4
+  %add6 = fadd fast float %2, -2.000000e+00
+  %incdec.ptr7 = getelementptr inbounds float, float* %dst, i64 3
+  store float %add6, float* %incdec.ptr4, align 4
+  %3 = load float, float* %incdec.ptr5, align 4
+  %add9 = fadd fast float %3, -3.000000e+00
+  store float %add9, float* %incdec.ptr7, align 4
+  ret void
+}
+
+define void @sub1f(float* noalias %dst, float* noalias %src) {
+; CHECK-LABEL: @sub1f(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1
+; CHECK-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1
+; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2
+; CHECK-NEXT:    [[INCDEC_PTR3:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2
+; CHECK-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3
+; CHECK-NEXT:    [[INCDEC_PTR6:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast float* [[SRC]] to <4 x float>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = fadd fast <4 x float> [[TMP1]], <float 4.000000e+00, float -1.000000e+00, float -2.000000e+00, float -3.000000e+00>
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float* [[DST]] to <4 x float>*
+; CHECK-NEXT:    store <4 x float> [[TMP2]], <4 x float>* [[TMP3]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %incdec.ptr = getelementptr inbounds float, float* %src, i64 1
+  %0 = load float, float* %src, align 4
+  %add = fadd fast float %0, 4.000000e+00
+  %incdec.ptr1 = getelementptr inbounds float, float* %dst, i64 1
+  store float %add, float* %dst, align 4
+  %incdec.ptr2 = getelementptr inbounds float, float* %src, i64 2
+  %1 = load float, float* %incdec.ptr, align 4
+  %sub = fadd fast float %1, -1.000000e+00
+  %incdec.ptr3 = getelementptr inbounds float, float* %dst, i64 2
+  store float %sub, float* %incdec.ptr1, align 4
+  %incdec.ptr4 = getelementptr inbounds float, float* %src, i64 3
+  %2 = load float, float* %incdec.ptr2, align 4
+  %sub5 = fadd fast float %2, -2.000000e+00
+  %incdec.ptr6 = getelementptr inbounds float, float* %dst, i64 3
+  store float %sub5, float* %incdec.ptr3, align 4
+  %3 = load float, float* %incdec.ptr4, align 4
+  %sub8 = fadd fast float %3, -3.000000e+00
+  store float %sub8, float* %incdec.ptr6, align 4
+  ret void
+}
+
+define void @sub2f(float* noalias %dst, float* noalias %src) {
+; CHECK-LABEL: @sub2f(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1
+; CHECK-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1
+; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2
+; CHECK-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2
+; CHECK-NEXT:    [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3
+; CHECK-NEXT:    [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast float* [[SRC]] to <4 x float>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = fadd fast <4 x float> [[TMP1]], <float -1.000000e+00, float -1.000000e+00, float -2.000000e+00, float -3.000000e+00>
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float* [[DST]] to <4 x float>*
+; CHECK-NEXT:    store <4 x float> [[TMP2]], <4 x float>* [[TMP3]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %incdec.ptr = getelementptr inbounds float, float* %src, i64 1
+  %0 = load float, float* %src, align 4
+  %sub = fadd fast float %0, -1.000000e+00
+  %incdec.ptr1 = getelementptr inbounds float, float* %dst, i64 1
+  store float %sub, float* %dst, align 4
+  %incdec.ptr2 = getelementptr inbounds float, float* %src, i64 2
+  %1 = load float, float* %incdec.ptr, align 4
+  %sub3 = fadd fast float %1, -1.000000e+00
+  %incdec.ptr4 = getelementptr inbounds float, float* %dst, i64 2
+  store float %sub3, float* %incdec.ptr1, align 4
+  %incdec.ptr5 = getelementptr inbounds float, float* %src, i64 3
+  %2 = load float, float* %incdec.ptr2, align 4
+  %sub6 = fadd fast float %2, -2.000000e+00
+  %incdec.ptr7 = getelementptr inbounds float, float* %dst, i64 3
+  store float %sub6, float* %incdec.ptr4, align 4
+  %3 = load float, float* %incdec.ptr5, align 4
+  %sub9 = fadd fast float %3, -3.000000e+00
+  store float %sub9, float* %incdec.ptr7, align 4
+  ret void
+}
+
+define void @addsub0f(float* noalias %dst, float* noalias %src) {
+; CHECK-LABEL: @addsub0f(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1
+; CHECK-NEXT:    [[TMP0:%.*]] = load float, float* [[SRC]], align 4
+; CHECK-NEXT:    [[SUB:%.*]] = fadd fast float [[TMP0]], -1.000000e+00
+; CHECK-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1
+; CHECK-NEXT:    store float [[SUB]], float* [[DST]], align 4
+; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2
+; CHECK-NEXT:    [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4
+; CHECK-NEXT:    [[INCDEC_PTR3:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2
+; CHECK-NEXT:    store float [[TMP1]], float* [[INCDEC_PTR1]], align 4
+; CHECK-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3
+; CHECK-NEXT:    [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4
+; CHECK-NEXT:    [[SUB5:%.*]] = fadd fast float [[TMP2]], -2.000000e+00
+; CHECK-NEXT:    [[INCDEC_PTR6:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3
+; CHECK-NEXT:    store float [[SUB5]], float* [[INCDEC_PTR3]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load float, float* [[INCDEC_PTR4]], align 4
+; CHECK-NEXT:    [[SUB8:%.*]] = fsub fast float [[TMP3]], -3.000000e+00
+; CHECK-NEXT:    store float [[SUB8]], float* [[INCDEC_PTR6]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %incdec.ptr = getelementptr inbounds float, float* %src, i64 1
+  %0 = load float, float* %src, align 4
+  %sub = fadd fast float %0, -1.000000e+00
+  %incdec.ptr1 = getelementptr inbounds float, float* %dst, i64 1
+  store float %sub, float* %dst, align 4
+  %incdec.ptr2 = getelementptr inbounds float, float* %src, i64 2
+  %1 = load float, float* %incdec.ptr, align 4
+  %incdec.ptr3 = getelementptr inbounds float, float* %dst, i64 2
+  store float %1, float* %incdec.ptr1, align 4
+  %incdec.ptr4 = getelementptr inbounds float, float* %src, i64 3
+  %2 = load float, float* %incdec.ptr2, align 4
+  %sub5 = fadd fast float %2, -2.000000e+00
+  %incdec.ptr6 = getelementptr inbounds float, float* %dst, i64 3
+  store float %sub5, float* %incdec.ptr3, align 4
+  %3 = load float, float* %incdec.ptr4, align 4
+  %sub8 = fsub fast float %3, -3.000000e+00
+  store float %sub8, float* %incdec.ptr6, align 4
+  ret void
+}
+
+define void @addsub1f(float* noalias %dst, float* noalias %src) {
+; CHECK-LABEL: @addsub1f(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1
+; CHECK-NEXT:    [[TMP0:%.*]] = load float, float* [[SRC]], align 4
+; CHECK-NEXT:    [[SUB:%.*]] = fadd fast float [[TMP0]], -1.000000e+00
+; CHECK-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1
+; CHECK-NEXT:    store float [[SUB]], float* [[DST]], align 4
+; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2
+; CHECK-NEXT:    [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4
+; CHECK-NEXT:    [[SUB1:%.*]] = fsub fast float [[TMP1]], -1.000000e+00
+; CHECK-NEXT:    [[INCDEC_PTR3:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2
+; CHECK-NEXT:    store float [[SUB1]], float* [[INCDEC_PTR1]], align 4
+; CHECK-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3
+; CHECK-NEXT:    [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4
+; CHECK-NEXT:    [[INCDEC_PTR6:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3
+; CHECK-NEXT:    store float [[TMP2]], float* [[INCDEC_PTR3]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load float, float* [[INCDEC_PTR4]], align 4
+; CHECK-NEXT:    [[SUB8:%.*]] = fsub fast float [[TMP3]], -3.000000e+00
+; CHECK-NEXT:    store float [[SUB8]], float* [[INCDEC_PTR6]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %incdec.ptr = getelementptr inbounds float, float* %src, i64 1
+  %0 = load float, float* %src, align 4
+  %sub = fadd fast float %0, -1.000000e+00
+  %incdec.ptr1 = getelementptr inbounds float, float* %dst, i64 1
+  store float %sub, float* %dst, align 4
+  %incdec.ptr2 = getelementptr inbounds float, float* %src, i64 2
+  %1 = load float, float* %incdec.ptr, align 4
+  %sub1 = fsub fast float %1, -1.000000e+00
+  %incdec.ptr3 = getelementptr inbounds float, float* %dst, i64 2
+  store float %sub1, float* %incdec.ptr1, align 4
+  %incdec.ptr4 = getelementptr inbounds float, float* %src, i64 3
+  %2 = load float, float* %incdec.ptr2, align 4
+  %incdec.ptr6 = getelementptr inbounds float, float* %dst, i64 3
+  store float %2, float* %incdec.ptr3, align 4
+  %3 = load float, float* %incdec.ptr4, align 4
+  %sub8 = fsub fast float %3, -3.000000e+00
+  store float %sub8, float* %incdec.ptr6, align 4
+  ret void
+}
+
+define void @mulf(float* noalias %dst, float* noalias %src) {
+; CHECK-LABEL: @mulf(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1
+; CHECK-NEXT:    [[TMP0:%.*]] = load float, float* [[SRC]], align 4
+; CHECK-NEXT:    [[SUB:%.*]] = fmul fast float [[TMP0]], 2.570000e+02
+; CHECK-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1
+; CHECK-NEXT:    store float [[SUB]], float* [[DST]], align 4
+; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2
+; CHECK-NEXT:    [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4
+; CHECK-NEXT:    [[SUB3:%.*]] = fmul fast float [[TMP1]], -3.000000e+00
+; CHECK-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2
+; CHECK-NEXT:    store float [[SUB3]], float* [[INCDEC_PTR1]], align 4
+; CHECK-NEXT:    [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3
+; CHECK-NEXT:    [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4
+; CHECK-NEXT:    [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3
+; CHECK-NEXT:    store float [[TMP2]], float* [[INCDEC_PTR4]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load float, float* [[INCDEC_PTR5]], align 4
+; CHECK-NEXT:    [[SUB9:%.*]] = fmul fast float [[TMP3]], -9.000000e+00
+; CHECK-NEXT:    store float [[SUB9]], float* [[INCDEC_PTR7]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %incdec.ptr = getelementptr inbounds float, float* %src, i64 1
+  %0 = load float, float* %src, align 4
+  %sub = fmul fast float %0, 2.570000e+02
+  %incdec.ptr1 = getelementptr inbounds float, float* %dst, i64 1
+  store float %sub, float* %dst, align 4
+  %incdec.ptr2 = getelementptr inbounds float, float* %src, i64 2
+  %1 = load float, float* %incdec.ptr, align 4
+  %sub3 = fmul fast float %1, -3.000000e+00
+  %incdec.ptr4 = getelementptr inbounds float, float* %dst, i64 2
+  store float %sub3, float* %incdec.ptr1, align 4
+  %incdec.ptr5 = getelementptr inbounds float, float* %src, i64 3
+  %2 = load float, float* %incdec.ptr2, align 4
+  %incdec.ptr7 = getelementptr inbounds float, float* %dst, i64 3
+  store float %2, float* %incdec.ptr4, align 4
+  %3 = load float, float* %incdec.ptr5, align 4
+  %sub9 = fmul fast float %3, -9.000000e+00
+  store float %sub9, float* %incdec.ptr7, align 4
+  ret void
+}
+
+define void @add0fn(float* noalias %dst, float* noalias %src) {
+; CHECK-LABEL: @add0fn(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1
+; CHECK-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1
+; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2
+; CHECK-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2
+; CHECK-NEXT:    [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3
+; CHECK-NEXT:    [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast float* [[SRC]] to <4 x float>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = fadd <4 x float> [[TMP1]], <float 1.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float* [[DST]] to <4 x float>*
+; CHECK-NEXT:    store <4 x float> [[TMP2]], <4 x float>* [[TMP3]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %incdec.ptr = getelementptr inbounds float, float* %src, i64 1
+  %0 = load float, float* %src, align 4
+  %add = fadd float %0, 1.000000e+00
+  %incdec.ptr1 = getelementptr inbounds float, float* %dst, i64 1
+  store float %add, float* %dst, align 4
+  %incdec.ptr2 = getelementptr inbounds float, float* %src, i64 2
+  %1 = load float, float* %incdec.ptr, align 4
+  %add3 = fadd float %1, 1.000000e+00
+  %incdec.ptr4 = getelementptr inbounds float, float* %dst, i64 2
+  store float %add3, float* %incdec.ptr1, align 4
+  %incdec.ptr5 = getelementptr inbounds float, float* %src, i64 3
+  %2 = load float, float* %incdec.ptr2, align 4
+  %add6 = fadd float %2, 2.000000e+00
+  %incdec.ptr7 = getelementptr inbounds float, float* %dst, i64 3
+  store float %add6, float* %incdec.ptr4, align 4
+  %3 = load float, float* %incdec.ptr5, align 4
+  %add9 = fadd float %3, 3.000000e+00
+  store float %add9, float* %incdec.ptr7, align 4
+  ret void
+}
+
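+; Unlike @add0fn above, this function is not vectorized (the checks below keep
+; four scalar operations): lane 0 is a plain load/store copy with no fadd, so
+; the four lanes presumably do not form a uniform vectorizable tree.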
+define void @add1fn(float* noalias %dst, float* noalias %src) {
+; CHECK-LABEL: @add1fn(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1
+; CHECK-NEXT:    [[TMP0:%.*]] = load float, float* [[SRC]], align 4
+; CHECK-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1
+; CHECK-NEXT:    store float [[TMP0]], float* [[DST]], align 4
+; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2
+; CHECK-NEXT:    [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4
+; CHECK-NEXT:    [[ADD3:%.*]] = fadd float [[TMP1]], 1.000000e+00
+; CHECK-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2
+; CHECK-NEXT:    store float [[ADD3]], float* [[INCDEC_PTR1]], align 4
+; CHECK-NEXT:    [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3
+; CHECK-NEXT:    [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4
+; CHECK-NEXT:    [[ADD6:%.*]] = fadd float [[TMP2]], 2.000000e+00
+; CHECK-NEXT:    [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3
+; CHECK-NEXT:    store float [[ADD6]], float* [[INCDEC_PTR4]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load float, float* [[INCDEC_PTR5]], align 4
+; CHECK-NEXT:    [[ADD9:%.*]] = fadd float [[TMP3]], 3.000000e+00
+; CHECK-NEXT:    store float [[ADD9]], float* [[INCDEC_PTR7]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %incdec.ptr = getelementptr inbounds float, float* %src, i64 1
+  %0 = load float, float* %src, align 4
+  %incdec.ptr1 = getelementptr inbounds float, float* %dst, i64 1
+  store float %0, float* %dst, align 4
+  %incdec.ptr2 = getelementptr inbounds float, float* %src, i64 2
+  %1 = load float, float* %incdec.ptr, align 4
+  %add3 = fadd float %1, 1.000000e+00
+  %incdec.ptr4 = getelementptr inbounds float, float* %dst, i64 2
+  store float %add3, float* %incdec.ptr1, align 4
+  %incdec.ptr5 = getelementptr inbounds float, float* %src, i64 3
+  %2 = load float, float* %incdec.ptr2, align 4
+  %add6 = fadd float %2, 2.000000e+00
+  %incdec.ptr7 = getelementptr inbounds float, float* %dst, i64 3
+  store float %add6, float* %incdec.ptr4, align 4
+  %3 = load float, float* %incdec.ptr5, align 4
+  %add9 = fadd float %3, 3.000000e+00
+  store float %add9, float* %incdec.ptr7, align 4
+  ret void
+}
+
+define void @sub0fn(float* noalias %dst, float* noalias %src) {
+; CHECK-LABEL: @sub0fn(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1
+; CHECK-NEXT:    [[TMP0:%.*]] = load float, float* [[SRC]], align 4
+; CHECK-NEXT:    [[ADD:%.*]] = fadd fast float [[TMP0]], -1.000000e+00
+; CHECK-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1
+; CHECK-NEXT:    store float [[ADD]], float* [[DST]], align 4
+; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2
+; CHECK-NEXT:    [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4
+; CHECK-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2
+; CHECK-NEXT:    store float [[TMP1]], float* [[INCDEC_PTR1]], align 4
+; CHECK-NEXT:    [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3
+; CHECK-NEXT:    [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4
+; CHECK-NEXT:    [[ADD6:%.*]] = fadd float [[TMP2]], -2.000000e+00
+; CHECK-NEXT:    [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3
+; CHECK-NEXT:    store float [[ADD6]], float* [[INCDEC_PTR4]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load float, float* [[INCDEC_PTR5]], align 4
+; CHECK-NEXT:    [[ADD9:%.*]] = fadd float [[TMP3]], -3.000000e+00
+; CHECK-NEXT:    store float [[ADD9]], float* [[INCDEC_PTR7]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %incdec.ptr = getelementptr inbounds float, float* %src, i64 1
+  %0 = load float, float* %src, align 4
+  %add = fadd fast float %0, -1.000000e+00
+  %incdec.ptr1 = getelementptr inbounds float, float* %dst, i64 1
+  store float %add, float* %dst, align 4
+  %incdec.ptr2 = getelementptr inbounds float, float* %src, i64 2
+  %1 = load float, float* %incdec.ptr, align 4
+  %incdec.ptr4 = getelementptr inbounds float, float* %dst, i64 2
+  store float %1, float* %incdec.ptr1, align 4
+  %incdec.ptr5 = getelementptr inbounds float, float* %src, i64 3
+  %2 = load float, float* %incdec.ptr2, align 4
+  %add6 = fadd float %2, -2.000000e+00
+  %incdec.ptr7 = getelementptr inbounds float, float* %dst, i64 3
+  store float %add6, float* %incdec.ptr4, align 4
+  %3 = load float, float* %incdec.ptr5, align 4
+  %add9 = fadd float %3, -3.000000e+00
+  store float %add9, float* %incdec.ptr7, align 4
+  ret void
+}
+
+define void @sub1fn(float* noalias %dst, float* noalias %src) {
+; CHECK-LABEL: @sub1fn(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1
+; CHECK-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1
+; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2
+; CHECK-NEXT:    [[INCDEC_PTR3:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2
+; CHECK-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3
+; CHECK-NEXT:    [[INCDEC_PTR6:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast float* [[SRC]] to <4 x float>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = fadd <4 x float> [[TMP1]], <float 4.000000e+00, float -1.000000e+00, float -2.000000e+00, float -3.000000e+00>
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float* [[DST]] to <4 x float>*
+; CHECK-NEXT:    store <4 x float> [[TMP2]], <4 x float>* [[TMP3]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %incdec.ptr = getelementptr inbounds float, float* %src, i64 1
+  %0 = load float, float* %src, align 4
+  %add = fadd float %0, 4.000000e+00
+  %incdec.ptr1 = getelementptr inbounds float, float* %dst, i64 1
+  store float %add, float* %dst, align 4
+  %incdec.ptr2 = getelementptr inbounds float, float* %src, i64 2
+  %1 = load float, float* %incdec.ptr, align 4
+  %sub = fadd float %1, -1.000000e+00
+  %incdec.ptr3 = getelementptr inbounds float, float* %dst, i64 2
+  store float %sub, float* %incdec.ptr1, align 4
+  %incdec.ptr4 = getelementptr inbounds float, float* %src, i64 3
+  %2 = load float, float* %incdec.ptr2, align 4
+  %sub5 = fadd float %2, -2.000000e+00
+  %incdec.ptr6 = getelementptr inbounds float, float* %dst, i64 3
+  store float %sub5, float* %incdec.ptr3, align 4
+  %3 = load float, float* %incdec.ptr4, align 4
+  %sub8 = fadd float %3, -3.000000e+00
+  store float %sub8, float* %incdec.ptr6, align 4
+  ret void
+}
+
+define void @sub2fn(float* noalias %dst, float* noalias %src) {
+; CHECK-LABEL: @sub2fn(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1
+; CHECK-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1
+; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2
+; CHECK-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2
+; CHECK-NEXT:    [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3
+; CHECK-NEXT:    [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast float* [[SRC]] to <4 x float>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = fadd <4 x float> [[TMP1]], <float -1.000000e+00, float -1.000000e+00, float -2.000000e+00, float -3.000000e+00>
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float* [[DST]] to <4 x float>*
+; CHECK-NEXT:    store <4 x float> [[TMP2]], <4 x float>* [[TMP3]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %incdec.ptr = getelementptr inbounds float, float* %src, i64 1
+  %0 = load float, float* %src, align 4
+  %sub = fadd float %0, -1.000000e+00
+  %incdec.ptr1 = getelementptr inbounds float, float* %dst, i64 1
+  store float %sub, float* %dst, align 4
+  %incdec.ptr2 = getelementptr inbounds float, float* %src, i64 2
+  %1 = load float, float* %incdec.ptr, align 4
+  %sub3 = fadd float %1, -1.000000e+00
+  %incdec.ptr4 = getelementptr inbounds float, float* %dst, i64 2
+  store float %sub3, float* %incdec.ptr1, align 4
+  %incdec.ptr5 = getelementptr inbounds float, float* %src, i64 3
+  %2 = load float, float* %incdec.ptr2, align 4
+  %sub6 = fadd float %2, -2.000000e+00
+  %incdec.ptr7 = getelementptr inbounds float, float* %dst, i64 3
+  store float %sub6, float* %incdec.ptr4, align 4
+  %3 = load float, float* %incdec.ptr5, align 4
+  %sub9 = fadd float %3, -3.000000e+00
+  store float %sub9, float* %incdec.ptr7, align 4
+  ret void
+}
+
+define void @mulfn(float* noalias %dst, float* noalias %src) {
+; CHECK-LABEL: @mulfn(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1
+; CHECK-NEXT:    [[TMP0:%.*]] = load float, float* [[SRC]], align 4
+; CHECK-NEXT:    [[SUB:%.*]] = fmul float [[TMP0]], 2.570000e+02
+; CHECK-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1
+; CHECK-NEXT:    store float [[SUB]], float* [[DST]], align 4
+; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2
+; CHECK-NEXT:    [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4
+; CHECK-NEXT:    [[SUB3:%.*]] = fmul float [[TMP1]], -3.000000e+00
+; CHECK-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2
+; CHECK-NEXT:    store float [[SUB3]], float* [[INCDEC_PTR1]], align 4
+; CHECK-NEXT:    [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3
+; CHECK-NEXT:    [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4
+; CHECK-NEXT:    [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3
+; CHECK-NEXT:    store float [[TMP2]], float* [[INCDEC_PTR4]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load float, float* [[INCDEC_PTR5]], align 4
+; CHECK-NEXT:    [[SUB9:%.*]] = fmul fast float [[TMP3]], -9.000000e+00
+; CHECK-NEXT:    store float [[SUB9]], float* [[INCDEC_PTR7]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %incdec.ptr = getelementptr inbounds float, float* %src, i64 1
+  %0 = load float, float* %src, align 4
+  %sub = fmul float %0, 2.570000e+02
+  %incdec.ptr1 = getelementptr inbounds float, float* %dst, i64 1
+  store float %sub, float* %dst, align 4
+  %incdec.ptr2 = getelementptr inbounds float, float* %src, i64 2
+  %1 = load float, float* %incdec.ptr, align 4
+  %sub3 = fmul float %1, -3.000000e+00
+  %incdec.ptr4 = getelementptr inbounds float, float* %dst, i64 2
+  store float %sub3, float* %incdec.ptr1, align 4
+  %incdec.ptr5 = getelementptr inbounds float, float* %src, i64 3
+  %2 = load float, float* %incdec.ptr2, align 4
+  %incdec.ptr7 = getelementptr inbounds float, float* %dst, i64 3
+  store float %2, float* %incdec.ptr4, align 4
+  %3 = load float, float* %incdec.ptr5, align 4
+  %sub9 = fmul fast float %3, -9.000000e+00
+  store float %sub9, float* %incdec.ptr7, align 4
+  ret void
+}

Added: llvm/trunk/test/Transforms/SLPVectorizer/X86/vector.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/vector.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/vector.ll (added)
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/vector.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,49 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -slp-vectorizer -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+; Make sure that we neither crash nor change the code.
+define void @test(<4 x i32> %in, <4 x i32> %in2) {
+; CHECK-LABEL: @test(
+; CHECK-NEXT:    [[K:%.*]] = icmp eq <4 x i32> [[IN:%.*]], [[IN2:%.*]]
+; CHECK-NEXT:    ret void
+;
+  %k = icmp eq <4 x i32> %in, %in2
+  ret void
+}
+
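+; The scalar compares below implement a short-circuit AND of two lane-wise
+; equality checks; the phi merges 'false' from the entry path with the second
+; compare from the 'if' path. The checks verify the IR is left as-is.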
+define i1 @cmpv2f32(<2 x i32> %x, <2 x i32> %y) {
+; CHECK-LABEL: @cmpv2f32(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[X0:%.*]] = extractelement <2 x i32> [[X:%.*]], i32 0
+; CHECK-NEXT:    [[Y0:%.*]] = extractelement <2 x i32> [[Y:%.*]], i32 0
+; CHECK-NEXT:    [[CMP0:%.*]] = icmp eq i32 [[X0]], [[Y0]]
+; CHECK-NEXT:    br i1 [[CMP0]], label [[IF:%.*]], label [[ENDIF:%.*]]
+; CHECK:       if:
+; CHECK-NEXT:    [[X1:%.*]] = extractelement <2 x i32> [[X]], i32 1
+; CHECK-NEXT:    [[Y1:%.*]] = extractelement <2 x i32> [[Y]], i32 1
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq i32 [[X1]], [[Y1]]
+; CHECK-NEXT:    br label [[ENDIF]]
+; CHECK:       endif:
+; CHECK-NEXT:    [[AND_OF_CMPS:%.*]] = phi i1 [ false, [[ENTRY:%.*]] ], [ [[CMP1]], [[IF]] ]
+; CHECK-NEXT:    ret i1 [[AND_OF_CMPS]]
+;
+  entry:
+  %x0 = extractelement <2 x i32> %x, i32 0
+  %y0 = extractelement <2 x i32> %y, i32 0
+  %cmp0 = icmp eq i32 %x0, %y0
+  br i1 %cmp0, label %if, label %endif
+
+  if:
+  %x1 = extractelement <2 x i32> %x, i32 1
+  %y1 = extractelement <2 x i32> %y, i32 1
+  %cmp1 = icmp eq i32 %x1, %y1
+  br label %endif
+
+  endif:
+  %and_of_cmps = phi i1 [ false, %entry ], [ %cmp1, %if ]
+  ret i1 %and_of_cmps
+}
+

Added: llvm/trunk/test/Transforms/SLPVectorizer/X86/vector_gep.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/vector_gep.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/vector_gep.ll (added)
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/vector_gep.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,30 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -slp-vectorizer -S | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; This test checks that the SLP vectorizer does not fail on a vector GEP.
+; The GEP has scalar and vector operands and returns a vector of pointers.
+
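+; For illustration (not part of the test): a GEP with a vector of base
+; pointers and a single scalar index yields one result pointer per lane,
+; e.g. '%g = getelementptr i32, <16 x i32*> %bases, i64 %i' produces a
+; <16 x i32*> in which each lane of %bases is advanced by %i elements.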
+; Function Attrs: noreturn readonly uwtable
+define void @_Z3fn1v(i32 %x, <16 x i32*>%y) local_unnamed_addr #0 {
+; CHECK-LABEL: @_Z3fn1v(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CONV42_LE:%.*]] = sext i32 [[X:%.*]] to i64
+; CHECK-NEXT:    [[CONV36109_LE:%.*]] = zext i32 2 to i64
+; CHECK-NEXT:    [[VECTORGEP:%.*]] = getelementptr i32, <16 x i32*> [[Y:%.*]], i64 [[CONV36109_LE]]
+; CHECK-NEXT:    [[VECTORGEP208:%.*]] = getelementptr i32, <16 x i32*> [[Y]], i64 [[CONV42_LE]]
+; CHECK-NEXT:    unreachable
+;
+entry:
+  %conv42.le = sext i32 %x to i64
+  %conv36109.le = zext i32 2 to i64
+  %VectorGep = getelementptr i32, <16 x i32*> %y, i64 %conv36109.le
+  %VectorGep208 = getelementptr i32, <16 x i32*> %y, i64 %conv42.le
+  unreachable
+}
+
+attributes #0 = { noreturn readonly uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="knl" "target-features"="+adx,+aes,+avx,+avx2,+avx512cd,+avx512er,+avx512f,+avx512pf,+bmi,+bmi2,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+prefetchwt1,+rdrnd,+rdseed,+rtm,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt" "unsafe-fp-math"="false" "use-soft-float"="false" }
+

Added: llvm/trunk/test/Transforms/SLPVectorizer/X86/vectorize-reorder-reuse.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/vectorize-reorder-reuse.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/vectorize-reorder-reuse.ll (added)
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/vectorize-reorder-reuse.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,230 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 < %s | FileCheck %s
+
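+; Each function below adds eight incoming values to a few repeatedly loaded
+; array elements, then folds the sums through a chain of icmp ult/select
+; (an unsigned-min reduction). The checks verify that the loads become one
+; vector load plus reorder/reuse shuffles and that the select chain becomes
+; a vector min reduction.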
+define i32 @foo(i32* nocapture readonly %arr, i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7, i32 %a8) {
+; CHECK-LABEL: @foo(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[ARR:%.*]], i64 1
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[ARR]] to <2 x i32>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* [[TMP0]], align 4
+; CHECK-NEXT:    [[REORDER_SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[REORDER_SHUFFLE]], <2 x i32> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1>
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <8 x i32> undef, i32 [[A1:%.*]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[A2:%.*]], i32 1
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[A3:%.*]], i32 2
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[A4:%.*]], i32 3
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[A5:%.*]], i32 4
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[A6:%.*]], i32 5
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[A7:%.*]], i32 6
+; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[A8:%.*]], i32 7
+; CHECK-NEXT:    [[TMP10:%.*]] = add <8 x i32> [[SHUFFLE]], [[TMP9]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i32 undef, undef
+; CHECK-NEXT:    [[COND:%.*]] = select i1 [[CMP]], i32 undef, i32 undef
+; CHECK-NEXT:    [[CMP15:%.*]] = icmp ult i32 [[COND]], undef
+; CHECK-NEXT:    [[COND19:%.*]] = select i1 [[CMP15]], i32 [[COND]], i32 undef
+; CHECK-NEXT:    [[CMP20:%.*]] = icmp ult i32 [[COND19]], undef
+; CHECK-NEXT:    [[COND24:%.*]] = select i1 [[CMP20]], i32 [[COND19]], i32 undef
+; CHECK-NEXT:    [[CMP25:%.*]] = icmp ult i32 [[COND24]], undef
+; CHECK-NEXT:    [[COND29:%.*]] = select i1 [[CMP25]], i32 [[COND24]], i32 undef
+; CHECK-NEXT:    [[CMP30:%.*]] = icmp ult i32 [[COND29]], undef
+; CHECK-NEXT:    [[COND34:%.*]] = select i1 [[CMP30]], i32 [[COND29]], i32 undef
+; CHECK-NEXT:    [[CMP35:%.*]] = icmp ult i32 [[COND34]], undef
+; CHECK-NEXT:    [[COND39:%.*]] = select i1 [[CMP35]], i32 [[COND34]], i32 undef
+; CHECK-NEXT:    [[CMP40:%.*]] = icmp ult i32 [[COND39]], undef
+; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP10]], <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[RDX_MINMAX_CMP:%.*]] = icmp ult <8 x i32> [[TMP10]], [[RDX_SHUF]]
+; CHECK-NEXT:    [[RDX_MINMAX_SELECT:%.*]] = select <8 x i1> [[RDX_MINMAX_CMP]], <8 x i32> [[TMP10]], <8 x i32> [[RDX_SHUF]]
+; CHECK-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[RDX_MINMAX_SELECT]], <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[RDX_MINMAX_CMP2:%.*]] = icmp ult <8 x i32> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
+; CHECK-NEXT:    [[RDX_MINMAX_SELECT3:%.*]] = select <8 x i1> [[RDX_MINMAX_CMP2]], <8 x i32> [[RDX_MINMAX_SELECT]], <8 x i32> [[RDX_SHUF1]]
+; CHECK-NEXT:    [[RDX_SHUF4:%.*]] = shufflevector <8 x i32> [[RDX_MINMAX_SELECT3]], <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[RDX_MINMAX_CMP5:%.*]] = icmp ult <8 x i32> [[RDX_MINMAX_SELECT3]], [[RDX_SHUF4]]
+; CHECK-NEXT:    [[RDX_MINMAX_SELECT6:%.*]] = select <8 x i1> [[RDX_MINMAX_CMP5]], <8 x i32> [[RDX_MINMAX_SELECT3]], <8 x i32> [[RDX_SHUF4]]
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <8 x i32> [[RDX_MINMAX_SELECT6]], i32 0
+; CHECK-NEXT:    [[COND44:%.*]] = select i1 [[CMP40]], i32 [[COND39]], i32 undef
+; CHECK-NEXT:    ret i32 [[TMP11]]
+;
+entry:
+  %arrayidx = getelementptr inbounds i32, i32* %arr, i64 1
+  %0 = load i32, i32* %arrayidx, align 4
+  %add = add i32 %0, %a1
+  %add2 = add i32 %0, %a2
+  %add4 = add i32 %0, %a3
+  %add6 = add i32 %0, %a4
+  %add8 = add i32 %0, %a5
+  %add10 = add i32 %0, %a6
+  %1 = load i32, i32* %arr, align 4
+  %add12 = add i32 %1, %a7
+  %add14 = add i32 %1, %a8
+  %cmp = icmp ult i32 %add, %add2
+  %cond = select i1 %cmp, i32 %add, i32 %add2
+  %cmp15 = icmp ult i32 %cond, %add4
+  %cond19 = select i1 %cmp15, i32 %cond, i32 %add4
+  %cmp20 = icmp ult i32 %cond19, %add6
+  %cond24 = select i1 %cmp20, i32 %cond19, i32 %add6
+  %cmp25 = icmp ult i32 %cond24, %add8
+  %cond29 = select i1 %cmp25, i32 %cond24, i32 %add8
+  %cmp30 = icmp ult i32 %cond29, %add10
+  %cond34 = select i1 %cmp30, i32 %cond29, i32 %add10
+  %cmp35 = icmp ult i32 %cond34, %add12
+  %cond39 = select i1 %cmp35, i32 %cond34, i32 %add12
+  %cmp40 = icmp ult i32 %cond39, %add14
+  %cond44 = select i1 %cmp40, i32 %cond39, i32 %add14
+  ret i32 %cond44
+}
+
+define i32 @foo1(i32* nocapture readonly %arr, i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7, i32 %a8) {
+; CHECK-LABEL: @foo1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[ARR:%.*]], i64 1
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[ARR]], i64 2
+; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[ARR]], i64 3
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[ARR]] to <4 x i32>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; CHECK-NEXT:    [[REORDER_SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 0>
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[REORDER_SHUFFLE]], <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 0, i32 0, i32 3, i32 1, i32 0>
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <8 x i32> undef, i32 [[A1:%.*]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[A2:%.*]], i32 1
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[A3:%.*]], i32 2
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[A4:%.*]], i32 3
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[A5:%.*]], i32 4
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[A6:%.*]], i32 5
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[A7:%.*]], i32 6
+; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[A8:%.*]], i32 7
+; CHECK-NEXT:    [[TMP10:%.*]] = add <8 x i32> [[SHUFFLE]], [[TMP9]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i32 undef, undef
+; CHECK-NEXT:    [[COND:%.*]] = select i1 [[CMP]], i32 undef, i32 undef
+; CHECK-NEXT:    [[CMP15:%.*]] = icmp ult i32 [[COND]], undef
+; CHECK-NEXT:    [[COND19:%.*]] = select i1 [[CMP15]], i32 [[COND]], i32 undef
+; CHECK-NEXT:    [[CMP20:%.*]] = icmp ult i32 [[COND19]], undef
+; CHECK-NEXT:    [[COND24:%.*]] = select i1 [[CMP20]], i32 [[COND19]], i32 undef
+; CHECK-NEXT:    [[CMP25:%.*]] = icmp ult i32 [[COND24]], undef
+; CHECK-NEXT:    [[COND29:%.*]] = select i1 [[CMP25]], i32 [[COND24]], i32 undef
+; CHECK-NEXT:    [[CMP30:%.*]] = icmp ult i32 [[COND29]], undef
+; CHECK-NEXT:    [[COND34:%.*]] = select i1 [[CMP30]], i32 [[COND29]], i32 undef
+; CHECK-NEXT:    [[CMP35:%.*]] = icmp ult i32 [[COND34]], undef
+; CHECK-NEXT:    [[COND39:%.*]] = select i1 [[CMP35]], i32 [[COND34]], i32 undef
+; CHECK-NEXT:    [[CMP40:%.*]] = icmp ult i32 [[COND39]], undef
+; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP10]], <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[RDX_MINMAX_CMP:%.*]] = icmp ult <8 x i32> [[TMP10]], [[RDX_SHUF]]
+; CHECK-NEXT:    [[RDX_MINMAX_SELECT:%.*]] = select <8 x i1> [[RDX_MINMAX_CMP]], <8 x i32> [[TMP10]], <8 x i32> [[RDX_SHUF]]
+; CHECK-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[RDX_MINMAX_SELECT]], <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[RDX_MINMAX_CMP2:%.*]] = icmp ult <8 x i32> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
+; CHECK-NEXT:    [[RDX_MINMAX_SELECT3:%.*]] = select <8 x i1> [[RDX_MINMAX_CMP2]], <8 x i32> [[RDX_MINMAX_SELECT]], <8 x i32> [[RDX_SHUF1]]
+; CHECK-NEXT:    [[RDX_SHUF4:%.*]] = shufflevector <8 x i32> [[RDX_MINMAX_SELECT3]], <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[RDX_MINMAX_CMP5:%.*]] = icmp ult <8 x i32> [[RDX_MINMAX_SELECT3]], [[RDX_SHUF4]]
+; CHECK-NEXT:    [[RDX_MINMAX_SELECT6:%.*]] = select <8 x i1> [[RDX_MINMAX_CMP5]], <8 x i32> [[RDX_MINMAX_SELECT3]], <8 x i32> [[RDX_SHUF4]]
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <8 x i32> [[RDX_MINMAX_SELECT6]], i32 0
+; CHECK-NEXT:    [[COND44:%.*]] = select i1 [[CMP40]], i32 [[COND39]], i32 undef
+; CHECK-NEXT:    ret i32 [[TMP11]]
+;
+entry:
+  %arrayidx = getelementptr inbounds i32, i32* %arr, i64 1
+  %0 = load i32, i32* %arrayidx, align 4
+  %add = add i32 %0, %a1
+  %arrayidx1 = getelementptr inbounds i32, i32* %arr, i64 2
+  %1 = load i32, i32* %arrayidx1, align 4
+  %add2 = add i32 %1, %a2
+  %arrayidx3 = getelementptr inbounds i32, i32* %arr, i64 3
+  %2 = load i32, i32* %arrayidx3, align 4
+  %add4 = add i32 %2, %a3
+  %add6 = add i32 %0, %a4
+  %add8 = add i32 %0, %a5
+  %3 = load i32, i32* %arr, align 4
+  %add10 = add i32 %3, %a6
+  %add12 = add i32 %1, %a7
+  %add14 = add i32 %0, %a8
+  %cmp = icmp ult i32 %add, %add2
+  %cond = select i1 %cmp, i32 %add, i32 %add2
+  %cmp15 = icmp ult i32 %cond, %add4
+  %cond19 = select i1 %cmp15, i32 %cond, i32 %add4
+  %cmp20 = icmp ult i32 %cond19, %add6
+  %cond24 = select i1 %cmp20, i32 %cond19, i32 %add6
+  %cmp25 = icmp ult i32 %cond24, %add8
+  %cond29 = select i1 %cmp25, i32 %cond24, i32 %add8
+  %cmp30 = icmp ult i32 %cond29, %add10
+  %cond34 = select i1 %cmp30, i32 %cond29, i32 %add10
+  %cmp35 = icmp ult i32 %cond34, %add12
+  %cond39 = select i1 %cmp35, i32 %cond34, i32 %add12
+  %cmp40 = icmp ult i32 %cond39, %add14
+  %cond44 = select i1 %cmp40, i32 %cond39, i32 %add14
+  ret i32 %cond44
+}
+
+define i32 @foo2(i32* nocapture readonly %arr, i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7, i32 %a8) {
+; CHECK-LABEL: @foo2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[ARR:%.*]], i64 3
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[ARR]], i64 2
+; CHECK-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[ARR]], i64 1
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[ARR]] to <4 x i32>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; CHECK-NEXT:    [[REORDER_SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 0, i32 1>
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[REORDER_SHUFFLE]], <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 2, i32 3, i32 2, i32 1, i32 3>
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <8 x i32> undef, i32 [[A1:%.*]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[A2:%.*]], i32 1
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[A3:%.*]], i32 2
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[A4:%.*]], i32 3
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[A5:%.*]], i32 4
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[A6:%.*]], i32 5
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[A7:%.*]], i32 6
+; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[A8:%.*]], i32 7
+; CHECK-NEXT:    [[TMP10:%.*]] = add <8 x i32> [[SHUFFLE]], [[TMP9]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i32 undef, undef
+; CHECK-NEXT:    [[COND:%.*]] = select i1 [[CMP]], i32 undef, i32 undef
+; CHECK-NEXT:    [[CMP15:%.*]] = icmp ult i32 [[COND]], undef
+; CHECK-NEXT:    [[COND19:%.*]] = select i1 [[CMP15]], i32 [[COND]], i32 undef
+; CHECK-NEXT:    [[CMP20:%.*]] = icmp ult i32 [[COND19]], undef
+; CHECK-NEXT:    [[COND24:%.*]] = select i1 [[CMP20]], i32 [[COND19]], i32 undef
+; CHECK-NEXT:    [[CMP25:%.*]] = icmp ult i32 [[COND24]], undef
+; CHECK-NEXT:    [[COND29:%.*]] = select i1 [[CMP25]], i32 [[COND24]], i32 undef
+; CHECK-NEXT:    [[CMP30:%.*]] = icmp ult i32 [[COND29]], undef
+; CHECK-NEXT:    [[COND34:%.*]] = select i1 [[CMP30]], i32 [[COND29]], i32 undef
+; CHECK-NEXT:    [[CMP35:%.*]] = icmp ult i32 [[COND34]], undef
+; CHECK-NEXT:    [[COND39:%.*]] = select i1 [[CMP35]], i32 [[COND34]], i32 undef
+; CHECK-NEXT:    [[CMP40:%.*]] = icmp ult i32 [[COND39]], undef
+; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP10]], <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[RDX_MINMAX_CMP:%.*]] = icmp ult <8 x i32> [[TMP10]], [[RDX_SHUF]]
+; CHECK-NEXT:    [[RDX_MINMAX_SELECT:%.*]] = select <8 x i1> [[RDX_MINMAX_CMP]], <8 x i32> [[TMP10]], <8 x i32> [[RDX_SHUF]]
+; CHECK-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[RDX_MINMAX_SELECT]], <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[RDX_MINMAX_CMP2:%.*]] = icmp ult <8 x i32> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
+; CHECK-NEXT:    [[RDX_MINMAX_SELECT3:%.*]] = select <8 x i1> [[RDX_MINMAX_CMP2]], <8 x i32> [[RDX_MINMAX_SELECT]], <8 x i32> [[RDX_SHUF1]]
+; CHECK-NEXT:    [[RDX_SHUF4:%.*]] = shufflevector <8 x i32> [[RDX_MINMAX_SELECT3]], <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[RDX_MINMAX_CMP5:%.*]] = icmp ult <8 x i32> [[RDX_MINMAX_SELECT3]], [[RDX_SHUF4]]
+; CHECK-NEXT:    [[RDX_MINMAX_SELECT6:%.*]] = select <8 x i1> [[RDX_MINMAX_CMP5]], <8 x i32> [[RDX_MINMAX_SELECT3]], <8 x i32> [[RDX_SHUF4]]
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <8 x i32> [[RDX_MINMAX_SELECT6]], i32 0
+; CHECK-NEXT:    [[COND44:%.*]] = select i1 [[CMP40]], i32 [[COND39]], i32 undef
+; CHECK-NEXT:    ret i32 [[TMP11]]
+;
+entry:
+  %arrayidx = getelementptr inbounds i32, i32* %arr, i64 3
+  %0 = load i32, i32* %arrayidx, align 4
+  %add = add i32 %0, %a1
+  %arrayidx1 = getelementptr inbounds i32, i32* %arr, i64 2
+  %1 = load i32, i32* %arrayidx1, align 4
+  %add2 = add i32 %1, %a2
+  %add4 = add i32 %0, %a3
+  %2 = load i32, i32* %arr, align 4
+  %add6 = add i32 %2, %a4
+  %arrayidx7 = getelementptr inbounds i32, i32* %arr, i64 1
+  %3 = load i32, i32* %arrayidx7, align 4
+  %add8 = add i32 %3, %a5
+  %add10 = add i32 %2, %a6
+  %add12 = add i32 %1, %a7
+  %add14 = add i32 %3, %a8
+  %cmp = icmp ult i32 %add, %add2
+  %cond = select i1 %cmp, i32 %add, i32 %add2
+  %cmp15 = icmp ult i32 %cond, %add4
+  %cond19 = select i1 %cmp15, i32 %cond, i32 %add4
+  %cmp20 = icmp ult i32 %cond19, %add6
+  %cond24 = select i1 %cmp20, i32 %cond19, i32 %add6
+  %cmp25 = icmp ult i32 %cond24, %add8
+  %cond29 = select i1 %cmp25, i32 %cond24, i32 %add8
+  %cmp30 = icmp ult i32 %cond29, %add10
+  %cond34 = select i1 %cmp30, i32 %cond29, i32 %add10
+  %cmp35 = icmp ult i32 %cond34, %add12
+  %cond39 = select i1 %cmp35, i32 %cond34, i32 %add12
+  %cmp40 = icmp ult i32 %cond39, %add14
+  %cond44 = select i1 %cmp40, i32 %cond39, i32 %add14
+  ret i32 %cond44
+}

Added: llvm/trunk/test/Transforms/SLPVectorizer/X86/zext.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/zext.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/zext.ll (added)
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/zext.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,785 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -mtriple=x86_64-unknown -basicaa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -basicaa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE,SLM
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basicaa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -basicaa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX512,AVX512F
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=+avx512bw -basicaa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX512,AVX512BW
+
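+; Each test below loads N small integers, zero-extends them, and rebuilds a
+; vector with insertelement. The checks verify, per subtarget, whether the
+; SLP vectorizer replaces the scalar loads and zexts with a single vector
+; load and a vector zext.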
+;
+; vXi8
+;
+
+define <2 x i64> @loadext_2i8_to_2i64(i8* %p0) {
+; SSE2-LABEL: @loadext_2i8_to_2i64(
+; SSE2-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; SSE2-NEXT:    [[I0:%.*]] = load i8, i8* [[P0]], align 1
+; SSE2-NEXT:    [[I1:%.*]] = load i8, i8* [[P1]], align 1
+; SSE2-NEXT:    [[X0:%.*]] = zext i8 [[I0]] to i64
+; SSE2-NEXT:    [[X1:%.*]] = zext i8 [[I1]] to i64
+; SSE2-NEXT:    [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[X0]], i32 0
+; SSE2-NEXT:    [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[X1]], i32 1
+; SSE2-NEXT:    ret <2 x i64> [[V1]]
+;
+; SLM-LABEL: @loadext_2i8_to_2i64(
+; SLM-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; SLM-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <2 x i8>*
+; SLM-NEXT:    [[TMP2:%.*]] = load <2 x i8>, <2 x i8>* [[TMP1]], align 1
+; SLM-NEXT:    [[TMP3:%.*]] = zext <2 x i8> [[TMP2]] to <2 x i64>
+; SLM-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
+; SLM-NEXT:    [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[TMP4]], i32 0
+; SLM-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
+; SLM-NEXT:    [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1
+; SLM-NEXT:    ret <2 x i64> [[V1]]
+;
+; AVX-LABEL: @loadext_2i8_to_2i64(
+; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; AVX-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <2 x i8>*
+; AVX-NEXT:    [[TMP2:%.*]] = load <2 x i8>, <2 x i8>* [[TMP1]], align 1
+; AVX-NEXT:    [[TMP3:%.*]] = zext <2 x i8> [[TMP2]] to <2 x i64>
+; AVX-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
+; AVX-NEXT:    [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[TMP4]], i32 0
+; AVX-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
+; AVX-NEXT:    [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1
+; AVX-NEXT:    ret <2 x i64> [[V1]]
+;
+  %p1 = getelementptr inbounds i8, i8* %p0, i64 1
+  %i0 = load i8, i8* %p0, align 1
+  %i1 = load i8, i8* %p1, align 1
+  %x0 = zext i8 %i0 to i64
+  %x1 = zext i8 %i1 to i64
+  %v0 = insertelement <2 x i64> undef, i64 %x0, i32 0
+  %v1 = insertelement <2 x i64>   %v0, i64 %x1, i32 1
+  ret <2 x i64> %v1
+}
+
+define <4 x i32> @loadext_4i8_to_4i32(i8* %p0) {
+; CHECK-LABEL: @loadext_4i8_to_4i32(
+; CHECK-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; CHECK-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
+; CHECK-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i32>
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0
+; CHECK-NEXT:    [[V0:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1
+; CHECK-NEXT:    [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[TMP5]], i32 1
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2
+; CHECK-NEXT:    [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[TMP6]], i32 2
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3
+; CHECK-NEXT:    [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[TMP7]], i32 3
+; CHECK-NEXT:    ret <4 x i32> [[V3]]
+;
+  %p1 = getelementptr inbounds i8, i8* %p0, i64 1
+  %p2 = getelementptr inbounds i8, i8* %p0, i64 2
+  %p3 = getelementptr inbounds i8, i8* %p0, i64 3
+  %i0 = load i8, i8* %p0, align 1
+  %i1 = load i8, i8* %p1, align 1
+  %i2 = load i8, i8* %p2, align 1
+  %i3 = load i8, i8* %p3, align 1
+  %x0 = zext i8 %i0 to i32
+  %x1 = zext i8 %i1 to i32
+  %x2 = zext i8 %i2 to i32
+  %x3 = zext i8 %i3 to i32
+  %v0 = insertelement <4 x i32> undef, i32 %x0, i32 0
+  %v1 = insertelement <4 x i32>   %v0, i32 %x1, i32 1
+  %v2 = insertelement <4 x i32>   %v1, i32 %x2, i32 2
+  %v3 = insertelement <4 x i32>   %v2, i32 %x3, i32 3
+  ret <4 x i32> %v3
+}
+
+define <4 x i64> @loadext_4i8_to_4i64(i8* %p0) {
+; SSE2-LABEL: @loadext_4i8_to_4i64(
+; SSE2-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; SSE2-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
+; SSE2-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
+; SSE2-NEXT:    [[I0:%.*]] = load i8, i8* [[P0]], align 1
+; SSE2-NEXT:    [[I1:%.*]] = load i8, i8* [[P1]], align 1
+; SSE2-NEXT:    [[I2:%.*]] = load i8, i8* [[P2]], align 1
+; SSE2-NEXT:    [[I3:%.*]] = load i8, i8* [[P3]], align 1
+; SSE2-NEXT:    [[X0:%.*]] = zext i8 [[I0]] to i64
+; SSE2-NEXT:    [[X1:%.*]] = zext i8 [[I1]] to i64
+; SSE2-NEXT:    [[X2:%.*]] = zext i8 [[I2]] to i64
+; SSE2-NEXT:    [[X3:%.*]] = zext i8 [[I3]] to i64
+; SSE2-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[X0]], i32 0
+; SSE2-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[X1]], i32 1
+; SSE2-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2
+; SSE2-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
+; SSE2-NEXT:    ret <4 x i64> [[V3]]
+;
+; SLM-LABEL: @loadext_4i8_to_4i64(
+; SLM-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; SLM-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
+; SLM-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
+; SLM-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>*
+; SLM-NEXT:    [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
+; SLM-NEXT:    [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i64>
+; SLM-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
+; SLM-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
+; SLM-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
+; SLM-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
+; SLM-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
+; SLM-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
+; SLM-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
+; SLM-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
+; SLM-NEXT:    ret <4 x i64> [[V3]]
+;
+; AVX-LABEL: @loadext_4i8_to_4i64(
+; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; AVX-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
+; AVX-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
+; AVX-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <2 x i8>*
+; AVX-NEXT:    [[TMP2:%.*]] = load <2 x i8>, <2 x i8>* [[TMP1]], align 1
+; AVX-NEXT:    [[I2:%.*]] = load i8, i8* [[P2]], align 1
+; AVX-NEXT:    [[I3:%.*]] = load i8, i8* [[P3]], align 1
+; AVX-NEXT:    [[TMP3:%.*]] = zext <2 x i8> [[TMP2]] to <2 x i64>
+; AVX-NEXT:    [[X2:%.*]] = zext i8 [[I2]] to i64
+; AVX-NEXT:    [[X3:%.*]] = zext i8 [[I3]] to i64
+; AVX-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
+; AVX-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
+; AVX-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
+; AVX-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
+; AVX-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2
+; AVX-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
+; AVX-NEXT:    ret <4 x i64> [[V3]]
+;
+  %p1 = getelementptr inbounds i8, i8* %p0, i64 1
+  %p2 = getelementptr inbounds i8, i8* %p0, i64 2
+  %p3 = getelementptr inbounds i8, i8* %p0, i64 3
+  %i0 = load i8, i8* %p0, align 1
+  %i1 = load i8, i8* %p1, align 1
+  %i2 = load i8, i8* %p2, align 1
+  %i3 = load i8, i8* %p3, align 1
+  %x0 = zext i8 %i0 to i64
+  %x1 = zext i8 %i1 to i64
+  %x2 = zext i8 %i2 to i64
+  %x3 = zext i8 %i3 to i64
+  %v0 = insertelement <4 x i64> undef, i64 %x0, i32 0
+  %v1 = insertelement <4 x i64>   %v0, i64 %x1, i32 1
+  %v2 = insertelement <4 x i64>   %v1, i64 %x2, i32 2
+  %v3 = insertelement <4 x i64>   %v2, i64 %x3, i32 3
+  ret <4 x i64> %v3
+}
+
+define <8 x i16> @loadext_8i8_to_8i16(i8* %p0) {
+; CHECK-LABEL: @loadext_8i8_to_8i16(
+; CHECK-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; CHECK-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
+; CHECK-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
+; CHECK-NEXT:    [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4
+; CHECK-NEXT:    [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5
+; CHECK-NEXT:    [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6
+; CHECK-NEXT:    [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <8 x i8>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = zext <8 x i8> [[TMP2]] to <8 x i16>
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0
+; CHECK-NEXT:    [[V0:%.*]] = insertelement <8 x i16> undef, i16 [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <8 x i16> [[TMP3]], i32 1
+; CHECK-NEXT:    [[V1:%.*]] = insertelement <8 x i16> [[V0]], i16 [[TMP5]], i32 1
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <8 x i16> [[TMP3]], i32 2
+; CHECK-NEXT:    [[V2:%.*]] = insertelement <8 x i16> [[V1]], i16 [[TMP6]], i32 2
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <8 x i16> [[TMP3]], i32 3
+; CHECK-NEXT:    [[V3:%.*]] = insertelement <8 x i16> [[V2]], i16 [[TMP7]], i32 3
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <8 x i16> [[TMP3]], i32 4
+; CHECK-NEXT:    [[V4:%.*]] = insertelement <8 x i16> [[V3]], i16 [[TMP8]], i32 4
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <8 x i16> [[TMP3]], i32 5
+; CHECK-NEXT:    [[V5:%.*]] = insertelement <8 x i16> [[V4]], i16 [[TMP9]], i32 5
+; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <8 x i16> [[TMP3]], i32 6
+; CHECK-NEXT:    [[V6:%.*]] = insertelement <8 x i16> [[V5]], i16 [[TMP10]], i32 6
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <8 x i16> [[TMP3]], i32 7
+; CHECK-NEXT:    [[V7:%.*]] = insertelement <8 x i16> [[V6]], i16 [[TMP11]], i32 7
+; CHECK-NEXT:    ret <8 x i16> [[V7]]
+;
+  %p1 = getelementptr inbounds i8, i8* %p0, i64 1
+  %p2 = getelementptr inbounds i8, i8* %p0, i64 2
+  %p3 = getelementptr inbounds i8, i8* %p0, i64 3
+  %p4 = getelementptr inbounds i8, i8* %p0, i64 4
+  %p5 = getelementptr inbounds i8, i8* %p0, i64 5
+  %p6 = getelementptr inbounds i8, i8* %p0, i64 6
+  %p7 = getelementptr inbounds i8, i8* %p0, i64 7
+  %i0 = load i8, i8* %p0, align 1
+  %i1 = load i8, i8* %p1, align 1
+  %i2 = load i8, i8* %p2, align 1
+  %i3 = load i8, i8* %p3, align 1
+  %i4 = load i8, i8* %p4, align 1
+  %i5 = load i8, i8* %p5, align 1
+  %i6 = load i8, i8* %p6, align 1
+  %i7 = load i8, i8* %p7, align 1
+  %x0 = zext i8 %i0 to i16
+  %x1 = zext i8 %i1 to i16
+  %x2 = zext i8 %i2 to i16
+  %x3 = zext i8 %i3 to i16
+  %x4 = zext i8 %i4 to i16
+  %x5 = zext i8 %i5 to i16
+  %x6 = zext i8 %i6 to i16
+  %x7 = zext i8 %i7 to i16
+  %v0 = insertelement <8 x i16> undef, i16 %x0, i32 0
+  %v1 = insertelement <8 x i16>   %v0, i16 %x1, i32 1
+  %v2 = insertelement <8 x i16>   %v1, i16 %x2, i32 2
+  %v3 = insertelement <8 x i16>   %v2, i16 %x3, i32 3
+  %v4 = insertelement <8 x i16>   %v3, i16 %x4, i32 4
+  %v5 = insertelement <8 x i16>   %v4, i16 %x5, i32 5
+  %v6 = insertelement <8 x i16>   %v5, i16 %x6, i32 6
+  %v7 = insertelement <8 x i16>   %v6, i16 %x7, i32 7
+  ret <8 x i16> %v7
+}
+
+define <8 x i32> @loadext_8i8_to_8i32(i8* %p0) {
+; CHECK-LABEL: @loadext_8i8_to_8i32(
+; CHECK-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; CHECK-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
+; CHECK-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
+; CHECK-NEXT:    [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4
+; CHECK-NEXT:    [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5
+; CHECK-NEXT:    [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6
+; CHECK-NEXT:    [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <8 x i8>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = zext <8 x i8> [[TMP2]] to <8 x i32>
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <8 x i32> [[TMP3]], i32 0
+; CHECK-NEXT:    [[V0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <8 x i32> [[TMP3]], i32 1
+; CHECK-NEXT:    [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[TMP5]], i32 1
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <8 x i32> [[TMP3]], i32 2
+; CHECK-NEXT:    [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[TMP6]], i32 2
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <8 x i32> [[TMP3]], i32 3
+; CHECK-NEXT:    [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[TMP7]], i32 3
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <8 x i32> [[TMP3]], i32 4
+; CHECK-NEXT:    [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[TMP8]], i32 4
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <8 x i32> [[TMP3]], i32 5
+; CHECK-NEXT:    [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[TMP9]], i32 5
+; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <8 x i32> [[TMP3]], i32 6
+; CHECK-NEXT:    [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[TMP10]], i32 6
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <8 x i32> [[TMP3]], i32 7
+; CHECK-NEXT:    [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[TMP11]], i32 7
+; CHECK-NEXT:    ret <8 x i32> [[V7]]
+;
+  %p1 = getelementptr inbounds i8, i8* %p0, i64 1
+  %p2 = getelementptr inbounds i8, i8* %p0, i64 2
+  %p3 = getelementptr inbounds i8, i8* %p0, i64 3
+  %p4 = getelementptr inbounds i8, i8* %p0, i64 4
+  %p5 = getelementptr inbounds i8, i8* %p0, i64 5
+  %p6 = getelementptr inbounds i8, i8* %p0, i64 6
+  %p7 = getelementptr inbounds i8, i8* %p0, i64 7
+  %i0 = load i8, i8* %p0, align 1
+  %i1 = load i8, i8* %p1, align 1
+  %i2 = load i8, i8* %p2, align 1
+  %i3 = load i8, i8* %p3, align 1
+  %i4 = load i8, i8* %p4, align 1
+  %i5 = load i8, i8* %p5, align 1
+  %i6 = load i8, i8* %p6, align 1
+  %i7 = load i8, i8* %p7, align 1
+  %x0 = zext i8 %i0 to i32
+  %x1 = zext i8 %i1 to i32
+  %x2 = zext i8 %i2 to i32
+  %x3 = zext i8 %i3 to i32
+  %x4 = zext i8 %i4 to i32
+  %x5 = zext i8 %i5 to i32
+  %x6 = zext i8 %i6 to i32
+  %x7 = zext i8 %i7 to i32
+  %v0 = insertelement <8 x i32> undef, i32 %x0, i32 0
+  %v1 = insertelement <8 x i32>   %v0, i32 %x1, i32 1
+  %v2 = insertelement <8 x i32>   %v1, i32 %x2, i32 2
+  %v3 = insertelement <8 x i32>   %v2, i32 %x3, i32 3
+  %v4 = insertelement <8 x i32>   %v3, i32 %x4, i32 4
+  %v5 = insertelement <8 x i32>   %v4, i32 %x5, i32 5
+  %v6 = insertelement <8 x i32>   %v5, i32 %x6, i32 6
+  %v7 = insertelement <8 x i32>   %v6, i32 %x7, i32 7
+  ret <8 x i32> %v7
+}
+
+define <16 x i16> @loadext_16i8_to_16i16(i8* %p0) {
+; CHECK-LABEL: @loadext_16i8_to_16i16(
+; CHECK-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; CHECK-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
+; CHECK-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
+; CHECK-NEXT:    [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4
+; CHECK-NEXT:    [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5
+; CHECK-NEXT:    [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6
+; CHECK-NEXT:    [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7
+; CHECK-NEXT:    [[P8:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 8
+; CHECK-NEXT:    [[P9:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 9
+; CHECK-NEXT:    [[P10:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 10
+; CHECK-NEXT:    [[P11:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 11
+; CHECK-NEXT:    [[P12:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 12
+; CHECK-NEXT:    [[P13:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 13
+; CHECK-NEXT:    [[P14:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 14
+; CHECK-NEXT:    [[P15:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 15
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <16 x i8>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[TMP1]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = zext <16 x i8> [[TMP2]] to <16 x i16>
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <16 x i16> [[TMP3]], i32 0
+; CHECK-NEXT:    [[V0:%.*]] = insertelement <16 x i16> undef, i16 [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <16 x i16> [[TMP3]], i32 1
+; CHECK-NEXT:    [[V1:%.*]] = insertelement <16 x i16> [[V0]], i16 [[TMP5]], i32 1
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <16 x i16> [[TMP3]], i32 2
+; CHECK-NEXT:    [[V2:%.*]] = insertelement <16 x i16> [[V1]], i16 [[TMP6]], i32 2
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <16 x i16> [[TMP3]], i32 3
+; CHECK-NEXT:    [[V3:%.*]] = insertelement <16 x i16> [[V2]], i16 [[TMP7]], i32 3
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <16 x i16> [[TMP3]], i32 4
+; CHECK-NEXT:    [[V4:%.*]] = insertelement <16 x i16> [[V3]], i16 [[TMP8]], i32 4
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <16 x i16> [[TMP3]], i32 5
+; CHECK-NEXT:    [[V5:%.*]] = insertelement <16 x i16> [[V4]], i16 [[TMP9]], i32 5
+; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <16 x i16> [[TMP3]], i32 6
+; CHECK-NEXT:    [[V6:%.*]] = insertelement <16 x i16> [[V5]], i16 [[TMP10]], i32 6
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <16 x i16> [[TMP3]], i32 7
+; CHECK-NEXT:    [[V7:%.*]] = insertelement <16 x i16> [[V6]], i16 [[TMP11]], i32 7
+; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <16 x i16> [[TMP3]], i32 8
+; CHECK-NEXT:    [[V8:%.*]] = insertelement <16 x i16> [[V7]], i16 [[TMP12]], i32 8
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <16 x i16> [[TMP3]], i32 9
+; CHECK-NEXT:    [[V9:%.*]] = insertelement <16 x i16> [[V8]], i16 [[TMP13]], i32 9
+; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <16 x i16> [[TMP3]], i32 10
+; CHECK-NEXT:    [[V10:%.*]] = insertelement <16 x i16> [[V9]], i16 [[TMP14]], i32 10
+; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <16 x i16> [[TMP3]], i32 11
+; CHECK-NEXT:    [[V11:%.*]] = insertelement <16 x i16> [[V10]], i16 [[TMP15]], i32 11
+; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <16 x i16> [[TMP3]], i32 12
+; CHECK-NEXT:    [[V12:%.*]] = insertelement <16 x i16> [[V11]], i16 [[TMP16]], i32 12
+; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <16 x i16> [[TMP3]], i32 13
+; CHECK-NEXT:    [[V13:%.*]] = insertelement <16 x i16> [[V12]], i16 [[TMP17]], i32 13
+; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <16 x i16> [[TMP3]], i32 14
+; CHECK-NEXT:    [[V14:%.*]] = insertelement <16 x i16> [[V13]], i16 [[TMP18]], i32 14
+; CHECK-NEXT:    [[TMP19:%.*]] = extractelement <16 x i16> [[TMP3]], i32 15
+; CHECK-NEXT:    [[V15:%.*]] = insertelement <16 x i16> [[V14]], i16 [[TMP19]], i32 15
+; CHECK-NEXT:    ret <16 x i16> [[V15]]
+;
+  %p1  = getelementptr inbounds i8, i8* %p0, i64 1
+  %p2  = getelementptr inbounds i8, i8* %p0, i64 2
+  %p3  = getelementptr inbounds i8, i8* %p0, i64 3
+  %p4  = getelementptr inbounds i8, i8* %p0, i64 4
+  %p5  = getelementptr inbounds i8, i8* %p0, i64 5
+  %p6  = getelementptr inbounds i8, i8* %p0, i64 6
+  %p7  = getelementptr inbounds i8, i8* %p0, i64 7
+  %p8  = getelementptr inbounds i8, i8* %p0, i64 8
+  %p9  = getelementptr inbounds i8, i8* %p0, i64 9
+  %p10 = getelementptr inbounds i8, i8* %p0, i64 10
+  %p11 = getelementptr inbounds i8, i8* %p0, i64 11
+  %p12 = getelementptr inbounds i8, i8* %p0, i64 12
+  %p13 = getelementptr inbounds i8, i8* %p0, i64 13
+  %p14 = getelementptr inbounds i8, i8* %p0, i64 14
+  %p15 = getelementptr inbounds i8, i8* %p0, i64 15
+  %i0  = load i8, i8* %p0,  align 1
+  %i1  = load i8, i8* %p1,  align 1
+  %i2  = load i8, i8* %p2,  align 1
+  %i3  = load i8, i8* %p3,  align 1
+  %i4  = load i8, i8* %p4,  align 1
+  %i5  = load i8, i8* %p5,  align 1
+  %i6  = load i8, i8* %p6,  align 1
+  %i7  = load i8, i8* %p7,  align 1
+  %i8  = load i8, i8* %p8,  align 1
+  %i9  = load i8, i8* %p9,  align 1
+  %i10 = load i8, i8* %p10, align 1
+  %i11 = load i8, i8* %p11, align 1
+  %i12 = load i8, i8* %p12, align 1
+  %i13 = load i8, i8* %p13, align 1
+  %i14 = load i8, i8* %p14, align 1
+  %i15 = load i8, i8* %p15, align 1
+  %x0  = zext i8 %i0  to i16
+  %x1  = zext i8 %i1  to i16
+  %x2  = zext i8 %i2  to i16
+  %x3  = zext i8 %i3  to i16
+  %x4  = zext i8 %i4  to i16
+  %x5  = zext i8 %i5  to i16
+  %x6  = zext i8 %i6  to i16
+  %x7  = zext i8 %i7  to i16
+  %x8  = zext i8 %i8  to i16
+  %x9  = zext i8 %i9  to i16
+  %x10 = zext i8 %i10 to i16
+  %x11 = zext i8 %i11 to i16
+  %x12 = zext i8 %i12 to i16
+  %x13 = zext i8 %i13 to i16
+  %x14 = zext i8 %i14 to i16
+  %x15 = zext i8 %i15 to i16
+  %v0  = insertelement <16 x i16> undef, i16 %x0,  i32 0
+  %v1  = insertelement <16 x i16>  %v0,  i16 %x1,  i32 1
+  %v2  = insertelement <16 x i16>  %v1,  i16 %x2,  i32 2
+  %v3  = insertelement <16 x i16>  %v2,  i16 %x3,  i32 3
+  %v4  = insertelement <16 x i16>  %v3,  i16 %x4,  i32 4
+  %v5  = insertelement <16 x i16>  %v4,  i16 %x5,  i32 5
+  %v6  = insertelement <16 x i16>  %v5,  i16 %x6,  i32 6
+  %v7  = insertelement <16 x i16>  %v6,  i16 %x7,  i32 7
+  %v8  = insertelement <16 x i16>  %v7,  i16 %x8,  i32 8
+  %v9  = insertelement <16 x i16>  %v8,  i16 %x9,  i32 9
+  %v10 = insertelement <16 x i16>  %v9,  i16 %x10, i32 10
+  %v11 = insertelement <16 x i16>  %v10, i16 %x11, i32 11
+  %v12 = insertelement <16 x i16>  %v11, i16 %x12, i32 12
+  %v13 = insertelement <16 x i16>  %v12, i16 %x13, i32 13
+  %v14 = insertelement <16 x i16>  %v13, i16 %x14, i32 14
+  %v15 = insertelement <16 x i16>  %v14, i16 %x15, i32 15
+  ret <16 x i16> %v15
+}
+
+;
+; vXi16
+;
+
+define <2 x i64> @loadext_2i16_to_2i64(i16* %p0) {
+; SSE2-LABEL: @loadext_2i16_to_2i64(
+; SSE2-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
+; SSE2-NEXT:    [[I0:%.*]] = load i16, i16* [[P0]], align 1
+; SSE2-NEXT:    [[I1:%.*]] = load i16, i16* [[P1]], align 1
+; SSE2-NEXT:    [[X0:%.*]] = zext i16 [[I0]] to i64
+; SSE2-NEXT:    [[X1:%.*]] = zext i16 [[I1]] to i64
+; SSE2-NEXT:    [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[X0]], i32 0
+; SSE2-NEXT:    [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[X1]], i32 1
+; SSE2-NEXT:    ret <2 x i64> [[V1]]
+;
+; SLM-LABEL: @loadext_2i16_to_2i64(
+; SLM-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
+; SLM-NEXT:    [[TMP1:%.*]] = bitcast i16* [[P0]] to <2 x i16>*
+; SLM-NEXT:    [[TMP2:%.*]] = load <2 x i16>, <2 x i16>* [[TMP1]], align 1
+; SLM-NEXT:    [[TMP3:%.*]] = zext <2 x i16> [[TMP2]] to <2 x i64>
+; SLM-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
+; SLM-NEXT:    [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[TMP4]], i32 0
+; SLM-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
+; SLM-NEXT:    [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1
+; SLM-NEXT:    ret <2 x i64> [[V1]]
+;
+; AVX-LABEL: @loadext_2i16_to_2i64(
+; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
+; AVX-NEXT:    [[TMP1:%.*]] = bitcast i16* [[P0]] to <2 x i16>*
+; AVX-NEXT:    [[TMP2:%.*]] = load <2 x i16>, <2 x i16>* [[TMP1]], align 1
+; AVX-NEXT:    [[TMP3:%.*]] = zext <2 x i16> [[TMP2]] to <2 x i64>
+; AVX-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
+; AVX-NEXT:    [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[TMP4]], i32 0
+; AVX-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
+; AVX-NEXT:    [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1
+; AVX-NEXT:    ret <2 x i64> [[V1]]
+;
+  %p1 = getelementptr inbounds i16, i16* %p0, i64 1
+  %i0 = load i16, i16* %p0, align 1
+  %i1 = load i16, i16* %p1, align 1
+  %x0 = zext i16 %i0 to i64
+  %x1 = zext i16 %i1 to i64
+  %v0 = insertelement <2 x i64> undef, i64 %x0, i32 0
+  %v1 = insertelement <2 x i64>   %v0, i64 %x1, i32 1
+  ret <2 x i64> %v1
+}
+
+define <4 x i32> @loadext_4i16_to_4i32(i16* %p0) {
+; CHECK-LABEL: @loadext_4i16_to_4i32(
+; CHECK-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
+; CHECK-NEXT:    [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
+; CHECK-NEXT:    [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i32>
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0
+; CHECK-NEXT:    [[V0:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1
+; CHECK-NEXT:    [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[TMP5]], i32 1
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2
+; CHECK-NEXT:    [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[TMP6]], i32 2
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3
+; CHECK-NEXT:    [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[TMP7]], i32 3
+; CHECK-NEXT:    ret <4 x i32> [[V3]]
+;
+  %p1 = getelementptr inbounds i16, i16* %p0, i64 1
+  %p2 = getelementptr inbounds i16, i16* %p0, i64 2
+  %p3 = getelementptr inbounds i16, i16* %p0, i64 3
+  %i0 = load i16, i16* %p0, align 1
+  %i1 = load i16, i16* %p1, align 1
+  %i2 = load i16, i16* %p2, align 1
+  %i3 = load i16, i16* %p3, align 1
+  %x0 = zext i16 %i0 to i32
+  %x1 = zext i16 %i1 to i32
+  %x2 = zext i16 %i2 to i32
+  %x3 = zext i16 %i3 to i32
+  %v0 = insertelement <4 x i32> undef, i32 %x0, i32 0
+  %v1 = insertelement <4 x i32>   %v0, i32 %x1, i32 1
+  %v2 = insertelement <4 x i32>   %v1, i32 %x2, i32 2
+  %v3 = insertelement <4 x i32>   %v2, i32 %x3, i32 3
+  ret <4 x i32> %v3
+}
+
+define <4 x i64> @loadext_4i16_to_4i64(i16* %p0) {
+; SSE2-LABEL: @loadext_4i16_to_4i64(
+; SSE2-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
+; SSE2-NEXT:    [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
+; SSE2-NEXT:    [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
+; SSE2-NEXT:    [[I0:%.*]] = load i16, i16* [[P0]], align 1
+; SSE2-NEXT:    [[I1:%.*]] = load i16, i16* [[P1]], align 1
+; SSE2-NEXT:    [[I2:%.*]] = load i16, i16* [[P2]], align 1
+; SSE2-NEXT:    [[I3:%.*]] = load i16, i16* [[P3]], align 1
+; SSE2-NEXT:    [[X0:%.*]] = zext i16 [[I0]] to i64
+; SSE2-NEXT:    [[X1:%.*]] = zext i16 [[I1]] to i64
+; SSE2-NEXT:    [[X2:%.*]] = zext i16 [[I2]] to i64
+; SSE2-NEXT:    [[X3:%.*]] = zext i16 [[I3]] to i64
+; SSE2-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[X0]], i32 0
+; SSE2-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[X1]], i32 1
+; SSE2-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2
+; SSE2-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
+; SSE2-NEXT:    ret <4 x i64> [[V3]]
+;
+; SLM-LABEL: @loadext_4i16_to_4i64(
+; SLM-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
+; SLM-NEXT:    [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
+; SLM-NEXT:    [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
+; SLM-NEXT:    [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>*
+; SLM-NEXT:    [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1
+; SLM-NEXT:    [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i64>
+; SLM-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
+; SLM-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
+; SLM-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
+; SLM-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
+; SLM-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
+; SLM-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
+; SLM-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
+; SLM-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
+; SLM-NEXT:    ret <4 x i64> [[V3]]
+;
+; AVX-LABEL: @loadext_4i16_to_4i64(
+; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
+; AVX-NEXT:    [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
+; AVX-NEXT:    [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
+; AVX-NEXT:    [[TMP1:%.*]] = bitcast i16* [[P0]] to <2 x i16>*
+; AVX-NEXT:    [[TMP2:%.*]] = load <2 x i16>, <2 x i16>* [[TMP1]], align 1
+; AVX-NEXT:    [[I2:%.*]] = load i16, i16* [[P2]], align 1
+; AVX-NEXT:    [[I3:%.*]] = load i16, i16* [[P3]], align 1
+; AVX-NEXT:    [[TMP3:%.*]] = zext <2 x i16> [[TMP2]] to <2 x i64>
+; AVX-NEXT:    [[X2:%.*]] = zext i16 [[I2]] to i64
+; AVX-NEXT:    [[X3:%.*]] = zext i16 [[I3]] to i64
+; AVX-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
+; AVX-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
+; AVX-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
+; AVX-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
+; AVX-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2
+; AVX-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
+; AVX-NEXT:    ret <4 x i64> [[V3]]
+;
+  %p1 = getelementptr inbounds i16, i16* %p0, i64 1
+  %p2 = getelementptr inbounds i16, i16* %p0, i64 2
+  %p3 = getelementptr inbounds i16, i16* %p0, i64 3
+  %i0 = load i16, i16* %p0, align 1
+  %i1 = load i16, i16* %p1, align 1
+  %i2 = load i16, i16* %p2, align 1
+  %i3 = load i16, i16* %p3, align 1
+  %x0 = zext i16 %i0 to i64
+  %x1 = zext i16 %i1 to i64
+  %x2 = zext i16 %i2 to i64
+  %x3 = zext i16 %i3 to i64
+  %v0 = insertelement <4 x i64> undef, i64 %x0, i32 0
+  %v1 = insertelement <4 x i64>   %v0, i64 %x1, i32 1
+  %v2 = insertelement <4 x i64>   %v1, i64 %x2, i32 2
+  %v3 = insertelement <4 x i64>   %v2, i64 %x3, i32 3
+  ret <4 x i64> %v3
+}
+
+define <8 x i32> @loadext_8i16_to_8i32(i16* %p0) {
+; CHECK-LABEL: @loadext_8i16_to_8i32(
+; CHECK-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
+; CHECK-NEXT:    [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
+; CHECK-NEXT:    [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
+; CHECK-NEXT:    [[P4:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 4
+; CHECK-NEXT:    [[P5:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 5
+; CHECK-NEXT:    [[P6:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 6
+; CHECK-NEXT:    [[P7:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 7
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16* [[P0]] to <8 x i16>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = zext <8 x i16> [[TMP2]] to <8 x i32>
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <8 x i32> [[TMP3]], i32 0
+; CHECK-NEXT:    [[V0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <8 x i32> [[TMP3]], i32 1
+; CHECK-NEXT:    [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[TMP5]], i32 1
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <8 x i32> [[TMP3]], i32 2
+; CHECK-NEXT:    [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[TMP6]], i32 2
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <8 x i32> [[TMP3]], i32 3
+; CHECK-NEXT:    [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[TMP7]], i32 3
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <8 x i32> [[TMP3]], i32 4
+; CHECK-NEXT:    [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[TMP8]], i32 4
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <8 x i32> [[TMP3]], i32 5
+; CHECK-NEXT:    [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[TMP9]], i32 5
+; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <8 x i32> [[TMP3]], i32 6
+; CHECK-NEXT:    [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[TMP10]], i32 6
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <8 x i32> [[TMP3]], i32 7
+; CHECK-NEXT:    [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[TMP11]], i32 7
+; CHECK-NEXT:    ret <8 x i32> [[V7]]
+;
+  %p1 = getelementptr inbounds i16, i16* %p0, i64 1
+  %p2 = getelementptr inbounds i16, i16* %p0, i64 2
+  %p3 = getelementptr inbounds i16, i16* %p0, i64 3
+  %p4 = getelementptr inbounds i16, i16* %p0, i64 4
+  %p5 = getelementptr inbounds i16, i16* %p0, i64 5
+  %p6 = getelementptr inbounds i16, i16* %p0, i64 6
+  %p7 = getelementptr inbounds i16, i16* %p0, i64 7
+  %i0 = load i16, i16* %p0, align 1
+  %i1 = load i16, i16* %p1, align 1
+  %i2 = load i16, i16* %p2, align 1
+  %i3 = load i16, i16* %p3, align 1
+  %i4 = load i16, i16* %p4, align 1
+  %i5 = load i16, i16* %p5, align 1
+  %i6 = load i16, i16* %p6, align 1
+  %i7 = load i16, i16* %p7, align 1
+  %x0 = zext i16 %i0 to i32
+  %x1 = zext i16 %i1 to i32
+  %x2 = zext i16 %i2 to i32
+  %x3 = zext i16 %i3 to i32
+  %x4 = zext i16 %i4 to i32
+  %x5 = zext i16 %i5 to i32
+  %x6 = zext i16 %i6 to i32
+  %x7 = zext i16 %i7 to i32
+  %v0 = insertelement <8 x i32> undef, i32 %x0, i32 0
+  %v1 = insertelement <8 x i32>   %v0, i32 %x1, i32 1
+  %v2 = insertelement <8 x i32>   %v1, i32 %x2, i32 2
+  %v3 = insertelement <8 x i32>   %v2, i32 %x3, i32 3
+  %v4 = insertelement <8 x i32>   %v3, i32 %x4, i32 4
+  %v5 = insertelement <8 x i32>   %v4, i32 %x5, i32 5
+  %v6 = insertelement <8 x i32>   %v5, i32 %x6, i32 6
+  %v7 = insertelement <8 x i32>   %v6, i32 %x7, i32 7
+  ret <8 x i32> %v7
+}
+
+;
+; vXi32
+;
+
+define <2 x i64> @loadext_2i32_to_2i64(i32* %p0) {
+; SSE2-LABEL: @loadext_2i32_to_2i64(
+; SSE2-NEXT:    [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
+; SSE2-NEXT:    [[I0:%.*]] = load i32, i32* [[P0]], align 1
+; SSE2-NEXT:    [[I1:%.*]] = load i32, i32* [[P1]], align 1
+; SSE2-NEXT:    [[X0:%.*]] = zext i32 [[I0]] to i64
+; SSE2-NEXT:    [[X1:%.*]] = zext i32 [[I1]] to i64
+; SSE2-NEXT:    [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[X0]], i32 0
+; SSE2-NEXT:    [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[X1]], i32 1
+; SSE2-NEXT:    ret <2 x i64> [[V1]]
+;
+; SLM-LABEL: @loadext_2i32_to_2i64(
+; SLM-NEXT:    [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
+; SLM-NEXT:    [[TMP1:%.*]] = bitcast i32* [[P0]] to <2 x i32>*
+; SLM-NEXT:    [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 1
+; SLM-NEXT:    [[TMP3:%.*]] = zext <2 x i32> [[TMP2]] to <2 x i64>
+; SLM-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
+; SLM-NEXT:    [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[TMP4]], i32 0
+; SLM-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
+; SLM-NEXT:    [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1
+; SLM-NEXT:    ret <2 x i64> [[V1]]
+;
+; AVX-LABEL: @loadext_2i32_to_2i64(
+; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
+; AVX-NEXT:    [[TMP1:%.*]] = bitcast i32* [[P0]] to <2 x i32>*
+; AVX-NEXT:    [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 1
+; AVX-NEXT:    [[TMP3:%.*]] = zext <2 x i32> [[TMP2]] to <2 x i64>
+; AVX-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
+; AVX-NEXT:    [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[TMP4]], i32 0
+; AVX-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
+; AVX-NEXT:    [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1
+; AVX-NEXT:    ret <2 x i64> [[V1]]
+;
+  %p1 = getelementptr inbounds i32, i32* %p0, i64 1
+  %i0 = load i32, i32* %p0, align 1
+  %i1 = load i32, i32* %p1, align 1
+  %x0 = zext i32 %i0 to i64
+  %x1 = zext i32 %i1 to i64
+  %v0 = insertelement <2 x i64> undef, i64 %x0, i32 0
+  %v1 = insertelement <2 x i64>   %v0, i64 %x1, i32 1
+  ret <2 x i64> %v1
+}
+
+define <4 x i64> @loadext_4i32_to_4i64(i32* %p0) {
+; SSE2-LABEL: @loadext_4i32_to_4i64(
+; SSE2-NEXT:    [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
+; SSE2-NEXT:    [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2
+; SSE2-NEXT:    [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3
+; SSE2-NEXT:    [[I0:%.*]] = load i32, i32* [[P0]], align 1
+; SSE2-NEXT:    [[I1:%.*]] = load i32, i32* [[P1]], align 1
+; SSE2-NEXT:    [[I2:%.*]] = load i32, i32* [[P2]], align 1
+; SSE2-NEXT:    [[I3:%.*]] = load i32, i32* [[P3]], align 1
+; SSE2-NEXT:    [[X0:%.*]] = zext i32 [[I0]] to i64
+; SSE2-NEXT:    [[X1:%.*]] = zext i32 [[I1]] to i64
+; SSE2-NEXT:    [[X2:%.*]] = zext i32 [[I2]] to i64
+; SSE2-NEXT:    [[X3:%.*]] = zext i32 [[I3]] to i64
+; SSE2-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[X0]], i32 0
+; SSE2-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[X1]], i32 1
+; SSE2-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2
+; SSE2-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
+; SSE2-NEXT:    ret <4 x i64> [[V3]]
+;
+; SLM-LABEL: @loadext_4i32_to_4i64(
+; SLM-NEXT:    [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
+; SLM-NEXT:    [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2
+; SLM-NEXT:    [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3
+; SLM-NEXT:    [[TMP1:%.*]] = bitcast i32* [[P0]] to <4 x i32>*
+; SLM-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 1
+; SLM-NEXT:    [[TMP3:%.*]] = zext <4 x i32> [[TMP2]] to <4 x i64>
+; SLM-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
+; SLM-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
+; SLM-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
+; SLM-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
+; SLM-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
+; SLM-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
+; SLM-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
+; SLM-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
+; SLM-NEXT:    ret <4 x i64> [[V3]]
+;
+; AVX1-LABEL: @loadext_4i32_to_4i64(
+; AVX1-NEXT:    [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
+; AVX1-NEXT:    [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2
+; AVX1-NEXT:    [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3
+; AVX1-NEXT:    [[TMP1:%.*]] = bitcast i32* [[P0]] to <2 x i32>*
+; AVX1-NEXT:    [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 1
+; AVX1-NEXT:    [[I2:%.*]] = load i32, i32* [[P2]], align 1
+; AVX1-NEXT:    [[I3:%.*]] = load i32, i32* [[P3]], align 1
+; AVX1-NEXT:    [[TMP3:%.*]] = zext <2 x i32> [[TMP2]] to <2 x i64>
+; AVX1-NEXT:    [[X2:%.*]] = zext i32 [[I2]] to i64
+; AVX1-NEXT:    [[X3:%.*]] = zext i32 [[I3]] to i64
+; AVX1-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
+; AVX1-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
+; AVX1-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
+; AVX1-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
+; AVX1-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2
+; AVX1-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
+; AVX1-NEXT:    ret <4 x i64> [[V3]]
+;
+; AVX2-LABEL: @loadext_4i32_to_4i64(
+; AVX2-NEXT:    [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
+; AVX2-NEXT:    [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2
+; AVX2-NEXT:    [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3
+; AVX2-NEXT:    [[TMP1:%.*]] = bitcast i32* [[P0]] to <4 x i32>*
+; AVX2-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 1
+; AVX2-NEXT:    [[TMP3:%.*]] = zext <4 x i32> [[TMP2]] to <4 x i64>
+; AVX2-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
+; AVX2-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
+; AVX2-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
+; AVX2-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
+; AVX2-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
+; AVX2-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
+; AVX2-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
+; AVX2-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
+; AVX2-NEXT:    ret <4 x i64> [[V3]]
+;
+; AVX512-LABEL: @loadext_4i32_to_4i64(
+; AVX512-NEXT:    [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
+; AVX512-NEXT:    [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2
+; AVX512-NEXT:    [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3
+; AVX512-NEXT:    [[TMP1:%.*]] = bitcast i32* [[P0]] to <4 x i32>*
+; AVX512-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 1
+; AVX512-NEXT:    [[TMP3:%.*]] = zext <4 x i32> [[TMP2]] to <4 x i64>
+; AVX512-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
+; AVX512-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
+; AVX512-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
+; AVX512-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
+; AVX512-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
+; AVX512-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
+; AVX512-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
+; AVX512-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
+; AVX512-NEXT:    ret <4 x i64> [[V3]]
+;
+  %p1 = getelementptr inbounds i32, i32* %p0, i64 1
+  %p2 = getelementptr inbounds i32, i32* %p0, i64 2
+  %p3 = getelementptr inbounds i32, i32* %p0, i64 3
+  %i0 = load i32, i32* %p0, align 1
+  %i1 = load i32, i32* %p1, align 1
+  %i2 = load i32, i32* %p2, align 1
+  %i3 = load i32, i32* %p3, align 1
+  %x0 = zext i32 %i0 to i64
+  %x1 = zext i32 %i1 to i64
+  %x2 = zext i32 %i2 to i64
+  %x3 = zext i32 %i3 to i64
+  %v0 = insertelement <4 x i64> undef, i64 %x0, i32 0
+  %v1 = insertelement <4 x i64>   %v0, i64 %x1, i32 1
+  %v2 = insertelement <4 x i64>   %v1, i64 %x2, i32 2
+  %v3 = insertelement <4 x i64>   %v2, i64 %x3, i32 3
+  ret <4 x i64> %v3
+}

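A note on the pattern shared by all of the loadext tests above: each one builds a vector out of adjacent scalar loads that are individually widened with zext and reassembled through an insertelement chain. When the per-target cost model considers it profitable, the SLP vectorizer collapses the whole chain into one vector load plus one vector zext, which is why the SSE2, SLM, AVX1, AVX2 and AVX512 prefixes check different output. A minimal two-lane sketch of the input shape (function and value names here are illustrative, not taken from the test file):

  define <2 x i64> @sketch_loadext(i32* %p) {
    ; two adjacent scalar loads ...
    %q  = getelementptr inbounds i32, i32* %p, i64 1
    %a  = load i32, i32* %p, align 4
    %b  = load i32, i32* %q, align 4
    ; ... widened lane by lane ...
    %xa = zext i32 %a to i64
    %xb = zext i32 %b to i64
    ; ... and reassembled into a vector.
    %v0 = insertelement <2 x i64> undef, i64 %xa, i32 0
    %v1 = insertelement <2 x i64> %v0, i64 %xb, i32 1
    ret <2 x i64> %v1
  }

  ; When profitable this becomes, roughly:
  ;   %w = bitcast i32* %p to <2 x i32>*
  ;   %v = load <2 x i32>, <2 x i32>* %w, align 4
  ;   %z = zext <2 x i32> %v to <2 x i64>
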
Added: llvm/trunk/test/Transforms/SLPVectorizer/XCore/lit.local.cfg
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/XCore/lit.local.cfg?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/XCore/lit.local.cfg (added)
+++ llvm/trunk/test/Transforms/SLPVectorizer/XCore/lit.local.cfg Tue Apr 16 21:52:47 2019
@@ -0,0 +1,2 @@
+if 'XCore' not in config.root.targets:
+    config.unsupported = True

Added: llvm/trunk/test/Transforms/SLPVectorizer/XCore/no-vector-registers.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/XCore/no-vector-registers.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/XCore/no-vector-registers.ll (added)
+++ llvm/trunk/test/Transforms/SLPVectorizer/XCore/no-vector-registers.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,38 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=xcore  | FileCheck %s
+
+target datalayout = "e-p:32:32:32-a0:0:32-n32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f16:16:32-f32:32:32-f64:32:32"
+target triple = "xcore"
+
+; Simple 3-pair chain with loads and stores
+define void @test1(double* %a, double* %b, double* %c) {
+; CHECK-LABEL: @test1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[I0:%.*]] = load double, double* [[A:%.*]], align 8
+; CHECK-NEXT:    [[I1:%.*]] = load double, double* [[B:%.*]], align 8
+; CHECK-NEXT:    [[MUL:%.*]] = fmul double [[I0]], [[I1]]
+; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[A]], i64 1
+; CHECK-NEXT:    [[I3:%.*]] = load double, double* [[ARRAYIDX3]], align 8
+; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds double, double* [[B]], i64 1
+; CHECK-NEXT:    [[I4:%.*]] = load double, double* [[ARRAYIDX4]], align 8
+; CHECK-NEXT:    [[MUL5:%.*]] = fmul double [[I3]], [[I4]]
+; CHECK-NEXT:    store double [[MUL]], double* [[C:%.*]], align 8
+; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[C]], i64 1
+; CHECK-NEXT:    store double [[MUL5]], double* [[ARRAYIDX5]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %i0 = load double, double* %a, align 8
+  %i1 = load double, double* %b, align 8
+  %mul = fmul double %i0, %i1
+  %arrayidx3 = getelementptr inbounds double, double* %a, i64 1
+  %i3 = load double, double* %arrayidx3, align 8
+  %arrayidx4 = getelementptr inbounds double, double* %b, i64 1
+  %i4 = load double, double* %arrayidx4, align 8
+  %mul5 = fmul double %i3, %i4
+  store double %mul, double* %c, align 8
+  %arrayidx5 = getelementptr inbounds double, double* %c, i64 1
+  store double %mul5, double* %arrayidx5, align 8
+  ret void
+}
+

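The interesting property of this test is negative: the xcore triple reports no vector registers, so the SLP vectorizer's cost model declines and the scalar chain survives untouched, as the autogenerated checks confirm. For contrast, on a target with 128-bit vectors the same input could legally be rewritten to something like the following (a hypothetical sketch, not checked by anything in this file):

  define void @test1_vectorized(double* %a, double* %b, double* %c) {
  entry:
    %pa = bitcast double* %a to <2 x double>*
    %va = load <2 x double>, <2 x double>* %pa, align 8
    %pb = bitcast double* %b to <2 x double>*
    %vb = load <2 x double>, <2 x double>* %pb, align 8
    %vm = fmul <2 x double> %va, %vb
    %pc = bitcast double* %c to <2 x double>*
    store <2 x double> %vm, <2 x double>* %pc, align 8
    ret void
  }
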
Added: llvm/trunk/test/Transforms/SLPVectorizer/int_sideeffect.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/int_sideeffect.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/int_sideeffect.ll (added)
+++ llvm/trunk/test/Transforms/SLPVectorizer/int_sideeffect.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,37 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S < %s -slp-vectorizer -slp-max-reg-size=128 -slp-min-reg-size=128 | FileCheck %s
+
+declare void @llvm.sideeffect()
+
+; Check that SLP vectorization can proceed across a call to @llvm.sideeffect.
+
+define void @test(float* %p) {
+; CHECK-LABEL: @test(
+; CHECK-NEXT:    [[P0:%.*]] = getelementptr float, float* [[P:%.*]], i64 0
+; CHECK-NEXT:    [[P1:%.*]] = getelementptr float, float* [[P]], i64 1
+; CHECK-NEXT:    [[P2:%.*]] = getelementptr float, float* [[P]], i64 2
+; CHECK-NEXT:    [[P3:%.*]] = getelementptr float, float* [[P]], i64 3
+; CHECK-NEXT:    call void @llvm.sideeffect()
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[P0]] to <4 x float>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
+; CHECK-NEXT:    call void @llvm.sideeffect()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float* [[P0]] to <4 x float>*
+; CHECK-NEXT:    store <4 x float> [[TMP2]], <4 x float>* [[TMP3]], align 4
+; CHECK-NEXT:    ret void
+;
+  %p0 = getelementptr float, float* %p, i64 0
+  %p1 = getelementptr float, float* %p, i64 1
+  %p2 = getelementptr float, float* %p, i64 2
+  %p3 = getelementptr float, float* %p, i64 3
+  %l0 = load float, float* %p0
+  %l1 = load float, float* %p1
+  %l2 = load float, float* %p2
+  call void @llvm.sideeffect()
+  %l3 = load float, float* %p3
+  store float %l0, float* %p0
+  call void @llvm.sideeffect()
+  store float %l1, float* %p1
+  store float %l2, float* %p2
+  store float %l3, float* %p3
+  ret void
+}

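@llvm.sideeffect lowers to no machine code; it only marks a point that optimizers must treat as possibly having a side effect, so that frontends can keep otherwise-empty infinite loops alive. The test above pins down that this barrier is soft for SLP: vectorizable loads and stores may still be grouped across the call. A minimal sketch of the intrinsic's usual role (hypothetical, not part of the test):

  declare void @llvm.sideeffect()

  ; Without the intrinsic this loop could be assumed to terminate or be
  ; deleted outright; with it, the loop is preserved.
  define void @spin() {
  entry:
    br label %loop

  loop:
    call void @llvm.sideeffect()
    br label %loop
  }
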
Added: llvm/trunk/test/Transforms/SROA/address-spaces.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SROA/address-spaces.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SROA/address-spaces.ll (added)
+++ llvm/trunk/test/Transforms/SROA/address-spaces.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,131 @@
+; RUN: opt < %s -sroa -S | FileCheck %s
+target datalayout = "e-p:64:64:64-p1:16:16:16-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n8:16:32:64"
+
+declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture readonly, i32, i1)
+declare void @llvm.memcpy.p1i8.p0i8.i32(i8 addrspace(1)* nocapture, i8* nocapture readonly, i32, i1)
+declare void @llvm.memcpy.p0i8.p1i8.i32(i8* nocapture, i8 addrspace(1)* nocapture readonly, i32, i1)
+declare void @llvm.memcpy.p1i8.p1i8.i32(i8 addrspace(1)* nocapture, i8 addrspace(1)* nocapture readonly, i32, i1)
+
+
+; Make sure an illegal bitcast isn't introduced
+define void @test_address_space_1_1(<2 x i64> addrspace(1)* %a, i16 addrspace(1)* %b) {
+; CHECK-LABEL: @test_address_space_1_1(
+; CHECK: load <2 x i64>, <2 x i64> addrspace(1)* %a, align 2
+; CHECK: store <2 x i64> {{.*}}, <2 x i64> addrspace(1)* {{.*}}, align 2
+; CHECK: ret void
+  %aa = alloca <2 x i64>, align 16
+  %aptr = bitcast <2 x i64> addrspace(1)* %a to i8 addrspace(1)*
+  %aaptr = bitcast <2 x i64>* %aa to i8*
+  call void @llvm.memcpy.p0i8.p1i8.i32(i8* align 2 %aaptr, i8 addrspace(1)* align 2 %aptr, i32 16, i1 false)
+  %bptr = bitcast i16 addrspace(1)* %b to i8 addrspace(1)*
+  call void @llvm.memcpy.p1i8.p0i8.i32(i8 addrspace(1)* align 2 %bptr, i8* align 2 %aaptr, i32 16, i1 false)
+  ret void
+}
+
+define void @test_address_space_1_0(<2 x i64> addrspace(1)* %a, i16* %b) {
+; CHECK-LABEL: @test_address_space_1_0(
+; CHECK: load <2 x i64>, <2 x i64> addrspace(1)* %a, align 2
+; CHECK: store <2 x i64> {{.*}}, <2 x i64>* {{.*}}, align 2
+; CHECK: ret void
+  %aa = alloca <2 x i64>, align 16
+  %aptr = bitcast <2 x i64> addrspace(1)* %a to i8 addrspace(1)*
+  %aaptr = bitcast <2 x i64>* %aa to i8*
+  call void @llvm.memcpy.p0i8.p1i8.i32(i8* align 2 %aaptr, i8 addrspace(1)* align 2 %aptr, i32 16, i1 false)
+  %bptr = bitcast i16* %b to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 2 %bptr, i8* align 2 %aaptr, i32 16, i1 false)
+  ret void
+}
+
+define void @test_address_space_0_1(<2 x i64>* %a, i16 addrspace(1)* %b) {
+; CHECK-LABEL: @test_address_space_0_1(
+; CHECK: load <2 x i64>, <2 x i64>* %a, align 2
+; CHECK: store <2 x i64> {{.*}}, <2 x i64> addrspace(1)* {{.*}}, align 2
+; CHECK: ret void
+  %aa = alloca <2 x i64>, align 16
+  %aptr = bitcast <2 x i64>* %a to i8*
+  %aaptr = bitcast <2 x i64>* %aa to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 2 %aaptr, i8* align 2 %aptr, i32 16, i1 false)
+  %bptr = bitcast i16 addrspace(1)* %b to i8 addrspace(1)*
+  call void @llvm.memcpy.p1i8.p0i8.i32(i8 addrspace(1)* align 2 %bptr, i8* align 2 %aaptr, i32 16, i1 false)
+  ret void
+}
+
+%struct.struct_test_27.0.13 = type { i32, float, i64, i8, [4 x i32] }
+
+; Function Attrs: nounwind
+define void @copy_struct([5 x i64] %in.coerce) {
+; CHECK-LABEL: @copy_struct(
+; CHECK-NOT: memcpy
+for.end:
+  %in = alloca %struct.struct_test_27.0.13, align 8
+  %0 = bitcast %struct.struct_test_27.0.13* %in to [5 x i64]*
+  store [5 x i64] %in.coerce, [5 x i64]* %0, align 8
+  %scevgep9 = getelementptr %struct.struct_test_27.0.13, %struct.struct_test_27.0.13* %in, i32 0, i32 4, i32 0
+  %scevgep910 = bitcast i32* %scevgep9 to i8*
+  call void @llvm.memcpy.p1i8.p0i8.i32(i8 addrspace(1)* align 4 undef, i8* align 4 %scevgep910, i32 16, i1 false)
+  ret void
+}
+
+%union.anon = type { i32* }
+
+@g = common global i32 0, align 4
+@l = common addrspace(3) global i32 0, align 4
+
+; Make sure an illegal bitcast isn't introduced
+define void @pr27557() {
+; CHECK-LABEL: @pr27557(
+; CHECK: %[[CAST:.*]] = bitcast i32** {{.*}} to i32 addrspace(3)**
+; CHECK: store i32 addrspace(3)* @l, i32 addrspace(3)** %[[CAST]]
+  %1 = alloca %union.anon, align 8
+  %2 = bitcast %union.anon* %1 to i32**
+  store i32* @g, i32** %2, align 8
+  %3 = bitcast %union.anon* %1 to i32 addrspace(3)**
+  store i32 addrspace(3)* @l, i32 addrspace(3)** %3, align 8
+  ret void
+}
+
+; Make sure pre-splitting doesn't try to introduce an illegal bitcast
+define float @presplit(i64 addrspace(1)* %p) {
+entry:
+; CHECK-LABEL: @presplit(
+; CHECK: %[[CAST:.*]] = bitcast i64 addrspace(1)* {{.*}} to i32 addrspace(1)*
+; CHECK: load i32, i32 addrspace(1)* %[[CAST]]
+   %b = alloca i64
+   %b.cast = bitcast i64* %b to [2 x float]*
+   %b.gep1 = getelementptr [2 x float], [2 x float]* %b.cast, i32 0, i32 0
+   %b.gep2 = getelementptr [2 x float], [2 x float]* %b.cast, i32 0, i32 1
+   %l = load i64, i64 addrspace(1)* %p
+   store i64 %l, i64* %b
+   %f1 = load float, float* %b.gep1
+   %f2 = load float, float* %b.gep2
+   %ret = fadd float %f1, %f2
+   ret float %ret
+}
+
+; Test load from and store to non-zero address space.
+define void @test_load_store_diff_addr_space([2 x float] addrspace(1)* %complex1, [2 x float] addrspace(1)* %complex2) {
+; CHECK-LABEL: @test_load_store_diff_addr_space
+; CHECK-NOT: alloca
+; CHECK: load i32, i32 addrspace(1)*
+; CHECK: load i32, i32 addrspace(1)*
+; CHECK: store i32 %{{.*}}, i32 addrspace(1)*
+; CHECK: store i32 %{{.*}}, i32 addrspace(1)*
+  %a = alloca i64
+  %a.cast = bitcast i64* %a to [2 x float]*
+  %a.gep1 = getelementptr [2 x float], [2 x float]* %a.cast, i32 0, i32 0
+  %a.gep2 = getelementptr [2 x float], [2 x float]* %a.cast, i32 0, i32 1
+  %complex1.gep = getelementptr [2 x float], [2 x float] addrspace(1)* %complex1, i32 0, i32 0
+  %p1 = bitcast float addrspace(1)* %complex1.gep to i64 addrspace(1)*
+  %v1 = load i64, i64 addrspace(1)* %p1
+  store i64 %v1, i64* %a
+  %f1 = load float, float* %a.gep1
+  %f2 = load float, float* %a.gep2
+  %sum = fadd float %f1, %f2
+  store float %sum, float* %a.gep1
+  store float %sum, float* %a.gep2
+  %v2 = load i64, i64* %a
+  %complex2.gep = getelementptr [2 x float], [2 x float] addrspace(1)* %complex2, i32 0, i32 0
+  %p2 = bitcast float addrspace(1)* %complex2.gep to i64 addrspace(1)*
+  store i64 %v2, i64 addrspace(1)* %p2
+  ret void
+}

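The recurring hazard this file guards against follows from a LangRef rule: bitcast between pointer types is only defined when both pointers are in the same address space, so SROA must never synthesize a cross-space bitcast when it rewrites a memcpy whose operands live in different spaces. For illustration (not taken from the test):

  ; Invalid IR: bitcast may not change the address space.
  ;   %bad  = bitcast i32* %p to i32 addrspace(1)*
  ; The legal way to change spaces is addrspacecast.
  ;   %good = addrspacecast i32* %p to i32 addrspace(1)*

Instead of casting, the rewritten code keeps each load and store in the address space of its own pointer, which is exactly what the CHECK lines above verify.
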
Added: llvm/trunk/test/Transforms/SROA/alignment.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SROA/alignment.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SROA/alignment.ll (added)
+++ llvm/trunk/test/Transforms/SROA/alignment.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,231 @@
+; RUN: opt < %s -sroa -S | FileCheck %s
+; RUN: opt -debugify -sroa -S < %s | FileCheck %s -check-prefix DEBUGLOC
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n8:16:32:64"
+
+declare void @llvm.memcpy.p0i8.p0i8.i32(i8*, i8*, i32, i1)
+
+define void @test1({ i8, i8 }* %a, { i8, i8 }* %b) {
+; CHECK-LABEL: @test1(
+; CHECK: %[[gep_a0:.*]] = getelementptr inbounds { i8, i8 }, { i8, i8 }* %a, i64 0, i32 0
+; CHECK: %[[a0:.*]] = load i8, i8* %[[gep_a0]], align 16
+; CHECK: %[[gep_a1:.*]] = getelementptr inbounds { i8, i8 }, { i8, i8 }* %a, i64 0, i32 1
+; CHECK: %[[a1:.*]] = load i8, i8* %[[gep_a1]], align 1
+; CHECK: %[[gep_b0:.*]] = getelementptr inbounds { i8, i8 }, { i8, i8 }* %b, i64 0, i32 0
+; CHECK: store i8 %[[a0]], i8* %[[gep_b0]], align 16
+; CHECK: %[[gep_b1:.*]] = getelementptr inbounds { i8, i8 }, { i8, i8 }* %b, i64 0, i32 1
+; CHECK: store i8 %[[a1]], i8* %[[gep_b1]], align 1
+; CHECK: ret void
+
+entry:
+  %alloca = alloca { i8, i8 }, align 16
+  %gep_a = getelementptr { i8, i8 }, { i8, i8 }* %a, i32 0, i32 0
+  %gep_alloca = getelementptr { i8, i8 }, { i8, i8 }* %alloca, i32 0, i32 0
+  %gep_b = getelementptr { i8, i8 }, { i8, i8 }* %b, i32 0, i32 0
+
+  store i8 420, i8* %gep_alloca, align 16
+
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 %gep_alloca, i8* align 16 %gep_a, i32 2, i1 false)
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 %gep_b, i8* align 16 %gep_alloca, i32 2, i1 false)
+  ret void
+}
+
+define void @test2() {
+; CHECK-LABEL: @test2(
+; CHECK: alloca i16
+; CHECK: load i8, i8* %{{.*}}
+; CHECK: store i8 42, i8* %{{.*}}
+; CHECK: ret void
+
+; Check that when sroa rewrites the alloca partition
+; it preserves the original DebugLocation.
+; DEBUGLOC-LABEL: @test2(
+; DEBUGLOC: {{.*}} = alloca {{.*}} !dbg ![[DbgLoc:[0-9]+]]
+; DEBUGLOC-LABEL: }
+;
+; DEBUGLOC: ![[DbgLoc]] = !DILocation(line: 9,
+
+entry:
+  %a = alloca { i8, i8, i8, i8 }, align 2      ; "line 9" to -debugify
+  %gep1 = getelementptr { i8, i8, i8, i8 }, { i8, i8, i8, i8 }* %a, i32 0, i32 1
+  %cast1 = bitcast i8* %gep1 to i16*
+  store volatile i16 0, i16* %cast1
+  %gep2 = getelementptr { i8, i8, i8, i8 }, { i8, i8, i8, i8 }* %a, i32 0, i32 2
+  %result = load i8, i8* %gep2
+  store i8 42, i8* %gep2
+  ret void
+}
+
+define void @PR13920(<2 x i64>* %a, i16* %b) {
+; Test that alignments on memcpy intrinsics get propagated to loads and stores.
+; CHECK-LABEL: @PR13920(
+; CHECK: load <2 x i64>, <2 x i64>* %a, align 2
+; CHECK: store <2 x i64> {{.*}}, <2 x i64>* {{.*}}, align 2
+; CHECK: ret void
+
+entry:
+  %aa = alloca <2 x i64>, align 16
+  %aptr = bitcast <2 x i64>* %a to i8*
+  %aaptr = bitcast <2 x i64>* %aa to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 2 %aaptr, i8* align 2 %aptr, i32 16, i1 false)
+  %bptr = bitcast i16* %b to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 2 %bptr, i8* align 2 %aaptr, i32 16, i1 false)
+  ret void
+}
+
+define void @test3(i8* %x) {
+; Test that when we promote an alloca to a type with lower ABI alignment, we
+; provide the needed explicit alignment that code using the alloca may be
+; expecting. However, also check that any offset within an alloca can in turn
+; reduce the alignment.
+; CHECK-LABEL: @test3(
+; CHECK: alloca [22 x i8], align 8
+; CHECK: alloca [18 x i8], align 2
+; CHECK: ret void
+
+entry:
+  %a = alloca { i8*, i8*, i8* }
+  %b = alloca { i8*, i8*, i8* }
+  %a_raw = bitcast { i8*, i8*, i8* }* %a to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 %a_raw, i8* align 8 %x, i32 22, i1 false)
+  %b_raw = bitcast { i8*, i8*, i8* }* %b to i8*
+  %b_gep = getelementptr i8, i8* %b_raw, i32 6
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 2 %b_gep, i8* align 2 %x, i32 18, i1 false)
+  ret void
+}
+
+define void @test5() {
+; Test that we preserve underaligned loads and stores when splitting. The use
+; of volatile in this test case is just to force the loads and stores to not be
+; split or promoted out of existence.
+;
+; CHECK-LABEL: @test5(
+; CHECK: alloca [9 x i8]
+; CHECK: alloca [9 x i8]
+; CHECK: store volatile double 0.0{{.*}}, double* %{{.*}}, align 1
+; CHECK: load volatile i16, i16* %{{.*}}, align 1
+; CHECK: load double, double* %{{.*}}, align 1
+; CHECK: store volatile double %{{.*}}, double* %{{.*}}, align 1
+; CHECK: load volatile i16, i16* %{{.*}}, align 1
+; CHECK: ret void
+
+entry:
+  %a = alloca [18 x i8]
+  %raw1 = getelementptr inbounds [18 x i8], [18 x i8]* %a, i32 0, i32 0
+  %ptr1 = bitcast i8* %raw1 to double*
+  store volatile double 0.0, double* %ptr1, align 1
+  %weird_gep1 = getelementptr inbounds [18 x i8], [18 x i8]* %a, i32 0, i32 7
+  %weird_cast1 = bitcast i8* %weird_gep1 to i16*
+  %weird_load1 = load volatile i16, i16* %weird_cast1, align 1
+
+  %raw2 = getelementptr inbounds [18 x i8], [18 x i8]* %a, i32 0, i32 9
+  %ptr2 = bitcast i8* %raw2 to double*
+  %d1 = load double, double* %ptr1, align 1
+  store volatile double %d1, double* %ptr2, align 1
+  %weird_gep2 = getelementptr inbounds [18 x i8], [18 x i8]* %a, i32 0, i32 16
+  %weird_cast2 = bitcast i8* %weird_gep2 to i16*
+  %weird_load2 = load volatile i16, i16* %weird_cast2, align 1
+
+  ret void
+}
+
+define void @test6() {
+; Test that we promote alignment when the underlying alloca switches to one
+; that innately provides it.
+; CHECK-LABEL: @test6(
+; CHECK: alloca double
+; CHECK: alloca double
+; CHECK-NOT: align
+; CHECK: ret void
+
+entry:
+  %a = alloca [16 x i8]
+  %raw1 = getelementptr inbounds [16 x i8], [16 x i8]* %a, i32 0, i32 0
+  %ptr1 = bitcast i8* %raw1 to double*
+  store volatile double 0.0, double* %ptr1, align 1
+
+  %raw2 = getelementptr inbounds [16 x i8], [16 x i8]* %a, i32 0, i32 8
+  %ptr2 = bitcast i8* %raw2 to double*
+  %val = load double, double* %ptr1, align 1
+  store volatile double %val, double* %ptr2, align 1
+
+  ret void
+}
+
+define void @test7(i8* %out) {
+; Test that we properly compute the destination alignment when rewriting
+; memcpys as direct loads or stores.
+; CHECK-LABEL: @test7(
+; CHECK-NOT: alloca
+
+entry:
+  %a = alloca [16 x i8]
+  %raw1 = getelementptr inbounds [16 x i8], [16 x i8]* %a, i32 0, i32 0
+  %ptr1 = bitcast i8* %raw1 to double*
+  %raw2 = getelementptr inbounds [16 x i8], [16 x i8]* %a, i32 0, i32 8
+  %ptr2 = bitcast i8* %raw2 to double*
+
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %raw1, i8* %out, i32 16, i1 false)
+; CHECK: %[[val2:.*]] = load double, double* %{{.*}}, align 1
+; CHECK: %[[val1:.*]] = load double, double* %{{.*}}, align 1
+
+  %val1 = load double, double* %ptr2, align 1
+  %val2 = load double, double* %ptr1, align 1
+
+  store double %val1, double* %ptr1, align 1
+  store double %val2, double* %ptr2, align 1
+
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %out, i8* %raw1, i32 16, i1 false)
+; CHECK: store double %[[val1]], double* %{{.*}}, align 1
+; CHECK: store double %[[val2]], double* %{{.*}}, align 1
+
+  ret void
+; CHECK: ret void
+}
+
+define void @test8() {
+; CHECK-LABEL: @test8(
+; CHECK: load i32, {{.*}}, align 1
+; CHECK: load i32, {{.*}}, align 1
+; CHECK: load i32, {{.*}}, align 1
+; CHECK: load i32, {{.*}}, align 1
+; CHECK: load i32, {{.*}}, align 1
+
+  %ptr = alloca [5 x i32], align 1
+  %ptr.8 = bitcast [5 x i32]* %ptr to i8*
+  call void @populate(i8* %ptr.8)
+  %val = load [5 x i32], [5 x i32]* %ptr, align 1
+  ret void
+}
+
+define void @test9() {
+; CHECK-LABEL: @test9(
+; CHECK: load i32, {{.*}}, align 8
+; CHECK: load i32, {{.*}}, align 4
+; CHECK: load i32, {{.*}}, align 8
+; CHECK: load i32, {{.*}}, align 4
+; CHECK: load i32, {{.*}}, align 8
+
+  %ptr = alloca [5 x i32], align 8
+  %ptr.8 = bitcast [5 x i32]* %ptr to i8*
+  call void @populate(i8* %ptr.8)
+  %val = load [5 x i32], [5 x i32]* %ptr, align 8
+  ret void
+}
+
+define void @test10() {
+; CHECK-LABEL: @test10(
+; CHECK: load i32, {{.*}}, align 2
+; CHECK: load i8, {{.*}}, align 2
+; CHECK: load i8, {{.*}}, align 1
+; CHECK: load i8, {{.*}}, align 2
+; CHECK: load i16, {{.*}}, align 2
+
+  %ptr = alloca {i32, i8, i8, {i8, i16}}, align 2
+  %ptr.8 = bitcast {i32, i8, i8, {i8, i16}}* %ptr to i8*
+  call void @populate(i8* %ptr.8)
+  %val = load {i32, i8, i8, {i8, i16}}, {i32, i8, i8, {i8, i16}}* %ptr, align 2
+  ret void
+}
+
+declare void @populate(i8*)

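@test8 through @test10 encode the offset rule SROA appears to apply when splitting an aggregate access (my reading of the checks, not a quote from the implementation): an element at byte offset O in memory aligned to A gets the largest power of two dividing both A and O, with O = 0 keeping A itself. Worked out for the align-8 [5 x i32] in @test9:

  ; element 0, offset  0 -> align 8
  ; element 1, offset  4 -> align 4
  ; element 2, offset  8 -> align 8
  ; element 3, offset 12 -> align 4
  ; element 4, offset 16 -> align 8

which matches the alternating align 8 / align 4 pattern in the CHECK lines.
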
Added: llvm/trunk/test/Transforms/SROA/alloca-address-space.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SROA/alloca-address-space.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SROA/alloca-address-space.ll (added)
+++ llvm/trunk/test/Transforms/SROA/alloca-address-space.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,113 @@
+; RUN: opt < %s -sroa -S | FileCheck %s
+target datalayout = "e-p:64:64:64-p1:16:16:16-p2:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n8:16:32:64-A2"
+
+declare void @llvm.memcpy.p2i8.p2i8.i32(i8 addrspace(2)* nocapture, i8 addrspace(2)* nocapture readonly, i32, i1)
+declare void @llvm.memcpy.p1i8.p2i8.i32(i8 addrspace(1)* nocapture, i8 addrspace(2)* nocapture readonly, i32, i1)
+declare void @llvm.memcpy.p2i8.p1i8.i32(i8 addrspace(2)* nocapture, i8 addrspace(1)* nocapture readonly, i32, i1)
+declare void @llvm.memcpy.p1i8.p1i8.i32(i8 addrspace(1)* nocapture, i8 addrspace(1)* nocapture readonly, i32, i1)
+
+
+
+; CHECK-LABEL: @test_address_space_1_1(
+; CHECK: load <2 x i64>, <2 x i64> addrspace(1)* %a, align 2
+; CHECK: store <2 x i64> {{.*}}, <2 x i64> addrspace(1)* {{.*}}, align 2
+; CHECK: ret void
+define void @test_address_space_1_1(<2 x i64> addrspace(1)* %a, i16 addrspace(1)* %b) {
+  %aa = alloca <2 x i64>, align 16, addrspace(2)
+  %aptr = bitcast <2 x i64> addrspace(1)* %a to i8 addrspace(1)*
+  %aaptr = bitcast <2 x i64> addrspace(2)* %aa to i8 addrspace(2)*
+  call void @llvm.memcpy.p2i8.p1i8.i32(i8 addrspace(2)* align 2 %aaptr, i8 addrspace(1)* align 2 %aptr, i32 16, i1 false)
+  %bptr = bitcast i16 addrspace(1)* %b to i8 addrspace(1)*
+  call void @llvm.memcpy.p1i8.p2i8.i32(i8 addrspace(1)* align 2 %bptr, i8 addrspace(2)* align 2 %aaptr, i32 16, i1 false)
+  ret void
+}
+
+; CHECK-LABEL: @test_address_space_1_0(
+; CHECK: load <2 x i64>, <2 x i64> addrspace(1)* %a, align 2
+; CHECK: store <2 x i64> {{.*}}, <2 x i64> addrspace(2)* {{.*}}, align 2
+; CHECK: ret void
+define void @test_address_space_1_0(<2 x i64> addrspace(1)* %a, i16 addrspace(2)* %b) {
+  %aa = alloca <2 x i64>, align 16, addrspace(2)
+  %aptr = bitcast <2 x i64> addrspace(1)* %a to i8 addrspace(1)*
+  %aaptr = bitcast <2 x i64> addrspace(2)* %aa to i8 addrspace(2)*
+  call void @llvm.memcpy.p2i8.p1i8.i32(i8 addrspace(2)* align 2 %aaptr, i8 addrspace(1)* align 2 %aptr, i32 16, i1 false)
+  %bptr = bitcast i16 addrspace(2)* %b to i8 addrspace(2)*
+  call void @llvm.memcpy.p2i8.p2i8.i32(i8 addrspace(2)* align 2 %bptr, i8 addrspace(2)* align 2 %aaptr, i32 16, i1 false)
+  ret void
+}
+
+; CHECK-LABEL: @test_address_space_0_1(
+; CHECK: load <2 x i64>, <2 x i64> addrspace(2)* %a, align 2
+; CHECK: store <2 x i64> {{.*}}, <2 x i64> addrspace(1)* {{.*}}, align 2
+; CHECK: ret void
+define void @test_address_space_0_1(<2 x i64> addrspace(2)* %a, i16 addrspace(1)* %b) {
+  %aa = alloca <2 x i64>, align 16, addrspace(2)
+  %aptr = bitcast <2 x i64> addrspace(2)* %a to i8 addrspace(2)*
+  %aaptr = bitcast <2 x i64> addrspace(2)* %aa to i8 addrspace(2)*
+  call void @llvm.memcpy.p2i8.p2i8.i32(i8 addrspace(2)* align 2 %aaptr, i8 addrspace(2)* align 2 %aptr, i32 16, i1 false)
+  %bptr = bitcast i16 addrspace(1)* %b to i8 addrspace(1)*
+  call void @llvm.memcpy.p1i8.p2i8.i32(i8 addrspace(1)* align 2 %bptr, i8 addrspace(2)* align 2 %aaptr, i32 16, i1 false)
+  ret void
+}
+
+%struct.struct_test_27.0.13 = type { i32, float, i64, i8, [4 x i32] }
+
+; CHECK-LABEL: @copy_struct(
+; CHECK-NOT: memcpy
+define void @copy_struct([5 x i64] %in.coerce) {
+for.end:
+  %in = alloca %struct.struct_test_27.0.13, align 8, addrspace(2)
+  %0 = bitcast %struct.struct_test_27.0.13 addrspace(2)* %in to [5 x i64] addrspace(2)*
+  store [5 x i64] %in.coerce, [5 x i64] addrspace(2)* %0, align 8
+  %scevgep9 = getelementptr %struct.struct_test_27.0.13, %struct.struct_test_27.0.13 addrspace(2)* %in, i32 0, i32 4, i32 0
+  %scevgep910 = bitcast i32 addrspace(2)* %scevgep9 to i8 addrspace(2)*
+  call void @llvm.memcpy.p1i8.p2i8.i32(i8 addrspace(1)* align 4 undef, i8 addrspace(2)* align 4 %scevgep910, i32 16, i1 false)
+  ret void
+}
+
+%union.anon = type { i32* }
+
+@g = common global i32 0, align 4
+@l = common addrspace(3) global i32 0, align 4
+
+; Make sure an illegal bitcast isn't introduced
+; CHECK-LABEL: @pr27557(
+; CHECK: %[[CAST:.*]] = bitcast i32* addrspace(2)* {{.*}} to i32 addrspace(3)* addrspace(2)*
+; CHECK: store i32 addrspace(3)* @l, i32 addrspace(3)* addrspace(2)* %[[CAST]]
+define void @pr27557() {
+  %1 = alloca %union.anon, align 8, addrspace(2)
+  %2 = bitcast %union.anon addrspace(2)* %1 to i32* addrspace(2)*
+  store i32* @g, i32* addrspace(2)* %2, align 8
+  %3 = bitcast %union.anon addrspace(2)* %1 to i32 addrspace(3)* addrspace(2)*
+  store i32 addrspace(3)* @l, i32 addrspace(3)* addrspace(2)* %3, align 8
+  ret void
+}
+
+; Test load from and store to non-zero address space.
+define void @test_load_store_diff_addr_space([2 x float] addrspace(1)* %complex1, [2 x float] addrspace(1)* %complex2) {
+; CHECK-LABEL: @test_load_store_diff_addr_space
+; CHECK-NOT: alloca
+; CHECK: load i32, i32 addrspace(1)*
+; CHECK: load i32, i32 addrspace(1)*
+; CHECK: store i32 %{{.*}}, i32 addrspace(1)*
+; CHECK: store i32 %{{.*}}, i32 addrspace(1)*
+  %a0 = alloca [2 x i64], align 8, addrspace(2)
+  %a = getelementptr [2 x i64], [2 x i64] addrspace(2)* %a0, i32 0, i32 0
+  %a.cast = bitcast i64 addrspace(2)* %a to [2 x float] addrspace(2)*
+  %a.gep1 = getelementptr [2 x float], [2 x float] addrspace(2)* %a.cast, i32 0, i32 0
+  %a.gep2 = getelementptr [2 x float], [2 x float] addrspace(2)* %a.cast, i32 0, i32 1
+  %complex1.gep = getelementptr [2 x float], [2 x float] addrspace(1)* %complex1, i32 0, i32 0
+  %p1 = bitcast float addrspace(1)* %complex1.gep to i64 addrspace(1)*
+  %v1 = load i64, i64 addrspace(1)* %p1
+  store i64 %v1, i64 addrspace(2)* %a
+  %f1 = load float, float addrspace(2)* %a.gep1
+  %f2 = load float, float addrspace(2)* %a.gep2
+  %sum = fadd float %f1, %f2
+  store float %sum, float addrspace(2)* %a.gep1
+  store float %sum, float addrspace(2)* %a.gep2
+  %v2 = load i64, i64 addrspace(2)* %a
+  %complex2.gep = getelementptr [2 x float], [2 x float] addrspace(1)* %complex2, i32 0, i32 0
+  %p2 = bitcast float addrspace(1)* %complex2.gep to i64 addrspace(1)*
+  store i64 %v2, i64 addrspace(1)* %p2
+  ret void
+}

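One detail that distinguishes this file from SROA/address-spaces.ll above: the datalayout string ends in "-A2", which declares address space 2 as the address space of alloca, so every alloca here must be (and is) created in addrspace(2), and SROA has to reason about spaces 0, 1 and 2 at once. Reduced to the essentials (an illustrative fragment, not from the test):

  target datalayout = "e-A2"

  define void @f() {
    %x = alloca i32, align 4, addrspace(2) ; alloca address space set by -A2
    ret void
  }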


