[llvm] r325772 - [SLPVectorizer][X86] Add load extend tests (PR36091)

Simon Pilgrim via llvm-commits llvm-commits at lists.llvm.org
Thu Feb 22 04:19:34 PST 2018


Author: rksimon
Date: Thu Feb 22 04:19:34 2018
New Revision: 325772

URL: http://llvm.org/viewvc/llvm-project?rev=325772&view=rev
Log:
[SLPVectorizer][X86] Add load extend tests (PR36091)

Added:
    llvm/trunk/test/Transforms/SLPVectorizer/X86/sext.ll
    llvm/trunk/test/Transforms/SLPVectorizer/X86/zext.ll

Added: llvm/trunk/test/Transforms/SLPVectorizer/X86/sext.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/sext.ll?rev=325772&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/sext.ll (added)
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/sext.ll Thu Feb 22 04:19:34 2018
@@ -0,0 +1,911 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -mtriple=x86_64-unknown -basicaa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -basicaa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE,SLM
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basicaa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -basicaa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX512,AVX512F
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=+avx512bw -basicaa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX512,AVX512BW
+
+;
+; vXi8
+;
+
+define <2 x i64> @loadext_2i8_to_2i64(i8* %p0) {
+; SSE2-LABEL: @loadext_2i8_to_2i64(
+; SSE2-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; SSE2-NEXT:    [[I0:%.*]] = load i8, i8* [[P0]], align 1
+; SSE2-NEXT:    [[I1:%.*]] = load i8, i8* [[P1]], align 1
+; SSE2-NEXT:    [[X0:%.*]] = sext i8 [[I0]] to i64
+; SSE2-NEXT:    [[X1:%.*]] = sext i8 [[I1]] to i64
+; SSE2-NEXT:    [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[X0]], i32 0
+; SSE2-NEXT:    [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[X1]], i32 1
+; SSE2-NEXT:    ret <2 x i64> [[V1]]
+;
+; SLM-LABEL: @loadext_2i8_to_2i64(
+; SLM-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; SLM-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <2 x i8>*
+; SLM-NEXT:    [[TMP2:%.*]] = load <2 x i8>, <2 x i8>* [[TMP1]], align 1
+; SLM-NEXT:    [[TMP3:%.*]] = sext <2 x i8> [[TMP2]] to <2 x i64>
+; SLM-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
+; SLM-NEXT:    [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[TMP4]], i32 0
+; SLM-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
+; SLM-NEXT:    [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1
+; SLM-NEXT:    ret <2 x i64> [[V1]]
+;
+; AVX-LABEL: @loadext_2i8_to_2i64(
+; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; AVX-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <2 x i8>*
+; AVX-NEXT:    [[TMP2:%.*]] = load <2 x i8>, <2 x i8>* [[TMP1]], align 1
+; AVX-NEXT:    [[TMP3:%.*]] = sext <2 x i8> [[TMP2]] to <2 x i64>
+; AVX-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
+; AVX-NEXT:    [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[TMP4]], i32 0
+; AVX-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
+; AVX-NEXT:    [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1
+; AVX-NEXT:    ret <2 x i64> [[V1]]
+;
+  %p1 = getelementptr inbounds i8, i8* %p0, i64 1
+  %i0 = load i8, i8* %p0, align 1
+  %i1 = load i8, i8* %p1, align 1
+  %x0 = sext i8 %i0 to i64
+  %x1 = sext i8 %i1 to i64
+  %v0 = insertelement <2 x i64> undef, i64 %x0, i32 0
+  %v1 = insertelement <2 x i64>   %v0, i64 %x1, i32 1
+  ret <2 x i64> %v1
+}
+
+define <4 x i32> @loadext_4i8_to_4i32(i8* %p0) {
+; SSE2-LABEL: @loadext_4i8_to_4i32(
+; SSE2-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; SSE2-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
+; SSE2-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
+; SSE2-NEXT:    [[I0:%.*]] = load i8, i8* [[P0]], align 1
+; SSE2-NEXT:    [[I1:%.*]] = load i8, i8* [[P1]], align 1
+; SSE2-NEXT:    [[I2:%.*]] = load i8, i8* [[P2]], align 1
+; SSE2-NEXT:    [[I3:%.*]] = load i8, i8* [[P3]], align 1
+; SSE2-NEXT:    [[X0:%.*]] = sext i8 [[I0]] to i32
+; SSE2-NEXT:    [[X1:%.*]] = sext i8 [[I1]] to i32
+; SSE2-NEXT:    [[X2:%.*]] = sext i8 [[I2]] to i32
+; SSE2-NEXT:    [[X3:%.*]] = sext i8 [[I3]] to i32
+; SSE2-NEXT:    [[V0:%.*]] = insertelement <4 x i32> undef, i32 [[X0]], i32 0
+; SSE2-NEXT:    [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[X1]], i32 1
+; SSE2-NEXT:    [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[X2]], i32 2
+; SSE2-NEXT:    [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[X3]], i32 3
+; SSE2-NEXT:    ret <4 x i32> [[V3]]
+;
+; SLM-LABEL: @loadext_4i8_to_4i32(
+; SLM-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; SLM-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
+; SLM-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
+; SLM-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>*
+; SLM-NEXT:    [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
+; SLM-NEXT:    [[TMP3:%.*]] = sext <4 x i8> [[TMP2]] to <4 x i32>
+; SLM-NEXT:    [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0
+; SLM-NEXT:    [[V0:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0
+; SLM-NEXT:    [[TMP5:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1
+; SLM-NEXT:    [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[TMP5]], i32 1
+; SLM-NEXT:    [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2
+; SLM-NEXT:    [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[TMP6]], i32 2
+; SLM-NEXT:    [[TMP7:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3
+; SLM-NEXT:    [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[TMP7]], i32 3
+; SLM-NEXT:    ret <4 x i32> [[V3]]
+;
+; AVX-LABEL: @loadext_4i8_to_4i32(
+; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; AVX-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
+; AVX-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
+; AVX-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>*
+; AVX-NEXT:    [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
+; AVX-NEXT:    [[TMP3:%.*]] = sext <4 x i8> [[TMP2]] to <4 x i32>
+; AVX-NEXT:    [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0
+; AVX-NEXT:    [[V0:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0
+; AVX-NEXT:    [[TMP5:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1
+; AVX-NEXT:    [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[TMP5]], i32 1
+; AVX-NEXT:    [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2
+; AVX-NEXT:    [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[TMP6]], i32 2
+; AVX-NEXT:    [[TMP7:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3
+; AVX-NEXT:    [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[TMP7]], i32 3
+; AVX-NEXT:    ret <4 x i32> [[V3]]
+;
+  %p1 = getelementptr inbounds i8, i8* %p0, i64 1
+  %p2 = getelementptr inbounds i8, i8* %p0, i64 2
+  %p3 = getelementptr inbounds i8, i8* %p0, i64 3
+  %i0 = load i8, i8* %p0, align 1
+  %i1 = load i8, i8* %p1, align 1
+  %i2 = load i8, i8* %p2, align 1
+  %i3 = load i8, i8* %p3, align 1
+  %x0 = sext i8 %i0 to i32
+  %x1 = sext i8 %i1 to i32
+  %x2 = sext i8 %i2 to i32
+  %x3 = sext i8 %i3 to i32
+  %v0 = insertelement <4 x i32> undef, i32 %x0, i32 0
+  %v1 = insertelement <4 x i32>   %v0, i32 %x1, i32 1
+  %v2 = insertelement <4 x i32>   %v1, i32 %x2, i32 2
+  %v3 = insertelement <4 x i32>   %v2, i32 %x3, i32 3
+  ret <4 x i32> %v3
+}
+
+define <4 x i64> @loadext_4i8_to_4i64(i8* %p0) {
+; SSE2-LABEL: @loadext_4i8_to_4i64(
+; SSE2-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; SSE2-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
+; SSE2-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
+; SSE2-NEXT:    [[I0:%.*]] = load i8, i8* [[P0]], align 1
+; SSE2-NEXT:    [[I1:%.*]] = load i8, i8* [[P1]], align 1
+; SSE2-NEXT:    [[I2:%.*]] = load i8, i8* [[P2]], align 1
+; SSE2-NEXT:    [[I3:%.*]] = load i8, i8* [[P3]], align 1
+; SSE2-NEXT:    [[X0:%.*]] = sext i8 [[I0]] to i64
+; SSE2-NEXT:    [[X1:%.*]] = sext i8 [[I1]] to i64
+; SSE2-NEXT:    [[X2:%.*]] = sext i8 [[I2]] to i64
+; SSE2-NEXT:    [[X3:%.*]] = sext i8 [[I3]] to i64
+; SSE2-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[X0]], i32 0
+; SSE2-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[X1]], i32 1
+; SSE2-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2
+; SSE2-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
+; SSE2-NEXT:    ret <4 x i64> [[V3]]
+;
+; SLM-LABEL: @loadext_4i8_to_4i64(
+; SLM-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; SLM-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
+; SLM-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
+; SLM-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>*
+; SLM-NEXT:    [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
+; SLM-NEXT:    [[TMP3:%.*]] = sext <4 x i8> [[TMP2]] to <4 x i64>
+; SLM-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
+; SLM-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
+; SLM-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
+; SLM-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
+; SLM-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
+; SLM-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
+; SLM-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
+; SLM-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
+; SLM-NEXT:    ret <4 x i64> [[V3]]
+;
+; AVX-LABEL: @loadext_4i8_to_4i64(
+; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; AVX-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
+; AVX-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
+; AVX-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <2 x i8>*
+; AVX-NEXT:    [[TMP2:%.*]] = load <2 x i8>, <2 x i8>* [[TMP1]], align 1
+; AVX-NEXT:    [[I2:%.*]] = load i8, i8* [[P2]], align 1
+; AVX-NEXT:    [[I3:%.*]] = load i8, i8* [[P3]], align 1
+; AVX-NEXT:    [[TMP3:%.*]] = sext <2 x i8> [[TMP2]] to <2 x i64>
+; AVX-NEXT:    [[X2:%.*]] = sext i8 [[I2]] to i64
+; AVX-NEXT:    [[X3:%.*]] = sext i8 [[I3]] to i64
+; AVX-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
+; AVX-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
+; AVX-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
+; AVX-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
+; AVX-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2
+; AVX-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
+; AVX-NEXT:    ret <4 x i64> [[V3]]
+;
+  %p1 = getelementptr inbounds i8, i8* %p0, i64 1
+  %p2 = getelementptr inbounds i8, i8* %p0, i64 2
+  %p3 = getelementptr inbounds i8, i8* %p0, i64 3
+  %i0 = load i8, i8* %p0, align 1
+  %i1 = load i8, i8* %p1, align 1
+  %i2 = load i8, i8* %p2, align 1
+  %i3 = load i8, i8* %p3, align 1
+  %x0 = sext i8 %i0 to i64
+  %x1 = sext i8 %i1 to i64
+  %x2 = sext i8 %i2 to i64
+  %x3 = sext i8 %i3 to i64
+  %v0 = insertelement <4 x i64> undef, i64 %x0, i32 0
+  %v1 = insertelement <4 x i64>   %v0, i64 %x1, i32 1
+  %v2 = insertelement <4 x i64>   %v1, i64 %x2, i32 2
+  %v3 = insertelement <4 x i64>   %v2, i64 %x3, i32 3
+  ret <4 x i64> %v3
+}
+
+define <8 x i16> @loadext_8i8_to_8i16(i8* %p0) {
+; CHECK-LABEL: @loadext_8i8_to_8i16(
+; CHECK-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; CHECK-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
+; CHECK-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
+; CHECK-NEXT:    [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4
+; CHECK-NEXT:    [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5
+; CHECK-NEXT:    [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6
+; CHECK-NEXT:    [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <8 x i8>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = sext <8 x i8> [[TMP2]] to <8 x i16>
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0
+; CHECK-NEXT:    [[V0:%.*]] = insertelement <8 x i16> undef, i16 [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <8 x i16> [[TMP3]], i32 1
+; CHECK-NEXT:    [[V1:%.*]] = insertelement <8 x i16> [[V0]], i16 [[TMP5]], i32 1
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <8 x i16> [[TMP3]], i32 2
+; CHECK-NEXT:    [[V2:%.*]] = insertelement <8 x i16> [[V1]], i16 [[TMP6]], i32 2
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <8 x i16> [[TMP3]], i32 3
+; CHECK-NEXT:    [[V3:%.*]] = insertelement <8 x i16> [[V2]], i16 [[TMP7]], i32 3
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <8 x i16> [[TMP3]], i32 4
+; CHECK-NEXT:    [[V4:%.*]] = insertelement <8 x i16> [[V3]], i16 [[TMP8]], i32 4
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <8 x i16> [[TMP3]], i32 5
+; CHECK-NEXT:    [[V5:%.*]] = insertelement <8 x i16> [[V4]], i16 [[TMP9]], i32 5
+; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <8 x i16> [[TMP3]], i32 6
+; CHECK-NEXT:    [[V6:%.*]] = insertelement <8 x i16> [[V5]], i16 [[TMP10]], i32 6
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <8 x i16> [[TMP3]], i32 7
+; CHECK-NEXT:    [[V7:%.*]] = insertelement <8 x i16> [[V6]], i16 [[TMP11]], i32 7
+; CHECK-NEXT:    ret <8 x i16> [[V7]]
+;
+  %p1 = getelementptr inbounds i8, i8* %p0, i64 1
+  %p2 = getelementptr inbounds i8, i8* %p0, i64 2
+  %p3 = getelementptr inbounds i8, i8* %p0, i64 3
+  %p4 = getelementptr inbounds i8, i8* %p0, i64 4
+  %p5 = getelementptr inbounds i8, i8* %p0, i64 5
+  %p6 = getelementptr inbounds i8, i8* %p0, i64 6
+  %p7 = getelementptr inbounds i8, i8* %p0, i64 7
+  %i0 = load i8, i8* %p0, align 1
+  %i1 = load i8, i8* %p1, align 1
+  %i2 = load i8, i8* %p2, align 1
+  %i3 = load i8, i8* %p3, align 1
+  %i4 = load i8, i8* %p4, align 1
+  %i5 = load i8, i8* %p5, align 1
+  %i6 = load i8, i8* %p6, align 1
+  %i7 = load i8, i8* %p7, align 1
+  %x0 = sext i8 %i0 to i16
+  %x1 = sext i8 %i1 to i16
+  %x2 = sext i8 %i2 to i16
+  %x3 = sext i8 %i3 to i16
+  %x4 = sext i8 %i4 to i16
+  %x5 = sext i8 %i5 to i16
+  %x6 = sext i8 %i6 to i16
+  %x7 = sext i8 %i7 to i16
+  %v0 = insertelement <8 x i16> undef, i16 %x0, i32 0
+  %v1 = insertelement <8 x i16>   %v0, i16 %x1, i32 1
+  %v2 = insertelement <8 x i16>   %v1, i16 %x2, i32 2
+  %v3 = insertelement <8 x i16>   %v2, i16 %x3, i32 3
+  %v4 = insertelement <8 x i16>   %v3, i16 %x4, i32 4
+  %v5 = insertelement <8 x i16>   %v4, i16 %x5, i32 5
+  %v6 = insertelement <8 x i16>   %v5, i16 %x6, i32 6
+  %v7 = insertelement <8 x i16>   %v6, i16 %x7, i32 7
+  ret <8 x i16> %v7
+}
+
+define <8 x i32> @loadext_8i8_to_8i32(i8* %p0) {
+; SSE-LABEL: @loadext_8i8_to_8i32(
+; SSE-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; SSE-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
+; SSE-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
+; SSE-NEXT:    [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4
+; SSE-NEXT:    [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5
+; SSE-NEXT:    [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6
+; SSE-NEXT:    [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7
+; SSE-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <8 x i8>*
+; SSE-NEXT:    [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1
+; SSE-NEXT:    [[TMP3:%.*]] = sext <8 x i8> [[TMP2]] to <8 x i32>
+; SSE-NEXT:    [[TMP4:%.*]] = extractelement <8 x i32> [[TMP3]], i32 0
+; SSE-NEXT:    [[V0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP4]], i32 0
+; SSE-NEXT:    [[TMP5:%.*]] = extractelement <8 x i32> [[TMP3]], i32 1
+; SSE-NEXT:    [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[TMP5]], i32 1
+; SSE-NEXT:    [[TMP6:%.*]] = extractelement <8 x i32> [[TMP3]], i32 2
+; SSE-NEXT:    [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[TMP6]], i32 2
+; SSE-NEXT:    [[TMP7:%.*]] = extractelement <8 x i32> [[TMP3]], i32 3
+; SSE-NEXT:    [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[TMP7]], i32 3
+; SSE-NEXT:    [[TMP8:%.*]] = extractelement <8 x i32> [[TMP3]], i32 4
+; SSE-NEXT:    [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[TMP8]], i32 4
+; SSE-NEXT:    [[TMP9:%.*]] = extractelement <8 x i32> [[TMP3]], i32 5
+; SSE-NEXT:    [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[TMP9]], i32 5
+; SSE-NEXT:    [[TMP10:%.*]] = extractelement <8 x i32> [[TMP3]], i32 6
+; SSE-NEXT:    [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[TMP10]], i32 6
+; SSE-NEXT:    [[TMP11:%.*]] = extractelement <8 x i32> [[TMP3]], i32 7
+; SSE-NEXT:    [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[TMP11]], i32 7
+; SSE-NEXT:    ret <8 x i32> [[V7]]
+;
+; AVX1-LABEL: @loadext_8i8_to_8i32(
+; AVX1-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; AVX1-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
+; AVX1-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
+; AVX1-NEXT:    [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4
+; AVX1-NEXT:    [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5
+; AVX1-NEXT:    [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6
+; AVX1-NEXT:    [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7
+; AVX1-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>*
+; AVX1-NEXT:    [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
+; AVX1-NEXT:    [[I4:%.*]] = load i8, i8* [[P4]], align 1
+; AVX1-NEXT:    [[I5:%.*]] = load i8, i8* [[P5]], align 1
+; AVX1-NEXT:    [[I6:%.*]] = load i8, i8* [[P6]], align 1
+; AVX1-NEXT:    [[I7:%.*]] = load i8, i8* [[P7]], align 1
+; AVX1-NEXT:    [[TMP3:%.*]] = sext <4 x i8> [[TMP2]] to <4 x i32>
+; AVX1-NEXT:    [[X4:%.*]] = sext i8 [[I4]] to i32
+; AVX1-NEXT:    [[X5:%.*]] = sext i8 [[I5]] to i32
+; AVX1-NEXT:    [[X6:%.*]] = sext i8 [[I6]] to i32
+; AVX1-NEXT:    [[X7:%.*]] = sext i8 [[I7]] to i32
+; AVX1-NEXT:    [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0
+; AVX1-NEXT:    [[V0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP4]], i32 0
+; AVX1-NEXT:    [[TMP5:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1
+; AVX1-NEXT:    [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[TMP5]], i32 1
+; AVX1-NEXT:    [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2
+; AVX1-NEXT:    [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[TMP6]], i32 2
+; AVX1-NEXT:    [[TMP7:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3
+; AVX1-NEXT:    [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[TMP7]], i32 3
+; AVX1-NEXT:    [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[X4]], i32 4
+; AVX1-NEXT:    [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[X5]], i32 5
+; AVX1-NEXT:    [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[X6]], i32 6
+; AVX1-NEXT:    [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[X7]], i32 7
+; AVX1-NEXT:    ret <8 x i32> [[V7]]
+;
+; AVX2-LABEL: @loadext_8i8_to_8i32(
+; AVX2-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; AVX2-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
+; AVX2-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
+; AVX2-NEXT:    [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4
+; AVX2-NEXT:    [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5
+; AVX2-NEXT:    [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6
+; AVX2-NEXT:    [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7
+; AVX2-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <8 x i8>*
+; AVX2-NEXT:    [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1
+; AVX2-NEXT:    [[TMP3:%.*]] = sext <8 x i8> [[TMP2]] to <8 x i32>
+; AVX2-NEXT:    [[TMP4:%.*]] = extractelement <8 x i32> [[TMP3]], i32 0
+; AVX2-NEXT:    [[V0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP4]], i32 0
+; AVX2-NEXT:    [[TMP5:%.*]] = extractelement <8 x i32> [[TMP3]], i32 1
+; AVX2-NEXT:    [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[TMP5]], i32 1
+; AVX2-NEXT:    [[TMP6:%.*]] = extractelement <8 x i32> [[TMP3]], i32 2
+; AVX2-NEXT:    [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[TMP6]], i32 2
+; AVX2-NEXT:    [[TMP7:%.*]] = extractelement <8 x i32> [[TMP3]], i32 3
+; AVX2-NEXT:    [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[TMP7]], i32 3
+; AVX2-NEXT:    [[TMP8:%.*]] = extractelement <8 x i32> [[TMP3]], i32 4
+; AVX2-NEXT:    [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[TMP8]], i32 4
+; AVX2-NEXT:    [[TMP9:%.*]] = extractelement <8 x i32> [[TMP3]], i32 5
+; AVX2-NEXT:    [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[TMP9]], i32 5
+; AVX2-NEXT:    [[TMP10:%.*]] = extractelement <8 x i32> [[TMP3]], i32 6
+; AVX2-NEXT:    [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[TMP10]], i32 6
+; AVX2-NEXT:    [[TMP11:%.*]] = extractelement <8 x i32> [[TMP3]], i32 7
+; AVX2-NEXT:    [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[TMP11]], i32 7
+; AVX2-NEXT:    ret <8 x i32> [[V7]]
+;
+; AVX512-LABEL: @loadext_8i8_to_8i32(
+; AVX512-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; AVX512-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
+; AVX512-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
+; AVX512-NEXT:    [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4
+; AVX512-NEXT:    [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5
+; AVX512-NEXT:    [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6
+; AVX512-NEXT:    [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7
+; AVX512-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <8 x i8>*
+; AVX512-NEXT:    [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1
+; AVX512-NEXT:    [[TMP3:%.*]] = sext <8 x i8> [[TMP2]] to <8 x i32>
+; AVX512-NEXT:    [[TMP4:%.*]] = extractelement <8 x i32> [[TMP3]], i32 0
+; AVX512-NEXT:    [[V0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP4]], i32 0
+; AVX512-NEXT:    [[TMP5:%.*]] = extractelement <8 x i32> [[TMP3]], i32 1
+; AVX512-NEXT:    [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[TMP5]], i32 1
+; AVX512-NEXT:    [[TMP6:%.*]] = extractelement <8 x i32> [[TMP3]], i32 2
+; AVX512-NEXT:    [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[TMP6]], i32 2
+; AVX512-NEXT:    [[TMP7:%.*]] = extractelement <8 x i32> [[TMP3]], i32 3
+; AVX512-NEXT:    [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[TMP7]], i32 3
+; AVX512-NEXT:    [[TMP8:%.*]] = extractelement <8 x i32> [[TMP3]], i32 4
+; AVX512-NEXT:    [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[TMP8]], i32 4
+; AVX512-NEXT:    [[TMP9:%.*]] = extractelement <8 x i32> [[TMP3]], i32 5
+; AVX512-NEXT:    [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[TMP9]], i32 5
+; AVX512-NEXT:    [[TMP10:%.*]] = extractelement <8 x i32> [[TMP3]], i32 6
+; AVX512-NEXT:    [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[TMP10]], i32 6
+; AVX512-NEXT:    [[TMP11:%.*]] = extractelement <8 x i32> [[TMP3]], i32 7
+; AVX512-NEXT:    [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[TMP11]], i32 7
+; AVX512-NEXT:    ret <8 x i32> [[V7]]
+;
+  %p1 = getelementptr inbounds i8, i8* %p0, i64 1
+  %p2 = getelementptr inbounds i8, i8* %p0, i64 2
+  %p3 = getelementptr inbounds i8, i8* %p0, i64 3
+  %p4 = getelementptr inbounds i8, i8* %p0, i64 4
+  %p5 = getelementptr inbounds i8, i8* %p0, i64 5
+  %p6 = getelementptr inbounds i8, i8* %p0, i64 6
+  %p7 = getelementptr inbounds i8, i8* %p0, i64 7
+  %i0 = load i8, i8* %p0, align 1
+  %i1 = load i8, i8* %p1, align 1
+  %i2 = load i8, i8* %p2, align 1
+  %i3 = load i8, i8* %p3, align 1
+  %i4 = load i8, i8* %p4, align 1
+  %i5 = load i8, i8* %p5, align 1
+  %i6 = load i8, i8* %p6, align 1
+  %i7 = load i8, i8* %p7, align 1
+  %x0 = sext i8 %i0 to i32
+  %x1 = sext i8 %i1 to i32
+  %x2 = sext i8 %i2 to i32
+  %x3 = sext i8 %i3 to i32
+  %x4 = sext i8 %i4 to i32
+  %x5 = sext i8 %i5 to i32
+  %x6 = sext i8 %i6 to i32
+  %x7 = sext i8 %i7 to i32
+  %v0 = insertelement <8 x i32> undef, i32 %x0, i32 0
+  %v1 = insertelement <8 x i32>   %v0, i32 %x1, i32 1
+  %v2 = insertelement <8 x i32>   %v1, i32 %x2, i32 2
+  %v3 = insertelement <8 x i32>   %v2, i32 %x3, i32 3
+  %v4 = insertelement <8 x i32>   %v3, i32 %x4, i32 4
+  %v5 = insertelement <8 x i32>   %v4, i32 %x5, i32 5
+  %v6 = insertelement <8 x i32>   %v5, i32 %x6, i32 6
+  %v7 = insertelement <8 x i32>   %v6, i32 %x7, i32 7
+  ret <8 x i32> %v7
+}
+
+define <16 x i16> @loadext_16i8_to_16i16(i8* %p0) {
+; CHECK-LABEL: @loadext_16i8_to_16i16(
+; CHECK-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; CHECK-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
+; CHECK-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
+; CHECK-NEXT:    [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4
+; CHECK-NEXT:    [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5
+; CHECK-NEXT:    [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6
+; CHECK-NEXT:    [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7
+; CHECK-NEXT:    [[P8:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 8
+; CHECK-NEXT:    [[P9:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 9
+; CHECK-NEXT:    [[P10:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 10
+; CHECK-NEXT:    [[P11:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 11
+; CHECK-NEXT:    [[P12:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 12
+; CHECK-NEXT:    [[P13:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 13
+; CHECK-NEXT:    [[P14:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 14
+; CHECK-NEXT:    [[P15:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 15
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <16 x i8>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[TMP1]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = sext <16 x i8> [[TMP2]] to <16 x i16>
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <16 x i16> [[TMP3]], i32 0
+; CHECK-NEXT:    [[V0:%.*]] = insertelement <16 x i16> undef, i16 [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <16 x i16> [[TMP3]], i32 1
+; CHECK-NEXT:    [[V1:%.*]] = insertelement <16 x i16> [[V0]], i16 [[TMP5]], i32 1
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <16 x i16> [[TMP3]], i32 2
+; CHECK-NEXT:    [[V2:%.*]] = insertelement <16 x i16> [[V1]], i16 [[TMP6]], i32 2
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <16 x i16> [[TMP3]], i32 3
+; CHECK-NEXT:    [[V3:%.*]] = insertelement <16 x i16> [[V2]], i16 [[TMP7]], i32 3
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <16 x i16> [[TMP3]], i32 4
+; CHECK-NEXT:    [[V4:%.*]] = insertelement <16 x i16> [[V3]], i16 [[TMP8]], i32 4
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <16 x i16> [[TMP3]], i32 5
+; CHECK-NEXT:    [[V5:%.*]] = insertelement <16 x i16> [[V4]], i16 [[TMP9]], i32 5
+; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <16 x i16> [[TMP3]], i32 6
+; CHECK-NEXT:    [[V6:%.*]] = insertelement <16 x i16> [[V5]], i16 [[TMP10]], i32 6
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <16 x i16> [[TMP3]], i32 7
+; CHECK-NEXT:    [[V7:%.*]] = insertelement <16 x i16> [[V6]], i16 [[TMP11]], i32 7
+; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <16 x i16> [[TMP3]], i32 8
+; CHECK-NEXT:    [[V8:%.*]] = insertelement <16 x i16> [[V7]], i16 [[TMP12]], i32 8
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <16 x i16> [[TMP3]], i32 9
+; CHECK-NEXT:    [[V9:%.*]] = insertelement <16 x i16> [[V8]], i16 [[TMP13]], i32 9
+; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <16 x i16> [[TMP3]], i32 10
+; CHECK-NEXT:    [[V10:%.*]] = insertelement <16 x i16> [[V9]], i16 [[TMP14]], i32 10
+; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <16 x i16> [[TMP3]], i32 11
+; CHECK-NEXT:    [[V11:%.*]] = insertelement <16 x i16> [[V10]], i16 [[TMP15]], i32 11
+; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <16 x i16> [[TMP3]], i32 12
+; CHECK-NEXT:    [[V12:%.*]] = insertelement <16 x i16> [[V11]], i16 [[TMP16]], i32 12
+; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <16 x i16> [[TMP3]], i32 13
+; CHECK-NEXT:    [[V13:%.*]] = insertelement <16 x i16> [[V12]], i16 [[TMP17]], i32 13
+; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <16 x i16> [[TMP3]], i32 14
+; CHECK-NEXT:    [[V14:%.*]] = insertelement <16 x i16> [[V13]], i16 [[TMP18]], i32 14
+; CHECK-NEXT:    [[TMP19:%.*]] = extractelement <16 x i16> [[TMP3]], i32 15
+; CHECK-NEXT:    [[V15:%.*]] = insertelement <16 x i16> [[V14]], i16 [[TMP19]], i32 15
+; CHECK-NEXT:    ret <16 x i16> [[V15]]
+;
+  %p1  = getelementptr inbounds i8, i8* %p0, i64 1
+  %p2  = getelementptr inbounds i8, i8* %p0, i64 2
+  %p3  = getelementptr inbounds i8, i8* %p0, i64 3
+  %p4  = getelementptr inbounds i8, i8* %p0, i64 4
+  %p5  = getelementptr inbounds i8, i8* %p0, i64 5
+  %p6  = getelementptr inbounds i8, i8* %p0, i64 6
+  %p7  = getelementptr inbounds i8, i8* %p0, i64 7
+  %p8  = getelementptr inbounds i8, i8* %p0, i64 8
+  %p9  = getelementptr inbounds i8, i8* %p0, i64 9
+  %p10 = getelementptr inbounds i8, i8* %p0, i64 10
+  %p11 = getelementptr inbounds i8, i8* %p0, i64 11
+  %p12 = getelementptr inbounds i8, i8* %p0, i64 12
+  %p13 = getelementptr inbounds i8, i8* %p0, i64 13
+  %p14 = getelementptr inbounds i8, i8* %p0, i64 14
+  %p15 = getelementptr inbounds i8, i8* %p0, i64 15
+  %i0  = load i8, i8* %p0,  align 1
+  %i1  = load i8, i8* %p1,  align 1
+  %i2  = load i8, i8* %p2,  align 1
+  %i3  = load i8, i8* %p3,  align 1
+  %i4  = load i8, i8* %p4,  align 1
+  %i5  = load i8, i8* %p5,  align 1
+  %i6  = load i8, i8* %p6,  align 1
+  %i7  = load i8, i8* %p7,  align 1
+  %i8  = load i8, i8* %p8,  align 1
+  %i9  = load i8, i8* %p9,  align 1
+  %i10 = load i8, i8* %p10, align 1
+  %i11 = load i8, i8* %p11, align 1
+  %i12 = load i8, i8* %p12, align 1
+  %i13 = load i8, i8* %p13, align 1
+  %i14 = load i8, i8* %p14, align 1
+  %i15 = load i8, i8* %p15, align 1
+  %x0  = sext i8 %i0  to i16
+  %x1  = sext i8 %i1  to i16
+  %x2  = sext i8 %i2  to i16
+  %x3  = sext i8 %i3  to i16
+  %x4  = sext i8 %i4  to i16
+  %x5  = sext i8 %i5  to i16
+  %x6  = sext i8 %i6  to i16
+  %x7  = sext i8 %i7  to i16
+  %x8  = sext i8 %i8  to i16
+  %x9  = sext i8 %i9  to i16
+  %x10 = sext i8 %i10 to i16
+  %x11 = sext i8 %i11 to i16
+  %x12 = sext i8 %i12 to i16
+  %x13 = sext i8 %i13 to i16
+  %x14 = sext i8 %i14 to i16
+  %x15 = sext i8 %i15 to i16
+  %v0  = insertelement <16 x i16> undef, i16 %x0,  i32 0
+  %v1  = insertelement <16 x i16>  %v0,  i16 %x1,  i32 1
+  %v2  = insertelement <16 x i16>  %v1,  i16 %x2,  i32 2
+  %v3  = insertelement <16 x i16>  %v2,  i16 %x3,  i32 3
+  %v4  = insertelement <16 x i16>  %v3,  i16 %x4,  i32 4
+  %v5  = insertelement <16 x i16>  %v4,  i16 %x5,  i32 5
+  %v6  = insertelement <16 x i16>  %v5,  i16 %x6,  i32 6
+  %v7  = insertelement <16 x i16>  %v6,  i16 %x7,  i32 7
+  %v8  = insertelement <16 x i16>  %v7,  i16 %x8,  i32 8
+  %v9  = insertelement <16 x i16>  %v8,  i16 %x9,  i32 9
+  %v10 = insertelement <16 x i16>  %v9,  i16 %x10, i32 10
+  %v11 = insertelement <16 x i16>  %v10, i16 %x11, i32 11
+  %v12 = insertelement <16 x i16>  %v11, i16 %x12, i32 12
+  %v13 = insertelement <16 x i16>  %v12, i16 %x13, i32 13
+  %v14 = insertelement <16 x i16>  %v13, i16 %x14, i32 14
+  %v15 = insertelement <16 x i16>  %v14, i16 %x15, i32 15
+  ret <16 x i16> %v15
+}
+
+;
+; vXi16
+;
+
+define <2 x i64> @loadext_2i16_to_2i64(i16* %p0) {
+; SSE2-LABEL: @loadext_2i16_to_2i64(
+; SSE2-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
+; SSE2-NEXT:    [[I0:%.*]] = load i16, i16* [[P0]], align 1
+; SSE2-NEXT:    [[I1:%.*]] = load i16, i16* [[P1]], align 1
+; SSE2-NEXT:    [[X0:%.*]] = sext i16 [[I0]] to i64
+; SSE2-NEXT:    [[X1:%.*]] = sext i16 [[I1]] to i64
+; SSE2-NEXT:    [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[X0]], i32 0
+; SSE2-NEXT:    [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[X1]], i32 1
+; SSE2-NEXT:    ret <2 x i64> [[V1]]
+;
+; SLM-LABEL: @loadext_2i16_to_2i64(
+; SLM-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
+; SLM-NEXT:    [[TMP1:%.*]] = bitcast i16* [[P0]] to <2 x i16>*
+; SLM-NEXT:    [[TMP2:%.*]] = load <2 x i16>, <2 x i16>* [[TMP1]], align 1
+; SLM-NEXT:    [[TMP3:%.*]] = sext <2 x i16> [[TMP2]] to <2 x i64>
+; SLM-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
+; SLM-NEXT:    [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[TMP4]], i32 0
+; SLM-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
+; SLM-NEXT:    [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1
+; SLM-NEXT:    ret <2 x i64> [[V1]]
+;
+; AVX-LABEL: @loadext_2i16_to_2i64(
+; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
+; AVX-NEXT:    [[TMP1:%.*]] = bitcast i16* [[P0]] to <2 x i16>*
+; AVX-NEXT:    [[TMP2:%.*]] = load <2 x i16>, <2 x i16>* [[TMP1]], align 1
+; AVX-NEXT:    [[TMP3:%.*]] = sext <2 x i16> [[TMP2]] to <2 x i64>
+; AVX-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
+; AVX-NEXT:    [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[TMP4]], i32 0
+; AVX-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
+; AVX-NEXT:    [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1
+; AVX-NEXT:    ret <2 x i64> [[V1]]
+;
+  %p1 = getelementptr inbounds i16, i16* %p0, i64 1
+  %i0 = load i16, i16* %p0, align 1
+  %i1 = load i16, i16* %p1, align 1
+  %x0 = sext i16 %i0 to i64
+  %x1 = sext i16 %i1 to i64
+  %v0 = insertelement <2 x i64> undef, i64 %x0, i32 0
+  %v1 = insertelement <2 x i64>   %v0, i64 %x1, i32 1
+  ret <2 x i64> %v1
+}
+
+define <4 x i32> @loadext_4i16_to_4i32(i16* %p0) {
+; CHECK-LABEL: @loadext_4i16_to_4i32(
+; CHECK-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
+; CHECK-NEXT:    [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
+; CHECK-NEXT:    [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = sext <4 x i16> [[TMP2]] to <4 x i32>
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0
+; CHECK-NEXT:    [[V0:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1
+; CHECK-NEXT:    [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[TMP5]], i32 1
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2
+; CHECK-NEXT:    [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[TMP6]], i32 2
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3
+; CHECK-NEXT:    [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[TMP7]], i32 3
+; CHECK-NEXT:    ret <4 x i32> [[V3]]
+;
+  %p1 = getelementptr inbounds i16, i16* %p0, i64 1
+  %p2 = getelementptr inbounds i16, i16* %p0, i64 2
+  %p3 = getelementptr inbounds i16, i16* %p0, i64 3
+  %i0 = load i16, i16* %p0, align 1
+  %i1 = load i16, i16* %p1, align 1
+  %i2 = load i16, i16* %p2, align 1
+  %i3 = load i16, i16* %p3, align 1
+  %x0 = sext i16 %i0 to i32
+  %x1 = sext i16 %i1 to i32
+  %x2 = sext i16 %i2 to i32
+  %x3 = sext i16 %i3 to i32
+  %v0 = insertelement <4 x i32> undef, i32 %x0, i32 0
+  %v1 = insertelement <4 x i32>   %v0, i32 %x1, i32 1
+  %v2 = insertelement <4 x i32>   %v1, i32 %x2, i32 2
+  %v3 = insertelement <4 x i32>   %v2, i32 %x3, i32 3
+  ret <4 x i32> %v3
+}
+
+define <4 x i64> @loadext_4i16_to_4i64(i16* %p0) {
+; SSE2-LABEL: @loadext_4i16_to_4i64(
+; SSE2-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
+; SSE2-NEXT:    [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
+; SSE2-NEXT:    [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
+; SSE2-NEXT:    [[I0:%.*]] = load i16, i16* [[P0]], align 1
+; SSE2-NEXT:    [[I1:%.*]] = load i16, i16* [[P1]], align 1
+; SSE2-NEXT:    [[I2:%.*]] = load i16, i16* [[P2]], align 1
+; SSE2-NEXT:    [[I3:%.*]] = load i16, i16* [[P3]], align 1
+; SSE2-NEXT:    [[X0:%.*]] = sext i16 [[I0]] to i64
+; SSE2-NEXT:    [[X1:%.*]] = sext i16 [[I1]] to i64
+; SSE2-NEXT:    [[X2:%.*]] = sext i16 [[I2]] to i64
+; SSE2-NEXT:    [[X3:%.*]] = sext i16 [[I3]] to i64
+; SSE2-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[X0]], i32 0
+; SSE2-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[X1]], i32 1
+; SSE2-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2
+; SSE2-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
+; SSE2-NEXT:    ret <4 x i64> [[V3]]
+;
+; SLM-LABEL: @loadext_4i16_to_4i64(
+; SLM-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
+; SLM-NEXT:    [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
+; SLM-NEXT:    [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
+; SLM-NEXT:    [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>*
+; SLM-NEXT:    [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1
+; SLM-NEXT:    [[TMP3:%.*]] = sext <4 x i16> [[TMP2]] to <4 x i64>
+; SLM-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
+; SLM-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
+; SLM-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
+; SLM-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
+; SLM-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
+; SLM-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
+; SLM-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
+; SLM-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
+; SLM-NEXT:    ret <4 x i64> [[V3]]
+;
+; AVX-LABEL: @loadext_4i16_to_4i64(
+; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
+; AVX-NEXT:    [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
+; AVX-NEXT:    [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
+; AVX-NEXT:    [[TMP1:%.*]] = bitcast i16* [[P0]] to <2 x i16>*
+; AVX-NEXT:    [[TMP2:%.*]] = load <2 x i16>, <2 x i16>* [[TMP1]], align 1
+; AVX-NEXT:    [[I2:%.*]] = load i16, i16* [[P2]], align 1
+; AVX-NEXT:    [[I3:%.*]] = load i16, i16* [[P3]], align 1
+; AVX-NEXT:    [[TMP3:%.*]] = sext <2 x i16> [[TMP2]] to <2 x i64>
+; AVX-NEXT:    [[X2:%.*]] = sext i16 [[I2]] to i64
+; AVX-NEXT:    [[X3:%.*]] = sext i16 [[I3]] to i64
+; AVX-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
+; AVX-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
+; AVX-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
+; AVX-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
+; AVX-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2
+; AVX-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
+; AVX-NEXT:    ret <4 x i64> [[V3]]
+;
+  %p1 = getelementptr inbounds i16, i16* %p0, i64 1
+  %p2 = getelementptr inbounds i16, i16* %p0, i64 2
+  %p3 = getelementptr inbounds i16, i16* %p0, i64 3
+  %i0 = load i16, i16* %p0, align 1
+  %i1 = load i16, i16* %p1, align 1
+  %i2 = load i16, i16* %p2, align 1
+  %i3 = load i16, i16* %p3, align 1
+  %x0 = sext i16 %i0 to i64
+  %x1 = sext i16 %i1 to i64
+  %x2 = sext i16 %i2 to i64
+  %x3 = sext i16 %i3 to i64
+  %v0 = insertelement <4 x i64> undef, i64 %x0, i32 0
+  %v1 = insertelement <4 x i64>   %v0, i64 %x1, i32 1
+  %v2 = insertelement <4 x i64>   %v1, i64 %x2, i32 2
+  %v3 = insertelement <4 x i64>   %v2, i64 %x3, i32 3
+  ret <4 x i64> %v3
+}
+
+define <8 x i32> @loadext_8i16_to_8i32(i16* %p0) {
+; CHECK-LABEL: @loadext_8i16_to_8i32(
+; CHECK-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
+; CHECK-NEXT:    [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
+; CHECK-NEXT:    [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
+; CHECK-NEXT:    [[P4:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 4
+; CHECK-NEXT:    [[P5:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 5
+; CHECK-NEXT:    [[P6:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 6
+; CHECK-NEXT:    [[P7:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 7
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16* [[P0]] to <8 x i16>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = sext <8 x i16> [[TMP2]] to <8 x i32>
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <8 x i32> [[TMP3]], i32 0
+; CHECK-NEXT:    [[V0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <8 x i32> [[TMP3]], i32 1
+; CHECK-NEXT:    [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[TMP5]], i32 1
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <8 x i32> [[TMP3]], i32 2
+; CHECK-NEXT:    [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[TMP6]], i32 2
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <8 x i32> [[TMP3]], i32 3
+; CHECK-NEXT:    [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[TMP7]], i32 3
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <8 x i32> [[TMP3]], i32 4
+; CHECK-NEXT:    [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[TMP8]], i32 4
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <8 x i32> [[TMP3]], i32 5
+; CHECK-NEXT:    [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[TMP9]], i32 5
+; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <8 x i32> [[TMP3]], i32 6
+; CHECK-NEXT:    [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[TMP10]], i32 6
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <8 x i32> [[TMP3]], i32 7
+; CHECK-NEXT:    [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[TMP11]], i32 7
+; CHECK-NEXT:    ret <8 x i32> [[V7]]
+;
+  %p1 = getelementptr inbounds i16, i16* %p0, i64 1
+  %p2 = getelementptr inbounds i16, i16* %p0, i64 2
+  %p3 = getelementptr inbounds i16, i16* %p0, i64 3
+  %p4 = getelementptr inbounds i16, i16* %p0, i64 4
+  %p5 = getelementptr inbounds i16, i16* %p0, i64 5
+  %p6 = getelementptr inbounds i16, i16* %p0, i64 6
+  %p7 = getelementptr inbounds i16, i16* %p0, i64 7
+  %i0 = load i16, i16* %p0, align 1
+  %i1 = load i16, i16* %p1, align 1
+  %i2 = load i16, i16* %p2, align 1
+  %i3 = load i16, i16* %p3, align 1
+  %i4 = load i16, i16* %p4, align 1
+  %i5 = load i16, i16* %p5, align 1
+  %i6 = load i16, i16* %p6, align 1
+  %i7 = load i16, i16* %p7, align 1
+  %x0 = sext i16 %i0 to i32
+  %x1 = sext i16 %i1 to i32
+  %x2 = sext i16 %i2 to i32
+  %x3 = sext i16 %i3 to i32
+  %x4 = sext i16 %i4 to i32
+  %x5 = sext i16 %i5 to i32
+  %x6 = sext i16 %i6 to i32
+  %x7 = sext i16 %i7 to i32
+  %v0 = insertelement <8 x i32> undef, i32 %x0, i32 0
+  %v1 = insertelement <8 x i32>   %v0, i32 %x1, i32 1
+  %v2 = insertelement <8 x i32>   %v1, i32 %x2, i32 2
+  %v3 = insertelement <8 x i32>   %v2, i32 %x3, i32 3
+  %v4 = insertelement <8 x i32>   %v3, i32 %x4, i32 4
+  %v5 = insertelement <8 x i32>   %v4, i32 %x5, i32 5
+  %v6 = insertelement <8 x i32>   %v5, i32 %x6, i32 6
+  %v7 = insertelement <8 x i32>   %v6, i32 %x7, i32 7
+  ret <8 x i32> %v7
+}
+
+;
+; vXi32
+;
+
+define <2 x i64> @loadext_2i32_to_2i64(i32* %p0) {
+; SSE2-LABEL: @loadext_2i32_to_2i64(
+; SSE2-NEXT:    [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
+; SSE2-NEXT:    [[I0:%.*]] = load i32, i32* [[P0]], align 1
+; SSE2-NEXT:    [[I1:%.*]] = load i32, i32* [[P1]], align 1
+; SSE2-NEXT:    [[X0:%.*]] = sext i32 [[I0]] to i64
+; SSE2-NEXT:    [[X1:%.*]] = sext i32 [[I1]] to i64
+; SSE2-NEXT:    [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[X0]], i32 0
+; SSE2-NEXT:    [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[X1]], i32 1
+; SSE2-NEXT:    ret <2 x i64> [[V1]]
+;
+; SLM-LABEL: @loadext_2i32_to_2i64(
+; SLM-NEXT:    [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
+; SLM-NEXT:    [[TMP1:%.*]] = bitcast i32* [[P0]] to <2 x i32>*
+; SLM-NEXT:    [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 1
+; SLM-NEXT:    [[TMP3:%.*]] = sext <2 x i32> [[TMP2]] to <2 x i64>
+; SLM-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
+; SLM-NEXT:    [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[TMP4]], i32 0
+; SLM-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
+; SLM-NEXT:    [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1
+; SLM-NEXT:    ret <2 x i64> [[V1]]
+;
+; AVX-LABEL: @loadext_2i32_to_2i64(
+; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
+; AVX-NEXT:    [[TMP1:%.*]] = bitcast i32* [[P0]] to <2 x i32>*
+; AVX-NEXT:    [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 1
+; AVX-NEXT:    [[TMP3:%.*]] = sext <2 x i32> [[TMP2]] to <2 x i64>
+; AVX-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
+; AVX-NEXT:    [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[TMP4]], i32 0
+; AVX-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
+; AVX-NEXT:    [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1
+; AVX-NEXT:    ret <2 x i64> [[V1]]
+;
+  %p1 = getelementptr inbounds i32, i32* %p0, i64 1
+  %i0 = load i32, i32* %p0, align 1
+  %i1 = load i32, i32* %p1, align 1
+  %x0 = sext i32 %i0 to i64
+  %x1 = sext i32 %i1 to i64
+  %v0 = insertelement <2 x i64> undef, i64 %x0, i32 0
+  %v1 = insertelement <2 x i64>   %v0, i64 %x1, i32 1
+  ret <2 x i64> %v1
+}
+
+define <4 x i64> @loadext_4i32_to_4i64(i32* %p0) {
+; SSE2-LABEL: @loadext_4i32_to_4i64(
+; SSE2-NEXT:    [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
+; SSE2-NEXT:    [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2
+; SSE2-NEXT:    [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3
+; SSE2-NEXT:    [[I0:%.*]] = load i32, i32* [[P0]], align 1
+; SSE2-NEXT:    [[I1:%.*]] = load i32, i32* [[P1]], align 1
+; SSE2-NEXT:    [[I2:%.*]] = load i32, i32* [[P2]], align 1
+; SSE2-NEXT:    [[I3:%.*]] = load i32, i32* [[P3]], align 1
+; SSE2-NEXT:    [[X0:%.*]] = sext i32 [[I0]] to i64
+; SSE2-NEXT:    [[X1:%.*]] = sext i32 [[I1]] to i64
+; SSE2-NEXT:    [[X2:%.*]] = sext i32 [[I2]] to i64
+; SSE2-NEXT:    [[X3:%.*]] = sext i32 [[I3]] to i64
+; SSE2-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[X0]], i32 0
+; SSE2-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[X1]], i32 1
+; SSE2-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2
+; SSE2-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
+; SSE2-NEXT:    ret <4 x i64> [[V3]]
+;
+; SLM-LABEL: @loadext_4i32_to_4i64(
+; SLM-NEXT:    [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
+; SLM-NEXT:    [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2
+; SLM-NEXT:    [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3
+; SLM-NEXT:    [[TMP1:%.*]] = bitcast i32* [[P0]] to <4 x i32>*
+; SLM-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 1
+; SLM-NEXT:    [[TMP3:%.*]] = sext <4 x i32> [[TMP2]] to <4 x i64>
+; SLM-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
+; SLM-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
+; SLM-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
+; SLM-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
+; SLM-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
+; SLM-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
+; SLM-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
+; SLM-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
+; SLM-NEXT:    ret <4 x i64> [[V3]]
+;
+; AVX1-LABEL: @loadext_4i32_to_4i64(
+; AVX1-NEXT:    [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
+; AVX1-NEXT:    [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2
+; AVX1-NEXT:    [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3
+; AVX1-NEXT:    [[TMP1:%.*]] = bitcast i32* [[P0]] to <2 x i32>*
+; AVX1-NEXT:    [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 1
+; AVX1-NEXT:    [[I2:%.*]] = load i32, i32* [[P2]], align 1
+; AVX1-NEXT:    [[I3:%.*]] = load i32, i32* [[P3]], align 1
+; AVX1-NEXT:    [[TMP3:%.*]] = sext <2 x i32> [[TMP2]] to <2 x i64>
+; AVX1-NEXT:    [[X2:%.*]] = sext i32 [[I2]] to i64
+; AVX1-NEXT:    [[X3:%.*]] = sext i32 [[I3]] to i64
+; AVX1-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
+; AVX1-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
+; AVX1-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
+; AVX1-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
+; AVX1-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2
+; AVX1-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
+; AVX1-NEXT:    ret <4 x i64> [[V3]]
+;
+; AVX2-LABEL: @loadext_4i32_to_4i64(
+; AVX2-NEXT:    [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
+; AVX2-NEXT:    [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2
+; AVX2-NEXT:    [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3
+; AVX2-NEXT:    [[TMP1:%.*]] = bitcast i32* [[P0]] to <4 x i32>*
+; AVX2-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 1
+; AVX2-NEXT:    [[TMP3:%.*]] = sext <4 x i32> [[TMP2]] to <4 x i64>
+; AVX2-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
+; AVX2-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
+; AVX2-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
+; AVX2-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
+; AVX2-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
+; AVX2-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
+; AVX2-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
+; AVX2-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
+; AVX2-NEXT:    ret <4 x i64> [[V3]]
+;
+; AVX512-LABEL: @loadext_4i32_to_4i64(
+; AVX512-NEXT:    [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
+; AVX512-NEXT:    [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2
+; AVX512-NEXT:    [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3
+; AVX512-NEXT:    [[TMP1:%.*]] = bitcast i32* [[P0]] to <4 x i32>*
+; AVX512-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 1
+; AVX512-NEXT:    [[TMP3:%.*]] = sext <4 x i32> [[TMP2]] to <4 x i64>
+; AVX512-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
+; AVX512-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
+; AVX512-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
+; AVX512-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
+; AVX512-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
+; AVX512-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
+; AVX512-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
+; AVX512-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
+; AVX512-NEXT:    ret <4 x i64> [[V3]]
+;
+  %p1 = getelementptr inbounds i32, i32* %p0, i64 1
+  %p2 = getelementptr inbounds i32, i32* %p0, i64 2
+  %p3 = getelementptr inbounds i32, i32* %p0, i64 3
+  %i0 = load i32, i32* %p0, align 1
+  %i1 = load i32, i32* %p1, align 1
+  %i2 = load i32, i32* %p2, align 1
+  %i3 = load i32, i32* %p3, align 1
+  %x0 = sext i32 %i0 to i64
+  %x1 = sext i32 %i1 to i64
+  %x2 = sext i32 %i2 to i64
+  %x3 = sext i32 %i3 to i64
+  %v0 = insertelement <4 x i64> undef, i64 %x0, i32 0
+  %v1 = insertelement <4 x i64>   %v0, i64 %x1, i32 1
+  %v2 = insertelement <4 x i64>   %v1, i64 %x2, i32 2
+  %v3 = insertelement <4 x i64>   %v2, i64 %x3, i32 3
+  ret <4 x i64> %v3
+}

Added: llvm/trunk/test/Transforms/SLPVectorizer/X86/zext.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/zext.ll?rev=325772&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/zext.ll (added)
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/zext.ll Thu Feb 22 04:19:34 2018
@@ -0,0 +1,785 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -mtriple=x86_64-unknown -basicaa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -basicaa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE,SLM
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basicaa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -basicaa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX512,AVX512F
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=+avx512bw -basicaa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX512,AVX512BW
+
+;
+; vXi8
+;
+
+define <2 x i64> @loadext_2i8_to_2i64(i8* %p0) {
+; SSE2-LABEL: @loadext_2i8_to_2i64(
+; SSE2-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; SSE2-NEXT:    [[I0:%.*]] = load i8, i8* [[P0]], align 1
+; SSE2-NEXT:    [[I1:%.*]] = load i8, i8* [[P1]], align 1
+; SSE2-NEXT:    [[X0:%.*]] = zext i8 [[I0]] to i64
+; SSE2-NEXT:    [[X1:%.*]] = zext i8 [[I1]] to i64
+; SSE2-NEXT:    [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[X0]], i32 0
+; SSE2-NEXT:    [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[X1]], i32 1
+; SSE2-NEXT:    ret <2 x i64> [[V1]]
+;
+; SLM-LABEL: @loadext_2i8_to_2i64(
+; SLM-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; SLM-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <2 x i8>*
+; SLM-NEXT:    [[TMP2:%.*]] = load <2 x i8>, <2 x i8>* [[TMP1]], align 1
+; SLM-NEXT:    [[TMP3:%.*]] = zext <2 x i8> [[TMP2]] to <2 x i64>
+; SLM-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
+; SLM-NEXT:    [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[TMP4]], i32 0
+; SLM-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
+; SLM-NEXT:    [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1
+; SLM-NEXT:    ret <2 x i64> [[V1]]
+;
+; AVX-LABEL: @loadext_2i8_to_2i64(
+; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; AVX-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <2 x i8>*
+; AVX-NEXT:    [[TMP2:%.*]] = load <2 x i8>, <2 x i8>* [[TMP1]], align 1
+; AVX-NEXT:    [[TMP3:%.*]] = zext <2 x i8> [[TMP2]] to <2 x i64>
+; AVX-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
+; AVX-NEXT:    [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[TMP4]], i32 0
+; AVX-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
+; AVX-NEXT:    [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1
+; AVX-NEXT:    ret <2 x i64> [[V1]]
+;
+  %p1 = getelementptr inbounds i8, i8* %p0, i64 1
+  %i0 = load i8, i8* %p0, align 1
+  %i1 = load i8, i8* %p1, align 1
+  %x0 = zext i8 %i0 to i64
+  %x1 = zext i8 %i1 to i64
+  %v0 = insertelement <2 x i64> undef, i64 %x0, i32 0
+  %v1 = insertelement <2 x i64>   %v0, i64 %x1, i32 1
+  ret <2 x i64> %v1
+}
+
+define <4 x i32> @loadext_4i8_to_4i32(i8* %p0) {
+; CHECK-LABEL: @loadext_4i8_to_4i32(
+; CHECK-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; CHECK-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
+; CHECK-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i32>
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0
+; CHECK-NEXT:    [[V0:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1
+; CHECK-NEXT:    [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[TMP5]], i32 1
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2
+; CHECK-NEXT:    [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[TMP6]], i32 2
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3
+; CHECK-NEXT:    [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[TMP7]], i32 3
+; CHECK-NEXT:    ret <4 x i32> [[V3]]
+;
+  %p1 = getelementptr inbounds i8, i8* %p0, i64 1
+  %p2 = getelementptr inbounds i8, i8* %p0, i64 2
+  %p3 = getelementptr inbounds i8, i8* %p0, i64 3
+  %i0 = load i8, i8* %p0, align 1
+  %i1 = load i8, i8* %p1, align 1
+  %i2 = load i8, i8* %p2, align 1
+  %i3 = load i8, i8* %p3, align 1
+  %x0 = zext i8 %i0 to i32
+  %x1 = zext i8 %i1 to i32
+  %x2 = zext i8 %i2 to i32
+  %x3 = zext i8 %i3 to i32
+  %v0 = insertelement <4 x i32> undef, i32 %x0, i32 0
+  %v1 = insertelement <4 x i32>   %v0, i32 %x1, i32 1
+  %v2 = insertelement <4 x i32>   %v1, i32 %x2, i32 2
+  %v3 = insertelement <4 x i32>   %v2, i32 %x3, i32 3
+  ret <4 x i32> %v3
+}
+
+define <4 x i64> @loadext_4i8_to_4i64(i8* %p0) {
+; SSE2-LABEL: @loadext_4i8_to_4i64(
+; SSE2-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; SSE2-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
+; SSE2-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
+; SSE2-NEXT:    [[I0:%.*]] = load i8, i8* [[P0]], align 1
+; SSE2-NEXT:    [[I1:%.*]] = load i8, i8* [[P1]], align 1
+; SSE2-NEXT:    [[I2:%.*]] = load i8, i8* [[P2]], align 1
+; SSE2-NEXT:    [[I3:%.*]] = load i8, i8* [[P3]], align 1
+; SSE2-NEXT:    [[X0:%.*]] = zext i8 [[I0]] to i64
+; SSE2-NEXT:    [[X1:%.*]] = zext i8 [[I1]] to i64
+; SSE2-NEXT:    [[X2:%.*]] = zext i8 [[I2]] to i64
+; SSE2-NEXT:    [[X3:%.*]] = zext i8 [[I3]] to i64
+; SSE2-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[X0]], i32 0
+; SSE2-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[X1]], i32 1
+; SSE2-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2
+; SSE2-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
+; SSE2-NEXT:    ret <4 x i64> [[V3]]
+;
+; SLM-LABEL: @loadext_4i8_to_4i64(
+; SLM-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; SLM-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
+; SLM-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
+; SLM-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>*
+; SLM-NEXT:    [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
+; SLM-NEXT:    [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i64>
+; SLM-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
+; SLM-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
+; SLM-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
+; SLM-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
+; SLM-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
+; SLM-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
+; SLM-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
+; SLM-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
+; SLM-NEXT:    ret <4 x i64> [[V3]]
+;
+; AVX-LABEL: @loadext_4i8_to_4i64(
+; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; AVX-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
+; AVX-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
+; AVX-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <2 x i8>*
+; AVX-NEXT:    [[TMP2:%.*]] = load <2 x i8>, <2 x i8>* [[TMP1]], align 1
+; AVX-NEXT:    [[I2:%.*]] = load i8, i8* [[P2]], align 1
+; AVX-NEXT:    [[I3:%.*]] = load i8, i8* [[P3]], align 1
+; AVX-NEXT:    [[TMP3:%.*]] = zext <2 x i8> [[TMP2]] to <2 x i64>
+; AVX-NEXT:    [[X2:%.*]] = zext i8 [[I2]] to i64
+; AVX-NEXT:    [[X3:%.*]] = zext i8 [[I3]] to i64
+; AVX-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
+; AVX-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
+; AVX-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
+; AVX-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
+; AVX-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2
+; AVX-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
+; AVX-NEXT:    ret <4 x i64> [[V3]]
+;
+  %p1 = getelementptr inbounds i8, i8* %p0, i64 1
+  %p2 = getelementptr inbounds i8, i8* %p0, i64 2
+  %p3 = getelementptr inbounds i8, i8* %p0, i64 3
+  %i0 = load i8, i8* %p0, align 1
+  %i1 = load i8, i8* %p1, align 1
+  %i2 = load i8, i8* %p2, align 1
+  %i3 = load i8, i8* %p3, align 1
+  %x0 = zext i8 %i0 to i64
+  %x1 = zext i8 %i1 to i64
+  %x2 = zext i8 %i2 to i64
+  %x3 = zext i8 %i3 to i64
+  %v0 = insertelement <4 x i64> undef, i64 %x0, i32 0
+  %v1 = insertelement <4 x i64>   %v0, i64 %x1, i32 1
+  %v2 = insertelement <4 x i64>   %v1, i64 %x2, i32 2
+  %v3 = insertelement <4 x i64>   %v2, i64 %x3, i32 3
+  ret <4 x i64> %v3
+}
+
+define <8 x i16> @loadext_8i8_to_8i16(i8* %p0) {
+; CHECK-LABEL: @loadext_8i8_to_8i16(
+; CHECK-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; CHECK-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
+; CHECK-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
+; CHECK-NEXT:    [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4
+; CHECK-NEXT:    [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5
+; CHECK-NEXT:    [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6
+; CHECK-NEXT:    [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <8 x i8>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = zext <8 x i8> [[TMP2]] to <8 x i16>
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0
+; CHECK-NEXT:    [[V0:%.*]] = insertelement <8 x i16> undef, i16 [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <8 x i16> [[TMP3]], i32 1
+; CHECK-NEXT:    [[V1:%.*]] = insertelement <8 x i16> [[V0]], i16 [[TMP5]], i32 1
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <8 x i16> [[TMP3]], i32 2
+; CHECK-NEXT:    [[V2:%.*]] = insertelement <8 x i16> [[V1]], i16 [[TMP6]], i32 2
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <8 x i16> [[TMP3]], i32 3
+; CHECK-NEXT:    [[V3:%.*]] = insertelement <8 x i16> [[V2]], i16 [[TMP7]], i32 3
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <8 x i16> [[TMP3]], i32 4
+; CHECK-NEXT:    [[V4:%.*]] = insertelement <8 x i16> [[V3]], i16 [[TMP8]], i32 4
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <8 x i16> [[TMP3]], i32 5
+; CHECK-NEXT:    [[V5:%.*]] = insertelement <8 x i16> [[V4]], i16 [[TMP9]], i32 5
+; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <8 x i16> [[TMP3]], i32 6
+; CHECK-NEXT:    [[V6:%.*]] = insertelement <8 x i16> [[V5]], i16 [[TMP10]], i32 6
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <8 x i16> [[TMP3]], i32 7
+; CHECK-NEXT:    [[V7:%.*]] = insertelement <8 x i16> [[V6]], i16 [[TMP11]], i32 7
+; CHECK-NEXT:    ret <8 x i16> [[V7]]
+;
+  %p1 = getelementptr inbounds i8, i8* %p0, i64 1
+  %p2 = getelementptr inbounds i8, i8* %p0, i64 2
+  %p3 = getelementptr inbounds i8, i8* %p0, i64 3
+  %p4 = getelementptr inbounds i8, i8* %p0, i64 4
+  %p5 = getelementptr inbounds i8, i8* %p0, i64 5
+  %p6 = getelementptr inbounds i8, i8* %p0, i64 6
+  %p7 = getelementptr inbounds i8, i8* %p0, i64 7
+  %i0 = load i8, i8* %p0, align 1
+  %i1 = load i8, i8* %p1, align 1
+  %i2 = load i8, i8* %p2, align 1
+  %i3 = load i8, i8* %p3, align 1
+  %i4 = load i8, i8* %p4, align 1
+  %i5 = load i8, i8* %p5, align 1
+  %i6 = load i8, i8* %p6, align 1
+  %i7 = load i8, i8* %p7, align 1
+  %x0 = zext i8 %i0 to i16
+  %x1 = zext i8 %i1 to i16
+  %x2 = zext i8 %i2 to i16
+  %x3 = zext i8 %i3 to i16
+  %x4 = zext i8 %i4 to i16
+  %x5 = zext i8 %i5 to i16
+  %x6 = zext i8 %i6 to i16
+  %x7 = zext i8 %i7 to i16
+  %v0 = insertelement <8 x i16> undef, i16 %x0, i32 0
+  %v1 = insertelement <8 x i16>   %v0, i16 %x1, i32 1
+  %v2 = insertelement <8 x i16>   %v1, i16 %x2, i32 2
+  %v3 = insertelement <8 x i16>   %v2, i16 %x3, i32 3
+  %v4 = insertelement <8 x i16>   %v3, i16 %x4, i32 4
+  %v5 = insertelement <8 x i16>   %v4, i16 %x5, i32 5
+  %v6 = insertelement <8 x i16>   %v5, i16 %x6, i32 6
+  %v7 = insertelement <8 x i16>   %v6, i16 %x7, i32 7
+  ret <8 x i16> %v7
+}
+
+define <8 x i32> @loadext_8i8_to_8i32(i8* %p0) {
+; CHECK-LABEL: @loadext_8i8_to_8i32(
+; CHECK-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; CHECK-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
+; CHECK-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
+; CHECK-NEXT:    [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4
+; CHECK-NEXT:    [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5
+; CHECK-NEXT:    [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6
+; CHECK-NEXT:    [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <8 x i8>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = zext <8 x i8> [[TMP2]] to <8 x i32>
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <8 x i32> [[TMP3]], i32 0
+; CHECK-NEXT:    [[V0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <8 x i32> [[TMP3]], i32 1
+; CHECK-NEXT:    [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[TMP5]], i32 1
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <8 x i32> [[TMP3]], i32 2
+; CHECK-NEXT:    [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[TMP6]], i32 2
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <8 x i32> [[TMP3]], i32 3
+; CHECK-NEXT:    [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[TMP7]], i32 3
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <8 x i32> [[TMP3]], i32 4
+; CHECK-NEXT:    [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[TMP8]], i32 4
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <8 x i32> [[TMP3]], i32 5
+; CHECK-NEXT:    [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[TMP9]], i32 5
+; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <8 x i32> [[TMP3]], i32 6
+; CHECK-NEXT:    [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[TMP10]], i32 6
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <8 x i32> [[TMP3]], i32 7
+; CHECK-NEXT:    [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[TMP11]], i32 7
+; CHECK-NEXT:    ret <8 x i32> [[V7]]
+;
+  %p1 = getelementptr inbounds i8, i8* %p0, i64 1
+  %p2 = getelementptr inbounds i8, i8* %p0, i64 2
+  %p3 = getelementptr inbounds i8, i8* %p0, i64 3
+  %p4 = getelementptr inbounds i8, i8* %p0, i64 4
+  %p5 = getelementptr inbounds i8, i8* %p0, i64 5
+  %p6 = getelementptr inbounds i8, i8* %p0, i64 6
+  %p7 = getelementptr inbounds i8, i8* %p0, i64 7
+  %i0 = load i8, i8* %p0, align 1
+  %i1 = load i8, i8* %p1, align 1
+  %i2 = load i8, i8* %p2, align 1
+  %i3 = load i8, i8* %p3, align 1
+  %i4 = load i8, i8* %p4, align 1
+  %i5 = load i8, i8* %p5, align 1
+  %i6 = load i8, i8* %p6, align 1
+  %i7 = load i8, i8* %p7, align 1
+  %x0 = zext i8 %i0 to i32
+  %x1 = zext i8 %i1 to i32
+  %x2 = zext i8 %i2 to i32
+  %x3 = zext i8 %i3 to i32
+  %x4 = zext i8 %i4 to i32
+  %x5 = zext i8 %i5 to i32
+  %x6 = zext i8 %i6 to i32
+  %x7 = zext i8 %i7 to i32
+  %v0 = insertelement <8 x i32> undef, i32 %x0, i32 0
+  %v1 = insertelement <8 x i32>   %v0, i32 %x1, i32 1
+  %v2 = insertelement <8 x i32>   %v1, i32 %x2, i32 2
+  %v3 = insertelement <8 x i32>   %v2, i32 %x3, i32 3
+  %v4 = insertelement <8 x i32>   %v3, i32 %x4, i32 4
+  %v5 = insertelement <8 x i32>   %v4, i32 %x5, i32 5
+  %v6 = insertelement <8 x i32>   %v5, i32 %x6, i32 6
+  %v7 = insertelement <8 x i32>   %v6, i32 %x7, i32 7
+  ret <8 x i32> %v7
+}
+
+define <16 x i16> @loadext_16i8_to_16i16(i8* %p0) {
+; CHECK-LABEL: @loadext_16i8_to_16i16(
+; CHECK-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; CHECK-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
+; CHECK-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
+; CHECK-NEXT:    [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4
+; CHECK-NEXT:    [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5
+; CHECK-NEXT:    [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6
+; CHECK-NEXT:    [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7
+; CHECK-NEXT:    [[P8:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 8
+; CHECK-NEXT:    [[P9:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 9
+; CHECK-NEXT:    [[P10:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 10
+; CHECK-NEXT:    [[P11:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 11
+; CHECK-NEXT:    [[P12:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 12
+; CHECK-NEXT:    [[P13:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 13
+; CHECK-NEXT:    [[P14:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 14
+; CHECK-NEXT:    [[P15:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 15
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <16 x i8>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[TMP1]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = zext <16 x i8> [[TMP2]] to <16 x i16>
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <16 x i16> [[TMP3]], i32 0
+; CHECK-NEXT:    [[V0:%.*]] = insertelement <16 x i16> undef, i16 [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <16 x i16> [[TMP3]], i32 1
+; CHECK-NEXT:    [[V1:%.*]] = insertelement <16 x i16> [[V0]], i16 [[TMP5]], i32 1
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <16 x i16> [[TMP3]], i32 2
+; CHECK-NEXT:    [[V2:%.*]] = insertelement <16 x i16> [[V1]], i16 [[TMP6]], i32 2
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <16 x i16> [[TMP3]], i32 3
+; CHECK-NEXT:    [[V3:%.*]] = insertelement <16 x i16> [[V2]], i16 [[TMP7]], i32 3
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <16 x i16> [[TMP3]], i32 4
+; CHECK-NEXT:    [[V4:%.*]] = insertelement <16 x i16> [[V3]], i16 [[TMP8]], i32 4
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <16 x i16> [[TMP3]], i32 5
+; CHECK-NEXT:    [[V5:%.*]] = insertelement <16 x i16> [[V4]], i16 [[TMP9]], i32 5
+; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <16 x i16> [[TMP3]], i32 6
+; CHECK-NEXT:    [[V6:%.*]] = insertelement <16 x i16> [[V5]], i16 [[TMP10]], i32 6
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <16 x i16> [[TMP3]], i32 7
+; CHECK-NEXT:    [[V7:%.*]] = insertelement <16 x i16> [[V6]], i16 [[TMP11]], i32 7
+; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <16 x i16> [[TMP3]], i32 8
+; CHECK-NEXT:    [[V8:%.*]] = insertelement <16 x i16> [[V7]], i16 [[TMP12]], i32 8
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <16 x i16> [[TMP3]], i32 9
+; CHECK-NEXT:    [[V9:%.*]] = insertelement <16 x i16> [[V8]], i16 [[TMP13]], i32 9
+; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <16 x i16> [[TMP3]], i32 10
+; CHECK-NEXT:    [[V10:%.*]] = insertelement <16 x i16> [[V9]], i16 [[TMP14]], i32 10
+; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <16 x i16> [[TMP3]], i32 11
+; CHECK-NEXT:    [[V11:%.*]] = insertelement <16 x i16> [[V10]], i16 [[TMP15]], i32 11
+; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <16 x i16> [[TMP3]], i32 12
+; CHECK-NEXT:    [[V12:%.*]] = insertelement <16 x i16> [[V11]], i16 [[TMP16]], i32 12
+; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <16 x i16> [[TMP3]], i32 13
+; CHECK-NEXT:    [[V13:%.*]] = insertelement <16 x i16> [[V12]], i16 [[TMP17]], i32 13
+; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <16 x i16> [[TMP3]], i32 14
+; CHECK-NEXT:    [[V14:%.*]] = insertelement <16 x i16> [[V13]], i16 [[TMP18]], i32 14
+; CHECK-NEXT:    [[TMP19:%.*]] = extractelement <16 x i16> [[TMP3]], i32 15
+; CHECK-NEXT:    [[V15:%.*]] = insertelement <16 x i16> [[V14]], i16 [[TMP19]], i32 15
+; CHECK-NEXT:    ret <16 x i16> [[V15]]
+;
+  %p1  = getelementptr inbounds i8, i8* %p0, i64 1
+  %p2  = getelementptr inbounds i8, i8* %p0, i64 2
+  %p3  = getelementptr inbounds i8, i8* %p0, i64 3
+  %p4  = getelementptr inbounds i8, i8* %p0, i64 4
+  %p5  = getelementptr inbounds i8, i8* %p0, i64 5
+  %p6  = getelementptr inbounds i8, i8* %p0, i64 6
+  %p7  = getelementptr inbounds i8, i8* %p0, i64 7
+  %p8  = getelementptr inbounds i8, i8* %p0, i64 8
+  %p9  = getelementptr inbounds i8, i8* %p0, i64 9
+  %p10 = getelementptr inbounds i8, i8* %p0, i64 10
+  %p11 = getelementptr inbounds i8, i8* %p0, i64 11
+  %p12 = getelementptr inbounds i8, i8* %p0, i64 12
+  %p13 = getelementptr inbounds i8, i8* %p0, i64 13
+  %p14 = getelementptr inbounds i8, i8* %p0, i64 14
+  %p15 = getelementptr inbounds i8, i8* %p0, i64 15
+  %i0  = load i8, i8* %p0,  align 1
+  %i1  = load i8, i8* %p1,  align 1
+  %i2  = load i8, i8* %p2,  align 1
+  %i3  = load i8, i8* %p3,  align 1
+  %i4  = load i8, i8* %p4,  align 1
+  %i5  = load i8, i8* %p5,  align 1
+  %i6  = load i8, i8* %p6,  align 1
+  %i7  = load i8, i8* %p7,  align 1
+  %i8  = load i8, i8* %p8,  align 1
+  %i9  = load i8, i8* %p9,  align 1
+  %i10 = load i8, i8* %p10, align 1
+  %i11 = load i8, i8* %p11, align 1
+  %i12 = load i8, i8* %p12, align 1
+  %i13 = load i8, i8* %p13, align 1
+  %i14 = load i8, i8* %p14, align 1
+  %i15 = load i8, i8* %p15, align 1
+  %x0  = zext i8 %i0  to i16
+  %x1  = zext i8 %i1  to i16
+  %x2  = zext i8 %i2  to i16
+  %x3  = zext i8 %i3  to i16
+  %x4  = zext i8 %i4  to i16
+  %x5  = zext i8 %i5  to i16
+  %x6  = zext i8 %i6  to i16
+  %x7  = zext i8 %i7  to i16
+  %x8  = zext i8 %i8  to i16
+  %x9  = zext i8 %i9  to i16
+  %x10 = zext i8 %i10 to i16
+  %x11 = zext i8 %i11 to i16
+  %x12 = zext i8 %i12 to i16
+  %x13 = zext i8 %i13 to i16
+  %x14 = zext i8 %i14 to i16
+  %x15 = zext i8 %i15 to i16
+  %v0  = insertelement <16 x i16> undef, i16 %x0,  i32 0
+  %v1  = insertelement <16 x i16>  %v0,  i16 %x1,  i32 1
+  %v2  = insertelement <16 x i16>  %v1,  i16 %x2,  i32 2
+  %v3  = insertelement <16 x i16>  %v2,  i16 %x3,  i32 3
+  %v4  = insertelement <16 x i16>  %v3,  i16 %x4,  i32 4
+  %v5  = insertelement <16 x i16>  %v4,  i16 %x5,  i32 5
+  %v6  = insertelement <16 x i16>  %v5,  i16 %x6,  i32 6
+  %v7  = insertelement <16 x i16>  %v6,  i16 %x7,  i32 7
+  %v8  = insertelement <16 x i16>  %v7,  i16 %x8,  i32 8
+  %v9  = insertelement <16 x i16>  %v8,  i16 %x9,  i32 9
+  %v10 = insertelement <16 x i16>  %v9,  i16 %x10, i32 10
+  %v11 = insertelement <16 x i16>  %v10, i16 %x11, i32 11
+  %v12 = insertelement <16 x i16>  %v11, i16 %x12, i32 12
+  %v13 = insertelement <16 x i16>  %v12, i16 %x13, i32 13
+  %v14 = insertelement <16 x i16>  %v13, i16 %x14, i32 14
+  %v15 = insertelement <16 x i16>  %v14, i16 %x15, i32 15
+  ret <16 x i16> %v15
+}
+
+;
+; vXi16
+;
+
+define <2 x i64> @loadext_2i16_to_2i64(i16* %p0) {
+; SSE2-LABEL: @loadext_2i16_to_2i64(
+; SSE2-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
+; SSE2-NEXT:    [[I0:%.*]] = load i16, i16* [[P0]], align 1
+; SSE2-NEXT:    [[I1:%.*]] = load i16, i16* [[P1]], align 1
+; SSE2-NEXT:    [[X0:%.*]] = zext i16 [[I0]] to i64
+; SSE2-NEXT:    [[X1:%.*]] = zext i16 [[I1]] to i64
+; SSE2-NEXT:    [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[X0]], i32 0
+; SSE2-NEXT:    [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[X1]], i32 1
+; SSE2-NEXT:    ret <2 x i64> [[V1]]
+;
+; SLM-LABEL: @loadext_2i16_to_2i64(
+; SLM-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
+; SLM-NEXT:    [[TMP1:%.*]] = bitcast i16* [[P0]] to <2 x i16>*
+; SLM-NEXT:    [[TMP2:%.*]] = load <2 x i16>, <2 x i16>* [[TMP1]], align 1
+; SLM-NEXT:    [[TMP3:%.*]] = zext <2 x i16> [[TMP2]] to <2 x i64>
+; SLM-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
+; SLM-NEXT:    [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[TMP4]], i32 0
+; SLM-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
+; SLM-NEXT:    [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1
+; SLM-NEXT:    ret <2 x i64> [[V1]]
+;
+; AVX-LABEL: @loadext_2i16_to_2i64(
+; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
+; AVX-NEXT:    [[TMP1:%.*]] = bitcast i16* [[P0]] to <2 x i16>*
+; AVX-NEXT:    [[TMP2:%.*]] = load <2 x i16>, <2 x i16>* [[TMP1]], align 1
+; AVX-NEXT:    [[TMP3:%.*]] = zext <2 x i16> [[TMP2]] to <2 x i64>
+; AVX-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
+; AVX-NEXT:    [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[TMP4]], i32 0
+; AVX-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
+; AVX-NEXT:    [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1
+; AVX-NEXT:    ret <2 x i64> [[V1]]
+;
+  %p1 = getelementptr inbounds i16, i16* %p0, i64 1
+  %i0 = load i16, i16* %p0, align 1
+  %i1 = load i16, i16* %p1, align 1
+  %x0 = zext i16 %i0 to i64
+  %x1 = zext i16 %i1 to i64
+  %v0 = insertelement <2 x i64> undef, i64 %x0, i32 0
+  %v1 = insertelement <2 x i64>   %v0, i64 %x1, i32 1
+  ret <2 x i64> %v1
+}
+
+define <4 x i32> @loadext_4i16_to_4i32(i16* %p0) {
+; CHECK-LABEL: @loadext_4i16_to_4i32(
+; CHECK-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
+; CHECK-NEXT:    [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
+; CHECK-NEXT:    [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i32>
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0
+; CHECK-NEXT:    [[V0:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1
+; CHECK-NEXT:    [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[TMP5]], i32 1
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2
+; CHECK-NEXT:    [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[TMP6]], i32 2
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3
+; CHECK-NEXT:    [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[TMP7]], i32 3
+; CHECK-NEXT:    ret <4 x i32> [[V3]]
+;
+  %p1 = getelementptr inbounds i16, i16* %p0, i64 1
+  %p2 = getelementptr inbounds i16, i16* %p0, i64 2
+  %p3 = getelementptr inbounds i16, i16* %p0, i64 3
+  %i0 = load i16, i16* %p0, align 1
+  %i1 = load i16, i16* %p1, align 1
+  %i2 = load i16, i16* %p2, align 1
+  %i3 = load i16, i16* %p3, align 1
+  %x0 = zext i16 %i0 to i32
+  %x1 = zext i16 %i1 to i32
+  %x2 = zext i16 %i2 to i32
+  %x3 = zext i16 %i3 to i32
+  %v0 = insertelement <4 x i32> undef, i32 %x0, i32 0
+  %v1 = insertelement <4 x i32>   %v0, i32 %x1, i32 1
+  %v2 = insertelement <4 x i32>   %v1, i32 %x2, i32 2
+  %v3 = insertelement <4 x i32>   %v2, i32 %x3, i32 3
+  ret <4 x i32> %v3
+}
+
+define <4 x i64> @loadext_4i16_to_4i64(i16* %p0) {
+; SSE2-LABEL: @loadext_4i16_to_4i64(
+; SSE2-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
+; SSE2-NEXT:    [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
+; SSE2-NEXT:    [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
+; SSE2-NEXT:    [[I0:%.*]] = load i16, i16* [[P0]], align 1
+; SSE2-NEXT:    [[I1:%.*]] = load i16, i16* [[P1]], align 1
+; SSE2-NEXT:    [[I2:%.*]] = load i16, i16* [[P2]], align 1
+; SSE2-NEXT:    [[I3:%.*]] = load i16, i16* [[P3]], align 1
+; SSE2-NEXT:    [[X0:%.*]] = zext i16 [[I0]] to i64
+; SSE2-NEXT:    [[X1:%.*]] = zext i16 [[I1]] to i64
+; SSE2-NEXT:    [[X2:%.*]] = zext i16 [[I2]] to i64
+; SSE2-NEXT:    [[X3:%.*]] = zext i16 [[I3]] to i64
+; SSE2-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[X0]], i32 0
+; SSE2-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[X1]], i32 1
+; SSE2-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2
+; SSE2-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
+; SSE2-NEXT:    ret <4 x i64> [[V3]]
+;
+; SLM-LABEL: @loadext_4i16_to_4i64(
+; SLM-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
+; SLM-NEXT:    [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
+; SLM-NEXT:    [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
+; SLM-NEXT:    [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>*
+; SLM-NEXT:    [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1
+; SLM-NEXT:    [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i64>
+; SLM-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
+; SLM-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
+; SLM-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
+; SLM-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
+; SLM-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
+; SLM-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
+; SLM-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
+; SLM-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
+; SLM-NEXT:    ret <4 x i64> [[V3]]
+;
+; AVX-LABEL: @loadext_4i16_to_4i64(
+; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
+; AVX-NEXT:    [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
+; AVX-NEXT:    [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
+; AVX-NEXT:    [[TMP1:%.*]] = bitcast i16* [[P0]] to <2 x i16>*
+; AVX-NEXT:    [[TMP2:%.*]] = load <2 x i16>, <2 x i16>* [[TMP1]], align 1
+; AVX-NEXT:    [[I2:%.*]] = load i16, i16* [[P2]], align 1
+; AVX-NEXT:    [[I3:%.*]] = load i16, i16* [[P3]], align 1
+; AVX-NEXT:    [[TMP3:%.*]] = zext <2 x i16> [[TMP2]] to <2 x i64>
+; AVX-NEXT:    [[X2:%.*]] = zext i16 [[I2]] to i64
+; AVX-NEXT:    [[X3:%.*]] = zext i16 [[I3]] to i64
+; AVX-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
+; AVX-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
+; AVX-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
+; AVX-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
+; AVX-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2
+; AVX-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
+; AVX-NEXT:    ret <4 x i64> [[V3]]
+;
+  %p1 = getelementptr inbounds i16, i16* %p0, i64 1
+  %p2 = getelementptr inbounds i16, i16* %p0, i64 2
+  %p3 = getelementptr inbounds i16, i16* %p0, i64 3
+  %i0 = load i16, i16* %p0, align 1
+  %i1 = load i16, i16* %p1, align 1
+  %i2 = load i16, i16* %p2, align 1
+  %i3 = load i16, i16* %p3, align 1
+  %x0 = zext i16 %i0 to i64
+  %x1 = zext i16 %i1 to i64
+  %x2 = zext i16 %i2 to i64
+  %x3 = zext i16 %i3 to i64
+  %v0 = insertelement <4 x i64> undef, i64 %x0, i32 0
+  %v1 = insertelement <4 x i64>   %v0, i64 %x1, i32 1
+  %v2 = insertelement <4 x i64>   %v1, i64 %x2, i32 2
+  %v3 = insertelement <4 x i64>   %v2, i64 %x3, i32 3
+  ret <4 x i64> %v3
+}
+
+define <8 x i32> @loadext_8i16_to_8i32(i16* %p0) {
+; CHECK-LABEL: @loadext_8i16_to_8i32(
+; CHECK-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
+; CHECK-NEXT:    [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
+; CHECK-NEXT:    [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
+; CHECK-NEXT:    [[P4:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 4
+; CHECK-NEXT:    [[P5:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 5
+; CHECK-NEXT:    [[P6:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 6
+; CHECK-NEXT:    [[P7:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 7
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16* [[P0]] to <8 x i16>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = zext <8 x i16> [[TMP2]] to <8 x i32>
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <8 x i32> [[TMP3]], i32 0
+; CHECK-NEXT:    [[V0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <8 x i32> [[TMP3]], i32 1
+; CHECK-NEXT:    [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[TMP5]], i32 1
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <8 x i32> [[TMP3]], i32 2
+; CHECK-NEXT:    [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[TMP6]], i32 2
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <8 x i32> [[TMP3]], i32 3
+; CHECK-NEXT:    [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[TMP7]], i32 3
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <8 x i32> [[TMP3]], i32 4
+; CHECK-NEXT:    [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[TMP8]], i32 4
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <8 x i32> [[TMP3]], i32 5
+; CHECK-NEXT:    [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[TMP9]], i32 5
+; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <8 x i32> [[TMP3]], i32 6
+; CHECK-NEXT:    [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[TMP10]], i32 6
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <8 x i32> [[TMP3]], i32 7
+; CHECK-NEXT:    [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[TMP11]], i32 7
+; CHECK-NEXT:    ret <8 x i32> [[V7]]
+;
+  %p1 = getelementptr inbounds i16, i16* %p0, i64 1
+  %p2 = getelementptr inbounds i16, i16* %p0, i64 2
+  %p3 = getelementptr inbounds i16, i16* %p0, i64 3
+  %p4 = getelementptr inbounds i16, i16* %p0, i64 4
+  %p5 = getelementptr inbounds i16, i16* %p0, i64 5
+  %p6 = getelementptr inbounds i16, i16* %p0, i64 6
+  %p7 = getelementptr inbounds i16, i16* %p0, i64 7
+  %i0 = load i16, i16* %p0, align 1
+  %i1 = load i16, i16* %p1, align 1
+  %i2 = load i16, i16* %p2, align 1
+  %i3 = load i16, i16* %p3, align 1
+  %i4 = load i16, i16* %p4, align 1
+  %i5 = load i16, i16* %p5, align 1
+  %i6 = load i16, i16* %p6, align 1
+  %i7 = load i16, i16* %p7, align 1
+  %x0 = zext i16 %i0 to i32
+  %x1 = zext i16 %i1 to i32
+  %x2 = zext i16 %i2 to i32
+  %x3 = zext i16 %i3 to i32
+  %x4 = zext i16 %i4 to i32
+  %x5 = zext i16 %i5 to i32
+  %x6 = zext i16 %i6 to i32
+  %x7 = zext i16 %i7 to i32
+  %v0 = insertelement <8 x i32> undef, i32 %x0, i32 0
+  %v1 = insertelement <8 x i32>   %v0, i32 %x1, i32 1
+  %v2 = insertelement <8 x i32>   %v1, i32 %x2, i32 2
+  %v3 = insertelement <8 x i32>   %v2, i32 %x3, i32 3
+  %v4 = insertelement <8 x i32>   %v3, i32 %x4, i32 4
+  %v5 = insertelement <8 x i32>   %v4, i32 %x5, i32 5
+  %v6 = insertelement <8 x i32>   %v5, i32 %x6, i32 6
+  %v7 = insertelement <8 x i32>   %v6, i32 %x7, i32 7
+  ret <8 x i32> %v7
+}
+
+;
+; vXi32
+;
+
+define <2 x i64> @loadext_2i32_to_2i64(i32* %p0) {
+; SSE2-LABEL: @loadext_2i32_to_2i64(
+; SSE2-NEXT:    [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
+; SSE2-NEXT:    [[I0:%.*]] = load i32, i32* [[P0]], align 1
+; SSE2-NEXT:    [[I1:%.*]] = load i32, i32* [[P1]], align 1
+; SSE2-NEXT:    [[X0:%.*]] = zext i32 [[I0]] to i64
+; SSE2-NEXT:    [[X1:%.*]] = zext i32 [[I1]] to i64
+; SSE2-NEXT:    [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[X0]], i32 0
+; SSE2-NEXT:    [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[X1]], i32 1
+; SSE2-NEXT:    ret <2 x i64> [[V1]]
+;
+; SLM-LABEL: @loadext_2i32_to_2i64(
+; SLM-NEXT:    [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
+; SLM-NEXT:    [[TMP1:%.*]] = bitcast i32* [[P0]] to <2 x i32>*
+; SLM-NEXT:    [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 1
+; SLM-NEXT:    [[TMP3:%.*]] = zext <2 x i32> [[TMP2]] to <2 x i64>
+; SLM-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
+; SLM-NEXT:    [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[TMP4]], i32 0
+; SLM-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
+; SLM-NEXT:    [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1
+; SLM-NEXT:    ret <2 x i64> [[V1]]
+;
+; AVX-LABEL: @loadext_2i32_to_2i64(
+; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
+; AVX-NEXT:    [[TMP1:%.*]] = bitcast i32* [[P0]] to <2 x i32>*
+; AVX-NEXT:    [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 1
+; AVX-NEXT:    [[TMP3:%.*]] = zext <2 x i32> [[TMP2]] to <2 x i64>
+; AVX-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
+; AVX-NEXT:    [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[TMP4]], i32 0
+; AVX-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
+; AVX-NEXT:    [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1
+; AVX-NEXT:    ret <2 x i64> [[V1]]
+;
+  %p1 = getelementptr inbounds i32, i32* %p0, i64 1
+  %i0 = load i32, i32* %p0, align 1
+  %i1 = load i32, i32* %p1, align 1
+  %x0 = zext i32 %i0 to i64
+  %x1 = zext i32 %i1 to i64
+  %v0 = insertelement <2 x i64> undef, i64 %x0, i32 0
+  %v1 = insertelement <2 x i64>   %v0, i64 %x1, i32 1
+  ret <2 x i64> %v1
+}
+
+define <4 x i64> @loadext_4i32_to_4i64(i32* %p0) {
+; SSE2-LABEL: @loadext_4i32_to_4i64(
+; SSE2-NEXT:    [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
+; SSE2-NEXT:    [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2
+; SSE2-NEXT:    [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3
+; SSE2-NEXT:    [[I0:%.*]] = load i32, i32* [[P0]], align 1
+; SSE2-NEXT:    [[I1:%.*]] = load i32, i32* [[P1]], align 1
+; SSE2-NEXT:    [[I2:%.*]] = load i32, i32* [[P2]], align 1
+; SSE2-NEXT:    [[I3:%.*]] = load i32, i32* [[P3]], align 1
+; SSE2-NEXT:    [[X0:%.*]] = zext i32 [[I0]] to i64
+; SSE2-NEXT:    [[X1:%.*]] = zext i32 [[I1]] to i64
+; SSE2-NEXT:    [[X2:%.*]] = zext i32 [[I2]] to i64
+; SSE2-NEXT:    [[X3:%.*]] = zext i32 [[I3]] to i64
+; SSE2-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[X0]], i32 0
+; SSE2-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[X1]], i32 1
+; SSE2-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2
+; SSE2-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
+; SSE2-NEXT:    ret <4 x i64> [[V3]]
+;
+; SLM-LABEL: @loadext_4i32_to_4i64(
+; SLM-NEXT:    [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
+; SLM-NEXT:    [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2
+; SLM-NEXT:    [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3
+; SLM-NEXT:    [[TMP1:%.*]] = bitcast i32* [[P0]] to <4 x i32>*
+; SLM-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 1
+; SLM-NEXT:    [[TMP3:%.*]] = zext <4 x i32> [[TMP2]] to <4 x i64>
+; SLM-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
+; SLM-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
+; SLM-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
+; SLM-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
+; SLM-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
+; SLM-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
+; SLM-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
+; SLM-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
+; SLM-NEXT:    ret <4 x i64> [[V3]]
+;
+; AVX1-LABEL: @loadext_4i32_to_4i64(
+; AVX1-NEXT:    [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
+; AVX1-NEXT:    [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2
+; AVX1-NEXT:    [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3
+; AVX1-NEXT:    [[TMP1:%.*]] = bitcast i32* [[P0]] to <2 x i32>*
+; AVX1-NEXT:    [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 1
+; AVX1-NEXT:    [[I2:%.*]] = load i32, i32* [[P2]], align 1
+; AVX1-NEXT:    [[I3:%.*]] = load i32, i32* [[P3]], align 1
+; AVX1-NEXT:    [[TMP3:%.*]] = zext <2 x i32> [[TMP2]] to <2 x i64>
+; AVX1-NEXT:    [[X2:%.*]] = zext i32 [[I2]] to i64
+; AVX1-NEXT:    [[X3:%.*]] = zext i32 [[I3]] to i64
+; AVX1-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
+; AVX1-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
+; AVX1-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
+; AVX1-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
+; AVX1-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2
+; AVX1-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
+; AVX1-NEXT:    ret <4 x i64> [[V3]]
+;
+; AVX2-LABEL: @loadext_4i32_to_4i64(
+; AVX2-NEXT:    [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
+; AVX2-NEXT:    [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2
+; AVX2-NEXT:    [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3
+; AVX2-NEXT:    [[TMP1:%.*]] = bitcast i32* [[P0]] to <4 x i32>*
+; AVX2-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 1
+; AVX2-NEXT:    [[TMP3:%.*]] = zext <4 x i32> [[TMP2]] to <4 x i64>
+; AVX2-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
+; AVX2-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
+; AVX2-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
+; AVX2-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
+; AVX2-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
+; AVX2-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
+; AVX2-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
+; AVX2-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
+; AVX2-NEXT:    ret <4 x i64> [[V3]]
+;
+; AVX512-LABEL: @loadext_4i32_to_4i64(
+; AVX512-NEXT:    [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
+; AVX512-NEXT:    [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2
+; AVX512-NEXT:    [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3
+; AVX512-NEXT:    [[TMP1:%.*]] = bitcast i32* [[P0]] to <4 x i32>*
+; AVX512-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 1
+; AVX512-NEXT:    [[TMP3:%.*]] = zext <4 x i32> [[TMP2]] to <4 x i64>
+; AVX512-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
+; AVX512-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
+; AVX512-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
+; AVX512-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
+; AVX512-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
+; AVX512-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
+; AVX512-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
+; AVX512-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
+; AVX512-NEXT:    ret <4 x i64> [[V3]]
+;
+  %p1 = getelementptr inbounds i32, i32* %p0, i64 1
+  %p2 = getelementptr inbounds i32, i32* %p0, i64 2
+  %p3 = getelementptr inbounds i32, i32* %p0, i64 3
+  %i0 = load i32, i32* %p0, align 1
+  %i1 = load i32, i32* %p1, align 1
+  %i2 = load i32, i32* %p2, align 1
+  %i3 = load i32, i32* %p3, align 1
+  %x0 = zext i32 %i0 to i64
+  %x1 = zext i32 %i1 to i64
+  %x2 = zext i32 %i2 to i64
+  %x3 = zext i32 %i3 to i64
+  %v0 = insertelement <4 x i64> undef, i64 %x0, i32 0
+  %v1 = insertelement <4 x i64>   %v0, i64 %x1, i32 1
+  %v2 = insertelement <4 x i64>   %v1, i64 %x2, i32 2
+  %v3 = insertelement <4 x i64>   %v2, i64 %x3, i32 3
+  ret <4 x i64> %v3
+}




More information about the llvm-commits mailing list