[llvm] 2078c4e - [X86] Lower insertions into the upper half of a 256-bit vector as broadcast+blend (PR50971)
Roman Lebedev via llvm-commits
llvm-commits at lists.llvm.org
Tue Aug 17 08:45:20 PDT 2021
Author: Roman Lebedev
Date: 2021-08-17T18:45:10+03:00
New Revision: 2078c4ecfda80f802febc4f98e4a163656093c43
URL: https://github.com/llvm/llvm-project/commit/2078c4ecfda80f802febc4f98e4a163656093c43
DIFF: https://github.com/llvm/llvm-project/commit/2078c4ecfda80f802febc4f98e4a163656093c43.diff
LOG: [X86] Lower insertions into the upper half of a 256-bit vector as broadcast+blend (PR50971)
A broadcast + blend is not worse than an extract + insert of the subvector.
https://godbolt.org/z/aPq98G6Yh
Reviewed By: RKSimon
Differential Revision: https://reviews.llvm.org/D105390
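For illustration, here is the <8 x float> case from the updated avx-insertelt.ll test below (IR and assembly are taken from that diff): inserting a scalar into element 4, i.e. the first element of the high 128-bit half, previously lowered as extract + blend + insert of the high subvector, and with this change it becomes a single broadcast + blend on AVX2 (AVX1 keeps the old sequence).

define <8 x float> @insert_f32_firstelt_of_high_subvector(<8 x float> %x, float %s) {
  %i0 = insertelement <8 x float> %x, float %s, i32 4
  ret <8 x float> %i0
}

; before (AVX and AVX2):
;   vextractf128 $1, %ymm0, %xmm2
;   vblendps     xmm1 = xmm1[0],xmm2[1,2,3]
;   vinsertf128  $1, %xmm1, %ymm0, %ymm0
; after (AVX2 only):
;   vbroadcastss %xmm1, %ymm1
;   vblendps     ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7]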
Added:
Modified:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/avx-insertelt.ll
llvm/test/CodeGen/X86/avx2-masked-gather.ll
llvm/test/CodeGen/X86/avx512-insert-extract.ll
llvm/test/CodeGen/X86/avx512-masked-memop-64-32.ll
llvm/test/CodeGen/X86/insertelement-shuffle.ll
llvm/test/CodeGen/X86/masked_expandload.ll
llvm/test/CodeGen/X86/masked_gather.ll
llvm/test/CodeGen/X86/masked_gather_scatter.ll
llvm/test/CodeGen/X86/masked_load.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 319c4eeb4ed9..25f27a056d46 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -19190,12 +19190,27 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
}
}
+ unsigned NumEltsIn128 = 128 / EltSizeInBits;
+ assert(isPowerOf2_32(NumEltsIn128) &&
+ "Vectors will always have power-of-two number of elements.");
+
+ // If we are not inserting into the low 128-bit vector chunk,
+ // then prefer the broadcast+blend sequence.
+ // FIXME: relax the profitability check iff all N1 uses are insertions.
+ if (!VT.is128BitVector() && IdxVal >= NumEltsIn128 &&
+ ((Subtarget.hasAVX2() && EltSizeInBits != 8) ||
+ (Subtarget.hasAVX() && (EltSizeInBits >= 32) && MayFoldLoad(N1)))) {
+ SDValue N1SplatVec = DAG.getSplatBuildVector(VT, dl, N1);
+ SmallVector<int, 8> BlendMask;
+ for (unsigned i = 0; i != NumElts; ++i)
+ BlendMask.push_back(i == IdxVal ? i + NumElts : i);
+ return DAG.getVectorShuffle(VT, dl, N0, N1SplatVec, BlendMask);
+ }
+
// Get the desired 128-bit vector chunk.
SDValue V = extract128BitVector(N0, IdxVal, DAG, dl);
// Insert the element into the desired chunk.
- unsigned NumEltsIn128 = 128 / EltSizeInBits;
- assert(isPowerOf2_32(NumEltsIn128));
// Since NumEltsIn128 is a power of 2 we can use mask instead of modulo.
unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);
@@ -37977,6 +37992,13 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
if (Src.getOpcode() == ISD::SCALAR_TO_VECTOR)
return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Src.getOperand(0));
+ // broadcast(extract_vector_elt(x, 0)) -> broadcast(x).
+ if (Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+ isNullConstant(Src.getOperand(1)) &&
+ DAG.getTargetLoweringInfo().isTypeLegal(
+ Src.getOperand(0).getValueType()))
+ return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Src.getOperand(0));
+
// Share broadcast with the longest vector and extract low subvector (free).
// Ensure the same SDValue from the SDNode use is being used.
for (SDNode *User : Src->uses())
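The new lowering path above splats the scalar and then builds a blend mask that takes the inserted element's lane from the splat and every other lane from the original vector. A minimal standalone sketch (illustrative only, not part of the commit) of how that mask comes out for a hypothetical <8 x i32> insert at index 4:

#include <cstdio>
#include <vector>

// Mirrors the BlendMask loop in the new LowerINSERT_VECTOR_ELT path for an
// <8 x i32> insertelement at index 4 (the first lane of the high 128-bit half).
int main() {
  const unsigned EltSizeInBits = 32, NumElts = 8, IdxVal = 4;
  const unsigned NumEltsIn128 = 128 / EltSizeInBits; // 4 lanes per 128-bit chunk
  // The broadcast+blend path only fires for inserts above the low 128-bit chunk.
  const bool UsesBroadcastBlend = IdxVal >= NumEltsIn128;
  std::vector<int> BlendMask;
  for (unsigned i = 0; i != NumElts; ++i)
    BlendMask.push_back(i == IdxVal ? int(i + NumElts) : int(i));
  // Prints: broadcast+blend: 1, mask: 0 1 2 3 12 5 6 7
  // (lane 4 is element 12 of the N0/splat concatenation, i.e. lane 4 of the splat)
  std::printf("broadcast+blend: %d, mask:", UsesBroadcastBlend);
  for (int M : BlendMask)
    std::printf(" %d", M);
  std::printf("\n");
  return 0;
}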
diff --git a/llvm/test/CodeGen/X86/avx-insertelt.ll b/llvm/test/CodeGen/X86/avx-insertelt.ll
index 3f5d004841e8..1bca2df5d9ce 100644
--- a/llvm/test/CodeGen/X86/avx-insertelt.ll
+++ b/llvm/test/CodeGen/X86/avx-insertelt.ll
@@ -91,23 +91,35 @@ define <4 x i64> @insert_i64_firstelt_of_low_subvector(<4 x i64> %x, i64 %s) {
; 0'th element of high subvector insertion into an AVX register.
define <8 x float> @insert_f32_firstelt_of_high_subvector(<8 x float> %x, float %s) {
-; ALL-LABEL: insert_f32_firstelt_of_high_subvector:
-; ALL: # %bb.0:
-; ALL-NEXT: vextractf128 $1, %ymm0, %xmm2
-; ALL-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3]
-; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; ALL-NEXT: retq
+; AVX-LABEL: insert_f32_firstelt_of_high_subvector:
+; AVX: # %bb.0:
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3]
+; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-NEXT: retq
+;
+; AVX2-LABEL: insert_f32_firstelt_of_high_subvector:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vbroadcastss %xmm1, %ymm1
+; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7]
+; AVX2-NEXT: retq
%i0 = insertelement <8 x float> %x, float %s, i32 4
ret <8 x float> %i0
}
define <4 x double> @insert_f64_firstelt_of_high_subvector(<4 x double> %x, double %s) {
-; ALL-LABEL: insert_f64_firstelt_of_high_subvector:
-; ALL: # %bb.0:
-; ALL-NEXT: vextractf128 $1, %ymm0, %xmm2
-; ALL-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
-; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; ALL-NEXT: retq
+; AVX-LABEL: insert_f64_firstelt_of_high_subvector:
+; AVX: # %bb.0:
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
+; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-NEXT: retq
+;
+; AVX2-LABEL: insert_f64_firstelt_of_high_subvector:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vbroadcastsd %xmm1, %ymm1
+; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
+; AVX2-NEXT: retq
%i0 = insertelement <4 x double> %x, double %s, i32 2
ret <4 x double> %i0
}
@@ -140,9 +152,10 @@ define <16 x i16> @insert_i16_firstelt_of_high_subvector(<16 x i16> %x, i16 %s)
;
; AVX2-LABEL: insert_i16_firstelt_of_high_subvector:
; AVX2: # %bb.0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpinsrw $0, %edi, %xmm1, %xmm1
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vmovd %edi, %xmm1
+; AVX2-NEXT: vpbroadcastw %xmm1, %ymm1
+; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: retq
%i0 = insertelement <16 x i16> %x, i16 %s, i32 8
ret <16 x i16> %i0
@@ -158,9 +171,9 @@ define <8 x i32> @insert_i32_firstelt_of_high_subvector(<8 x i32> %x, i32 %s) {
;
; AVX2-LABEL: insert_i32_firstelt_of_high_subvector:
; AVX2: # %bb.0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpinsrd $0, %edi, %xmm1, %xmm1
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vmovd %edi, %xmm1
+; AVX2-NEXT: vpbroadcastd %xmm1, %ymm1
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7]
; AVX2-NEXT: retq
%i0 = insertelement <8 x i32> %x, i32 %s, i32 4
ret <8 x i32> %i0
@@ -176,9 +189,9 @@ define <4 x i64> @insert_i64_firstelt_of_high_subvector(<4 x i64> %x, i64 %s) {
;
; AVX2-LABEL: insert_i64_firstelt_of_high_subvector:
; AVX2: # %bb.0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpinsrq $0, %rdi, %xmm1, %xmm1
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vmovq %rdi, %xmm1
+; AVX2-NEXT: vpbroadcastq %xmm1, %ymm1
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
; AVX2-NEXT: retq
%i0 = insertelement <4 x i64> %x, i64 %s, i32 2
ret <4 x i64> %i0
@@ -187,26 +200,38 @@ define <4 x i64> @insert_i64_firstelt_of_high_subvector(<4 x i64> %x, i64 %s) {
; element insertion into 0'th element of both subvectors
define <8 x float> @insert_f32_firstelts(<8 x float> %x, float %s) {
-; ALL-LABEL: insert_f32_firstelts:
-; ALL: # %bb.0:
-; ALL-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0],xmm0[1,2,3]
-; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0
-; ALL-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
-; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
-; ALL-NEXT: retq
+; AVX-LABEL: insert_f32_firstelts:
+; AVX: # %bb.0:
+; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0],xmm0[1,2,3]
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; AVX-NEXT: retq
+;
+; AVX2-LABEL: insert_f32_firstelts:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vbroadcastss %xmm1, %ymm1
+; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7]
+; AVX2-NEXT: retq
%i0 = insertelement <8 x float> %x, float %s, i32 0
%i1 = insertelement <8 x float> %i0, float %s, i32 4
ret <8 x float> %i1
}
define <4 x double> @insert_f64_firstelts(<4 x double> %x, double %s) {
-; ALL-LABEL: insert_f64_firstelts:
-; ALL: # %bb.0:
-; ALL-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0,1],xmm0[2,3]
-; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0
-; ALL-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
-; ALL-NEXT: retq
+; AVX-LABEL: insert_f64_firstelts:
+; AVX: # %bb.0:
+; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0,1],xmm0[2,3]
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; AVX-NEXT: retq
+;
+; AVX2-LABEL: insert_f64_firstelts:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vbroadcastsd %xmm1, %ymm1
+; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX2-NEXT: retq
%i0 = insertelement <4 x double> %x, double %s, i32 0
%i1 = insertelement <4 x double> %i0, double %s, i32 2
ret <4 x double> %i1
@@ -245,9 +270,11 @@ define <16 x i16> @insert_i16_firstelts(<16 x i16> %x, i16 %s) {
; AVX2-LABEL: insert_i16_firstelts:
; AVX2: # %bb.0:
; AVX2-NEXT: vpinsrw $0, %edi, %xmm0, %xmm1
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT: vpinsrw $0, %edi, %xmm0, %xmm0
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-NEXT: vmovd %edi, %xmm1
+; AVX2-NEXT: vpbroadcastw %xmm1, %ymm1
+; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: retq
%i0 = insertelement <16 x i16> %x, i16 %s, i32 0
%i1 = insertelement <16 x i16> %i0, i16 %s, i32 8
@@ -266,10 +293,8 @@ define <8 x i32> @insert_i32_firstelts(<8 x i32> %x, i32 %s) {
; AVX2-LABEL: insert_i32_firstelts:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovd %edi, %xmm1
-; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3]
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT: vpinsrd $0, %edi, %xmm0, %xmm0
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: vpbroadcastd %xmm1, %ymm1
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7]
; AVX2-NEXT: retq
%i0 = insertelement <8 x i32> %x, i32 %s, i32 0
%i1 = insertelement <8 x i32> %i0, i32 %s, i32 4
@@ -288,9 +313,10 @@ define <4 x i64> @insert_i64_firstelts(<4 x i64> %x, i64 %s) {
; AVX2-LABEL: insert_i64_firstelts:
; AVX2: # %bb.0:
; AVX2-NEXT: vpinsrq $0, %rdi, %xmm0, %xmm1
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT: vpinsrq $0, %rdi, %xmm0, %xmm0
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-NEXT: vmovq %rdi, %xmm1
+; AVX2-NEXT: vpbroadcastq %xmm1, %ymm1
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
; AVX2-NEXT: retq
%i0 = insertelement <4 x i64> %x, i64 %s, i32 0
%i1 = insertelement <4 x i64> %i0, i64 %s, i32 2
@@ -300,23 +326,35 @@ define <4 x i64> @insert_i64_firstelts(<4 x i64> %x, i64 %s) {
; element insertion into two elements of high subvector
define <8 x float> @insert_f32_two_elts_of_high_subvector(<8 x float> %x, float %s) {
-; ALL-LABEL: insert_f32_two_elts_of_high_subvector:
-; ALL: # %bb.0:
-; ALL-NEXT: vextractf128 $1, %ymm0, %xmm2
-; ALL-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,0],xmm2[2,3]
-; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; ALL-NEXT: retq
+; AVX-LABEL: insert_f32_two_elts_of_high_subvector:
+; AVX: # %bb.0:
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,0],xmm2[2,3]
+; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-NEXT: retq
+;
+; AVX2-LABEL: insert_f32_two_elts_of_high_subvector:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vbroadcastss %xmm1, %ymm1
+; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
+; AVX2-NEXT: retq
%i0 = insertelement <8 x float> %x, float %s, i32 4
%i1 = insertelement <8 x float> %i0, float %s, i32 5
ret <8 x float> %i1
}
define <4 x double> @insert_f64_two_elts_of_high_subvector(<4 x double> %x, double %s) {
-; ALL-LABEL: insert_f64_two_elts_of_high_subvector:
-; ALL: # %bb.0:
-; ALL-NEXT: vmovddup {{.*#+}} xmm1 = xmm1[0,0]
-; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; ALL-NEXT: retq
+; AVX-LABEL: insert_f64_two_elts_of_high_subvector:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovddup {{.*#+}} xmm1 = xmm1[0,0]
+; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-NEXT: retq
+;
+; AVX2-LABEL: insert_f64_two_elts_of_high_subvector:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vbroadcastsd %xmm1, %ymm1
+; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT: retq
%i0 = insertelement <4 x double> %x, double %s, i32 2
%i1 = insertelement <4 x double> %i0, double %s, i32 3
ret <4 x double> %i1
@@ -354,10 +392,9 @@ define <16 x i16> @insert_i16_two_elts_of_high_subvector(<16 x i16> %x, i16 %s)
;
; AVX2-LABEL: insert_i16_two_elts_of_high_subvector:
; AVX2: # %bb.0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpinsrw $0, %edi, %xmm1, %xmm1
-; AVX2-NEXT: vpinsrw $1, %edi, %xmm1, %xmm1
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vmovd %edi, %xmm1
+; AVX2-NEXT: vpbroadcastw %xmm1, %ymm1
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7]
; AVX2-NEXT: retq
%i0 = insertelement <16 x i16> %x, i16 %s, i32 8
%i1 = insertelement <16 x i16> %i0, i16 %s, i32 9
@@ -375,10 +412,9 @@ define <8 x i32> @insert_i32_two_elts_of_high_subvector(<8 x i32> %x, i32 %s) {
;
; AVX2-LABEL: insert_i32_two_elts_of_high_subvector:
; AVX2: # %bb.0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpinsrd $0, %edi, %xmm1, %xmm1
-; AVX2-NEXT: vpinsrd $1, %edi, %xmm1, %xmm1
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vmovd %edi, %xmm1
+; AVX2-NEXT: vpbroadcastd %xmm1, %ymm1
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
; AVX2-NEXT: retq
%i0 = insertelement <8 x i32> %x, i32 %s, i32 4
%i1 = insertelement <8 x i32> %i0, i32 %s, i32 5
@@ -395,9 +431,9 @@ define <4 x i64> @insert_i64_two_elts_of_high_subvector(<4 x i64> %x, i64 %s) {
;
; AVX2-LABEL: insert_i64_two_elts_of_high_subvector:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpinsrq $0, %rdi, %xmm0, %xmm1
-; AVX2-NEXT: vpinsrq $1, %rdi, %xmm1, %xmm1
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vmovq %rdi, %xmm1
+; AVX2-NEXT: vpbroadcastq %xmm1, %ymm1
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: retq
%i0 = insertelement <4 x i64> %x, i64 %s, i32 2
%i1 = insertelement <4 x i64> %i0, i64 %s, i32 3
diff --git a/llvm/test/CodeGen/X86/avx2-masked-gather.ll b/llvm/test/CodeGen/X86/avx2-masked-gather.ll
index 9b3635fa1c9e..0eaa034bf32b 100644
--- a/llvm/test/CodeGen/X86/avx2-masked-gather.ll
+++ b/llvm/test/CodeGen/X86/avx2-masked-gather.ll
@@ -396,17 +396,15 @@ define <8 x i32> @masked_gather_v8i32(<8 x i32*>* %ptr, <8 x i1> %masks, <8 x i3
; NOGATHER-NEXT: je .LBB6_10
; NOGATHER-NEXT: # %bb.9: # %cond.load10
; NOGATHER-NEXT: vmovq %xmm0, %rcx
-; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm2
-; NOGATHER-NEXT: vpinsrd $0, (%rcx), %xmm2, %xmm2
-; NOGATHER-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; NOGATHER-NEXT: vbroadcastss (%rcx), %ymm2
+; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4],ymm1[5,6,7]
; NOGATHER-NEXT: .LBB6_10: # %else11
; NOGATHER-NEXT: testb $32, %al
; NOGATHER-NEXT: je .LBB6_12
; NOGATHER-NEXT: # %bb.11: # %cond.load13
; NOGATHER-NEXT: vpextrq $1, %xmm0, %rcx
-; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm2
-; NOGATHER-NEXT: vpinsrd $1, (%rcx), %xmm2, %xmm2
-; NOGATHER-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; NOGATHER-NEXT: vbroadcastss (%rcx), %ymm2
+; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7]
; NOGATHER-NEXT: .LBB6_12: # %else14
; NOGATHER-NEXT: vextractf128 $1, %ymm0, %xmm0
; NOGATHER-NEXT: testb $64, %al
@@ -419,16 +417,14 @@ define <8 x i32> @masked_gather_v8i32(<8 x i32*>* %ptr, <8 x i1> %masks, <8 x i3
; NOGATHER-NEXT: retq
; NOGATHER-NEXT: .LBB6_13: # %cond.load16
; NOGATHER-NEXT: vmovq %xmm0, %rcx
-; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm2
-; NOGATHER-NEXT: vpinsrd $2, (%rcx), %xmm2, %xmm2
-; NOGATHER-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; NOGATHER-NEXT: vbroadcastss (%rcx), %ymm2
+; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7]
; NOGATHER-NEXT: testb $-128, %al
; NOGATHER-NEXT: je .LBB6_16
; NOGATHER-NEXT: .LBB6_15: # %cond.load19
; NOGATHER-NEXT: vpextrq $1, %xmm0, %rax
-; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm0
-; NOGATHER-NEXT: vpinsrd $3, (%rax), %xmm0, %xmm0
-; NOGATHER-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
+; NOGATHER-NEXT: vbroadcastss (%rax), %ymm0
+; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm0[7]
; NOGATHER-NEXT: vmovaps %ymm1, %ymm0
; NOGATHER-NEXT: retq
entry:
@@ -503,18 +499,15 @@ define <8 x float> @masked_gather_v8float(<8 x float*>* %ptr, <8 x i1> %masks, <
; NOGATHER-NEXT: je .LBB7_10
; NOGATHER-NEXT: # %bb.9: # %cond.load10
; NOGATHER-NEXT: vmovq %xmm0, %rcx
-; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm2
-; NOGATHER-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; NOGATHER-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3]
-; NOGATHER-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; NOGATHER-NEXT: vbroadcastss (%rcx), %ymm2
+; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4],ymm1[5,6,7]
; NOGATHER-NEXT: .LBB7_10: # %else11
; NOGATHER-NEXT: testb $32, %al
; NOGATHER-NEXT: je .LBB7_12
; NOGATHER-NEXT: # %bb.11: # %cond.load13
; NOGATHER-NEXT: vpextrq $1, %xmm0, %rcx
-; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm2
-; NOGATHER-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3]
-; NOGATHER-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; NOGATHER-NEXT: vbroadcastss (%rcx), %ymm2
+; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7]
; NOGATHER-NEXT: .LBB7_12: # %else14
; NOGATHER-NEXT: vextractf128 $1, %ymm0, %xmm0
; NOGATHER-NEXT: testb $64, %al
@@ -527,16 +520,14 @@ define <8 x float> @masked_gather_v8float(<8 x float*>* %ptr, <8 x i1> %masks, <
; NOGATHER-NEXT: retq
; NOGATHER-NEXT: .LBB7_13: # %cond.load16
; NOGATHER-NEXT: vmovq %xmm0, %rcx
-; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm2
-; NOGATHER-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3]
-; NOGATHER-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; NOGATHER-NEXT: vbroadcastss (%rcx), %ymm2
+; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7]
; NOGATHER-NEXT: testb $-128, %al
; NOGATHER-NEXT: je .LBB7_16
; NOGATHER-NEXT: .LBB7_15: # %cond.load19
; NOGATHER-NEXT: vpextrq $1, %xmm0, %rax
-; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm0
-; NOGATHER-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
-; NOGATHER-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
+; NOGATHER-NEXT: vbroadcastss (%rax), %ymm0
+; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm0[7]
; NOGATHER-NEXT: vmovaps %ymm1, %ymm0
; NOGATHER-NEXT: retq
entry:
@@ -597,16 +588,14 @@ define <4 x i64> @masked_gather_v4i64(<4 x i64*>* %ptr, <4 x i1> %masks, <4 x i6
; NOGATHER-NEXT: retq
; NOGATHER-NEXT: .LBB8_5: # %cond.load4
; NOGATHER-NEXT: vmovq %xmm0, %rcx
-; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm2
-; NOGATHER-NEXT: vpinsrq $0, (%rcx), %xmm2, %xmm2
-; NOGATHER-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; NOGATHER-NEXT: vbroadcastsd (%rcx), %ymm2
+; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7]
; NOGATHER-NEXT: testb $8, %al
; NOGATHER-NEXT: je .LBB8_8
; NOGATHER-NEXT: .LBB8_7: # %cond.load7
; NOGATHER-NEXT: vpextrq $1, %xmm0, %rax
-; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm0
-; NOGATHER-NEXT: vpinsrq $1, (%rax), %xmm0, %xmm0
-; NOGATHER-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
+; NOGATHER-NEXT: vbroadcastsd (%rax), %ymm0
+; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm0[6,7]
; NOGATHER-NEXT: vmovaps %ymm1, %ymm0
; NOGATHER-NEXT: retq
entry:
@@ -667,16 +656,14 @@ define <4 x double> @masked_gather_v4double(<4 x double*>* %ptr, <4 x i1> %masks
; NOGATHER-NEXT: retq
; NOGATHER-NEXT: .LBB9_5: # %cond.load4
; NOGATHER-NEXT: vmovq %xmm0, %rcx
-; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm2
-; NOGATHER-NEXT: vmovlps {{.*#+}} xmm2 = mem[0,1],xmm2[2,3]
-; NOGATHER-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; NOGATHER-NEXT: vbroadcastsd (%rcx), %ymm2
+; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7]
; NOGATHER-NEXT: testb $8, %al
; NOGATHER-NEXT: je .LBB9_8
; NOGATHER-NEXT: .LBB9_7: # %cond.load7
; NOGATHER-NEXT: vpextrq $1, %xmm0, %rax
-; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm0
-; NOGATHER-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
-; NOGATHER-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
+; NOGATHER-NEXT: vbroadcastsd (%rax), %ymm0
+; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm0[6,7]
; NOGATHER-NEXT: vmovaps %ymm1, %ymm0
; NOGATHER-NEXT: retq
entry:
diff --git a/llvm/test/CodeGen/X86/avx512-insert-extract.ll b/llvm/test/CodeGen/X86/avx512-insert-extract.ll
index 7274d8335129..4ae0d273daae 100644
--- a/llvm/test/CodeGen/X86/avx512-insert-extract.ll
+++ b/llvm/test/CodeGen/X86/avx512-insert-extract.ll
@@ -8,9 +8,9 @@ define <16 x float> @test1(<16 x float> %x, float* %br, float %y) nounwind {
; CHECK: ## %bb.0:
; CHECK-NEXT: vinsertps {{.*#+}} xmm2 = xmm0[0],mem[0],xmm0[2,3]
; CHECK-NEXT: vinsertf32x4 $0, %xmm2, %zmm0, %zmm2
-; CHECK-NEXT: vextractf32x4 $3, %zmm0, %xmm0
-; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
-; CHECK-NEXT: vinsertf32x4 $3, %xmm0, %zmm2, %zmm0
+; CHECK-NEXT: vbroadcastss %xmm1, %zmm1
+; CHECK-NEXT: vmovaps {{.*#+}} zmm0 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,30,15]
+; CHECK-NEXT: vpermi2ps %zmm1, %zmm2, %zmm0
; CHECK-NEXT: retq
%rrr = load float, float* %br
%rrr2 = insertelement <16 x float> %x, float %rrr, i32 1
@@ -19,14 +19,23 @@ define <16 x float> @test1(<16 x float> %x, float* %br, float %y) nounwind {
}
define <8 x double> @test2(<8 x double> %x, double* %br, double %y) nounwind {
-; CHECK-LABEL: test2:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: vmovhps {{.*#+}} xmm2 = xmm0[0,1],mem[0,1]
-; CHECK-NEXT: vinsertf32x4 $0, %xmm2, %zmm0, %zmm2
-; CHECK-NEXT: vextractf32x4 $3, %zmm0, %xmm0
-; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; CHECK-NEXT: vinsertf32x4 $3, %xmm0, %zmm2, %zmm0
-; CHECK-NEXT: retq
+; KNL-LABEL: test2:
+; KNL: ## %bb.0:
+; KNL-NEXT: vmovhpd {{.*#+}} xmm2 = xmm0[0],mem[0]
+; KNL-NEXT: vinsertf32x4 $0, %xmm2, %zmm0, %zmm0
+; KNL-NEXT: movb $64, %al
+; KNL-NEXT: kmovw %eax, %k1
+; KNL-NEXT: vbroadcastsd %xmm1, %zmm0 {%k1}
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test2:
+; SKX: ## %bb.0:
+; SKX-NEXT: vmovhpd {{.*#+}} xmm2 = xmm0[0],mem[0]
+; SKX-NEXT: vinsertf32x4 $0, %xmm2, %zmm0, %zmm0
+; SKX-NEXT: movb $64, %al
+; SKX-NEXT: kmovd %eax, %k1
+; SKX-NEXT: vbroadcastsd %xmm1, %zmm0 {%k1}
+; SKX-NEXT: retq
%rrr = load double, double* %br
%rrr2 = insertelement <8 x double> %x, double %rrr, i32 1
%rrr3 = insertelement <8 x double> %rrr2, double %y, i32 6
@@ -535,14 +544,23 @@ define i8 @extract_v16i8(<16 x i8> %x, i8* %dst) {
}
define <8 x i64> @insert_v8i64(<8 x i64> %x, i64 %y , i64* %ptr) {
-; CHECK-LABEL: insert_v8i64:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: vpinsrq $1, (%rsi), %xmm0, %xmm1
-; CHECK-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm1
-; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
-; CHECK-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm0
-; CHECK-NEXT: vinserti32x4 $1, %xmm0, %zmm1, %zmm0
-; CHECK-NEXT: retq
+; KNL-LABEL: insert_v8i64:
+; KNL: ## %bb.0:
+; KNL-NEXT: vpinsrq $1, (%rsi), %xmm0, %xmm1
+; KNL-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0
+; KNL-NEXT: movb $8, %al
+; KNL-NEXT: kmovw %eax, %k1
+; KNL-NEXT: vpbroadcastq %rdi, %zmm0 {%k1}
+; KNL-NEXT: retq
+;
+; SKX-LABEL: insert_v8i64:
+; SKX: ## %bb.0:
+; SKX-NEXT: vpinsrq $1, (%rsi), %xmm0, %xmm1
+; SKX-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0
+; SKX-NEXT: movb $8, %al
+; SKX-NEXT: kmovd %eax, %k1
+; SKX-NEXT: vpbroadcastq %rdi, %zmm0 {%k1}
+; SKX-NEXT: retq
%val = load i64, i64* %ptr
%r1 = insertelement <8 x i64> %x, i64 %val, i32 1
%r2 = insertelement <8 x i64> %r1, i64 %y, i32 3
@@ -550,13 +568,22 @@ define <8 x i64> @insert_v8i64(<8 x i64> %x, i64 %y , i64* %ptr) {
}
define <4 x i64> @insert_v4i64(<4 x i64> %x, i64 %y , i64* %ptr) {
-; CHECK-LABEL: insert_v4i64:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: vpinsrq $1, (%rsi), %xmm0, %xmm1
-; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
-; CHECK-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm0
-; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
+; KNL-LABEL: insert_v4i64:
+; KNL: ## %bb.0:
+; KNL-NEXT: vpinsrq $1, (%rsi), %xmm0, %xmm1
+; KNL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; KNL-NEXT: vmovq %rdi, %xmm1
+; KNL-NEXT: vpbroadcastq %xmm1, %ymm1
+; KNL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; KNL-NEXT: retq
+;
+; SKX-LABEL: insert_v4i64:
+; SKX: ## %bb.0:
+; SKX-NEXT: vpinsrq $1, (%rsi), %xmm0, %xmm1
+; SKX-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; SKX-NEXT: vpbroadcastq %rdi, %ymm1
+; SKX-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; SKX-NEXT: retq
%val = load i64, i64* %ptr
%r1 = insertelement <4 x i64> %x, i64 %val, i32 1
%r2 = insertelement <4 x i64> %r1, i64 %y, i32 3
@@ -576,14 +603,23 @@ define <2 x i64> @insert_v2i64(<2 x i64> %x, i64 %y , i64* %ptr) {
}
define <16 x i32> @insert_v16i32(<16 x i32> %x, i32 %y, i32* %ptr) {
-; CHECK-LABEL: insert_v16i32:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: vpinsrd $1, (%rsi), %xmm0, %xmm1
-; CHECK-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm1
-; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
-; CHECK-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0
-; CHECK-NEXT: vinserti32x4 $1, %xmm0, %zmm1, %zmm0
-; CHECK-NEXT: retq
+; KNL-LABEL: insert_v16i32:
+; KNL: ## %bb.0:
+; KNL-NEXT: vpinsrd $1, (%rsi), %xmm0, %xmm1
+; KNL-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0
+; KNL-NEXT: movw $32, %ax
+; KNL-NEXT: kmovw %eax, %k1
+; KNL-NEXT: vpbroadcastd %edi, %zmm0 {%k1}
+; KNL-NEXT: retq
+;
+; SKX-LABEL: insert_v16i32:
+; SKX: ## %bb.0:
+; SKX-NEXT: vpinsrd $1, (%rsi), %xmm0, %xmm1
+; SKX-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0
+; SKX-NEXT: movw $32, %ax
+; SKX-NEXT: kmovd %eax, %k1
+; SKX-NEXT: vpbroadcastd %edi, %zmm0 {%k1}
+; SKX-NEXT: retq
%val = load i32, i32* %ptr
%r1 = insertelement <16 x i32> %x, i32 %val, i32 1
%r2 = insertelement <16 x i32> %r1, i32 %y, i32 5
@@ -591,13 +627,22 @@ define <16 x i32> @insert_v16i32(<16 x i32> %x, i32 %y, i32* %ptr) {
}
define <8 x i32> @insert_v8i32(<8 x i32> %x, i32 %y, i32* %ptr) {
-; CHECK-LABEL: insert_v8i32:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: vpinsrd $1, (%rsi), %xmm0, %xmm1
-; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
-; CHECK-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0
-; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
+; KNL-LABEL: insert_v8i32:
+; KNL: ## %bb.0:
+; KNL-NEXT: vpinsrd $1, (%rsi), %xmm0, %xmm1
+; KNL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; KNL-NEXT: vmovd %edi, %xmm1
+; KNL-NEXT: vpbroadcastd %xmm1, %ymm1
+; KNL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
+; KNL-NEXT: retq
+;
+; SKX-LABEL: insert_v8i32:
+; SKX: ## %bb.0:
+; SKX-NEXT: vpinsrd $1, (%rsi), %xmm0, %xmm1
+; SKX-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; SKX-NEXT: vpbroadcastd %edi, %ymm1
+; SKX-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
+; SKX-NEXT: retq
%val = load i32, i32* %ptr
%r1 = insertelement <8 x i32> %x, i32 %val, i32 1
%r2 = insertelement <8 x i32> %r1, i32 %y, i32 5
@@ -617,14 +662,24 @@ define <4 x i32> @insert_v4i32(<4 x i32> %x, i32 %y, i32* %ptr) {
}
define <32 x i16> @insert_v32i16(<32 x i16> %x, i16 %y, i16* %ptr) {
-; CHECK-LABEL: insert_v32i16:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: vpinsrw $1, (%rsi), %xmm0, %xmm1
-; CHECK-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm1
-; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
-; CHECK-NEXT: vpinsrw $1, %edi, %xmm0, %xmm0
-; CHECK-NEXT: vinserti32x4 $1, %xmm0, %zmm1, %zmm0
-; CHECK-NEXT: retq
+; KNL-LABEL: insert_v32i16:
+; KNL: ## %bb.0:
+; KNL-NEXT: vpinsrw $1, (%rsi), %xmm0, %xmm1
+; KNL-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm1
+; KNL-NEXT: vmovd %edi, %xmm0
+; KNL-NEXT: vpbroadcastw %xmm0, %ymm0
+; KNL-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
+; KNL-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: insert_v32i16:
+; SKX: ## %bb.0:
+; SKX-NEXT: vpinsrw $1, (%rsi), %xmm0, %xmm1
+; SKX-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0
+; SKX-NEXT: movl $512, %eax ## imm = 0x200
+; SKX-NEXT: kmovd %eax, %k1
+; SKX-NEXT: vpbroadcastw %edi, %zmm0 {%k1}
+; SKX-NEXT: retq
%val = load i16, i16* %ptr
%r1 = insertelement <32 x i16> %x, i16 %val, i32 1
%r2 = insertelement <32 x i16> %r1, i16 %y, i32 9
@@ -632,13 +687,24 @@ define <32 x i16> @insert_v32i16(<32 x i16> %x, i16 %y, i16* %ptr) {
}
define <16 x i16> @insert_v16i16(<16 x i16> %x, i16 %y, i16* %ptr) {
-; CHECK-LABEL: insert_v16i16:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: vpinsrw $1, (%rsi), %xmm0, %xmm1
-; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
-; CHECK-NEXT: vpinsrw $1, %edi, %xmm0, %xmm0
-; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
+; KNL-LABEL: insert_v16i16:
+; KNL: ## %bb.0:
+; KNL-NEXT: vpinsrw $1, (%rsi), %xmm0, %xmm1
+; KNL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; KNL-NEXT: vmovd %edi, %xmm1
+; KNL-NEXT: vpbroadcastw %xmm1, %ymm1
+; KNL-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7,8],ymm1[9],ymm0[10,11,12,13,14,15]
+; KNL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; KNL-NEXT: retq
+;
+; SKX-LABEL: insert_v16i16:
+; SKX: ## %bb.0:
+; SKX-NEXT: vpinsrw $1, (%rsi), %xmm0, %xmm1
+; SKX-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; SKX-NEXT: vpbroadcastw %edi, %ymm1
+; SKX-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7,8],ymm1[9],ymm0[10,11,12,13,14,15]
+; SKX-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; SKX-NEXT: retq
%val = load i16, i16* %ptr
%r1 = insertelement <16 x i16> %x, i16 %val, i32 1
%r2 = insertelement <16 x i16> %r1, i16 %y, i32 9
@@ -739,12 +805,20 @@ define <16 x float> @test_insert_128_v16f32(<16 x float> %x, float %y) {
}
define <16 x i16> @test_insert_128_v16i16(<16 x i16> %x, i16 %y) {
-; CHECK-LABEL: test_insert_128_v16i16:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1
-; CHECK-NEXT: vpinsrw $2, %edi, %xmm1, %xmm1
-; CHECK-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; CHECK-NEXT: retq
+; KNL-LABEL: test_insert_128_v16i16:
+; KNL: ## %bb.0:
+; KNL-NEXT: vmovd %edi, %xmm1
+; KNL-NEXT: vpbroadcastw %xmm1, %ymm1
+; KNL-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6,7,8,9],ymm1[10],ymm0[11,12,13,14,15]
+; KNL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test_insert_128_v16i16:
+; SKX: ## %bb.0:
+; SKX-NEXT: vpbroadcastw %edi, %ymm1
+; SKX-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6,7,8,9],ymm1[10],ymm0[11,12,13,14,15]
+; SKX-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; SKX-NEXT: retq
%r = insertelement <16 x i16> %x, i16 %y, i32 10
ret <16 x i16> %r
}
diff --git a/llvm/test/CodeGen/X86/avx512-masked-memop-64-32.ll b/llvm/test/CodeGen/X86/avx512-masked-memop-64-32.ll
index 35374c880b72..62c18b8b2638 100644
--- a/llvm/test/CodeGen/X86/avx512-masked-memop-64-32.ll
+++ b/llvm/test/CodeGen/X86/avx512-masked-memop-64-32.ll
@@ -72,12 +72,19 @@ define void @one_mask_bit_set5(<8 x double>* %addr, <8 x double> %val) {
define <8 x double> @load_one_mask_bit_set5(<8 x double>* %addr, <8 x double> %val) {
;
-; AVX512-LABEL: load_one_mask_bit_set5:
-; AVX512: ## %bb.0:
-; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm1
-; AVX512-NEXT: vmovhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
-; AVX512-NEXT: vinsertf32x4 $3, %xmm1, %zmm0, %zmm0
-; AVX512-NEXT: retq
+; AVX512F-LABEL: load_one_mask_bit_set5:
+; AVX512F: ## %bb.0:
+; AVX512F-NEXT: movb $-128, %al
+; AVX512F-NEXT: kmovw %eax, %k1
+; AVX512F-NEXT: vbroadcastsd 56(%rdi), %zmm0 {%k1}
+; AVX512F-NEXT: retq
+;
+; SKX-LABEL: load_one_mask_bit_set5:
+; SKX: ## %bb.0:
+; SKX-NEXT: movb $-128, %al
+; SKX-NEXT: kmovd %eax, %k1
+; SKX-NEXT: vbroadcastsd 56(%rdi), %zmm0 {%k1}
+; SKX-NEXT: retq
%res = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* %addr, i32 4, <8 x i1><i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true>, <8 x double> %val)
ret <8 x double> %res
}
diff --git a/llvm/test/CodeGen/X86/insertelement-shuffle.ll b/llvm/test/CodeGen/X86/insertelement-shuffle.ll
index 000466f598bb..57ab9344c4fd 100644
--- a/llvm/test/CodeGen/X86/insertelement-shuffle.ll
+++ b/llvm/test/CodeGen/X86/insertelement-shuffle.ll
@@ -30,19 +30,18 @@ define <8 x float> @insert_subvector_256(i16 %x0, i16 %x1, <8 x float> %v) nounw
define <8 x i64> @insert_subvector_512(i32 %x0, i32 %x1, <8 x i64> %v) nounwind {
; X86_AVX256-LABEL: insert_subvector_512:
; X86_AVX256: # %bb.0:
-; X86_AVX256-NEXT: vextracti128 $1, %ymm0, %xmm2
-; X86_AVX256-NEXT: vpinsrd $0, {{[0-9]+}}(%esp), %xmm2, %xmm2
-; X86_AVX256-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm2, %xmm2
-; X86_AVX256-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; X86_AVX256-NEXT: vbroadcastss {{[0-9]+}}(%esp), %ymm2
+; X86_AVX256-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4],ymm0[5,6,7]
+; X86_AVX256-NEXT: vbroadcastss {{[0-9]+}}(%esp), %ymm2
+; X86_AVX256-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7]
; X86_AVX256-NEXT: retl
;
; X64_AVX256-LABEL: insert_subvector_512:
; X64_AVX256: # %bb.0:
; X64_AVX256-NEXT: vmovd %edi, %xmm2
; X64_AVX256-NEXT: vpinsrd $1, %esi, %xmm2, %xmm2
-; X64_AVX256-NEXT: vextracti128 $1, %ymm0, %xmm3
-; X64_AVX256-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3]
-; X64_AVX256-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; X64_AVX256-NEXT: vpbroadcastq %xmm2, %ymm2
+; X64_AVX256-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7]
; X64_AVX256-NEXT: retq
;
; X86_AVX512-LABEL: insert_subvector_512:
diff --git a/llvm/test/CodeGen/X86/masked_expandload.ll b/llvm/test/CodeGen/X86/masked_expandload.ll
index dc6362d499a1..552b69748e86 100644
--- a/llvm/test/CodeGen/X86/masked_expandload.ll
+++ b/llvm/test/CodeGen/X86/masked_expandload.ll
@@ -216,16 +216,14 @@ define <4 x double> @expandload_v4f64_v4i64(double* %base, <4 x double> %src0, <
; AVX1-NEXT: testb $4, %al
; AVX1-NEXT: je LBB1_6
; AVX1-NEXT: LBB1_5: ## %cond.load5
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovlpd (%rdi), %xmm1, %xmm1 ## xmm1 = mem[0],xmm1[1]
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vbroadcastsd (%rdi), %ymm1
+; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3]
; AVX1-NEXT: addq $8, %rdi
; AVX1-NEXT: testb $8, %al
; AVX1-NEXT: je LBB1_8
; AVX1-NEXT: LBB1_7: ## %cond.load9
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovhps (%rdi), %xmm1, %xmm1 ## xmm1 = xmm1[0,1],mem[0,1]
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vbroadcastsd (%rdi), %ymm1
+; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3]
; AVX1-NEXT: retq
;
; AVX2-LABEL: expandload_v4f64_v4i64:
@@ -259,16 +257,14 @@ define <4 x double> @expandload_v4f64_v4i64(double* %base, <4 x double> %src0, <
; AVX2-NEXT: testb $4, %al
; AVX2-NEXT: je LBB1_6
; AVX2-NEXT: LBB1_5: ## %cond.load5
-; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vmovlpd (%rdi), %xmm1, %xmm1 ## xmm1 = mem[0],xmm1[1]
-; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vbroadcastsd (%rdi), %ymm1
+; AVX2-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3]
; AVX2-NEXT: addq $8, %rdi
; AVX2-NEXT: testb $8, %al
; AVX2-NEXT: je LBB1_8
; AVX2-NEXT: LBB1_7: ## %cond.load9
-; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vmovhpd (%rdi), %xmm1, %xmm1 ## xmm1 = xmm1[0],mem[0]
-; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vbroadcastsd (%rdi), %ymm1
+; AVX2-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3]
; AVX2-NEXT: retq
;
; AVX512F-LABEL: expandload_v4f64_v4i64:
@@ -405,16 +401,14 @@ define <8 x double> @expandload_v8f64_v8i1(double* %base, <8 x double> %src0, <8
; AVX1-NEXT: testb $4, %al
; AVX1-NEXT: je LBB2_6
; AVX1-NEXT: LBB2_5: ## %cond.load5
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vmovlps (%rdi), %xmm2, %xmm2 ## xmm2 = mem[0,1],xmm2[2,3]
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vbroadcastsd (%rdi), %ymm2
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7]
; AVX1-NEXT: addq $8, %rdi
; AVX1-NEXT: testb $8, %al
; AVX1-NEXT: je LBB2_8
; AVX1-NEXT: LBB2_7: ## %cond.load9
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vmovhps (%rdi), %xmm2, %xmm2 ## xmm2 = xmm2[0,1],mem[0,1]
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vbroadcastsd (%rdi), %ymm2
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
; AVX1-NEXT: addq $8, %rdi
; AVX1-NEXT: testb $16, %al
; AVX1-NEXT: je LBB2_10
@@ -431,16 +425,14 @@ define <8 x double> @expandload_v8f64_v8i1(double* %base, <8 x double> %src0, <8
; AVX1-NEXT: testb $64, %al
; AVX1-NEXT: je LBB2_14
; AVX1-NEXT: LBB2_13: ## %cond.load21
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vmovlps (%rdi), %xmm2, %xmm2 ## xmm2 = mem[0,1],xmm2[2,3]
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT: vbroadcastsd (%rdi), %ymm2
+; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7]
; AVX1-NEXT: addq $8, %rdi
; AVX1-NEXT: testb $-128, %al
; AVX1-NEXT: je LBB2_16
; AVX1-NEXT: LBB2_15: ## %cond.load25
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vmovhps (%rdi), %xmm2, %xmm2 ## xmm2 = xmm2[0,1],mem[0,1]
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT: vbroadcastsd (%rdi), %ymm2
+; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: expandload_v8f64_v8i1:
@@ -486,16 +478,14 @@ define <8 x double> @expandload_v8f64_v8i1(double* %base, <8 x double> %src0, <8
; AVX2-NEXT: testb $4, %al
; AVX2-NEXT: je LBB2_6
; AVX2-NEXT: LBB2_5: ## %cond.load5
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX2-NEXT: vmovlps (%rdi), %xmm2, %xmm2 ## xmm2 = mem[0,1],xmm2[2,3]
-; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX2-NEXT: vpbroadcastq (%rdi), %ymm2
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7]
; AVX2-NEXT: addq $8, %rdi
; AVX2-NEXT: testb $8, %al
; AVX2-NEXT: je LBB2_8
; AVX2-NEXT: LBB2_7: ## %cond.load9
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX2-NEXT: vmovhps (%rdi), %xmm2, %xmm2 ## xmm2 = xmm2[0,1],mem[0,1]
-; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX2-NEXT: vpbroadcastq (%rdi), %ymm2
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
; AVX2-NEXT: addq $8, %rdi
; AVX2-NEXT: testb $16, %al
; AVX2-NEXT: je LBB2_10
@@ -512,16 +502,14 @@ define <8 x double> @expandload_v8f64_v8i1(double* %base, <8 x double> %src0, <8
; AVX2-NEXT: testb $64, %al
; AVX2-NEXT: je LBB2_14
; AVX2-NEXT: LBB2_13: ## %cond.load21
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX2-NEXT: vmovlps (%rdi), %xmm2, %xmm2 ## xmm2 = mem[0,1],xmm2[2,3]
-; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX2-NEXT: vpbroadcastq (%rdi), %ymm2
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7]
; AVX2-NEXT: addq $8, %rdi
; AVX2-NEXT: testb $-128, %al
; AVX2-NEXT: je LBB2_16
; AVX2-NEXT: LBB2_15: ## %cond.load25
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX2-NEXT: vmovhps (%rdi), %xmm2, %xmm2 ## xmm2 = xmm2[0,1],mem[0,1]
-; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX2-NEXT: vpbroadcastq (%rdi), %ymm2
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
; AVX2-NEXT: retq
;
; AVX512F-LABEL: expandload_v8f64_v8i1:
@@ -777,16 +765,14 @@ define <16 x double> @expandload_v16f64_v16i32(double* %base, <16 x double> %src
; AVX1-NEXT: testb $4, %al
; AVX1-NEXT: je LBB3_6
; AVX1-NEXT: LBB3_5: ## %cond.load5
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
-; AVX1-NEXT: vmovlps (%rdi), %xmm4, %xmm4 ## xmm4 = mem[0,1],xmm4[2,3]
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX1-NEXT: vbroadcastsd (%rdi), %ymm4
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5],ymm0[6,7]
; AVX1-NEXT: addq $8, %rdi
; AVX1-NEXT: testb $8, %al
; AVX1-NEXT: je LBB3_8
; AVX1-NEXT: LBB3_7: ## %cond.load9
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
-; AVX1-NEXT: vmovhps (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0,1],mem[0,1]
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX1-NEXT: vbroadcastsd (%rdi), %ymm4
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7]
; AVX1-NEXT: addq $8, %rdi
; AVX1-NEXT: testb $16, %al
; AVX1-NEXT: je LBB3_10
@@ -803,16 +789,14 @@ define <16 x double> @expandload_v16f64_v16i32(double* %base, <16 x double> %src
; AVX1-NEXT: testb $64, %al
; AVX1-NEXT: je LBB3_14
; AVX1-NEXT: LBB3_13: ## %cond.load21
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
-; AVX1-NEXT: vmovlps (%rdi), %xmm4, %xmm4 ## xmm4 = mem[0,1],xmm4[2,3]
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
+; AVX1-NEXT: vbroadcastsd (%rdi), %ymm4
+; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5],ymm1[6,7]
; AVX1-NEXT: addq $8, %rdi
; AVX1-NEXT: testb $-128, %al
; AVX1-NEXT: je LBB3_16
; AVX1-NEXT: LBB3_15: ## %cond.load25
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
-; AVX1-NEXT: vmovhps (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0,1],mem[0,1]
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
+; AVX1-NEXT: vbroadcastsd (%rdi), %ymm4
+; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7]
; AVX1-NEXT: addq $8, %rdi
; AVX1-NEXT: testl $256, %eax ## imm = 0x100
; AVX1-NEXT: je LBB3_18
@@ -829,16 +813,14 @@ define <16 x double> @expandload_v16f64_v16i32(double* %base, <16 x double> %src
; AVX1-NEXT: testl $1024, %eax ## imm = 0x400
; AVX1-NEXT: je LBB3_22
; AVX1-NEXT: LBB3_21: ## %cond.load37
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
-; AVX1-NEXT: vmovlps (%rdi), %xmm4, %xmm4 ## xmm4 = mem[0,1],xmm4[2,3]
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; AVX1-NEXT: vbroadcastsd (%rdi), %ymm4
+; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5],ymm2[6,7]
; AVX1-NEXT: addq $8, %rdi
; AVX1-NEXT: testl $2048, %eax ## imm = 0x800
; AVX1-NEXT: je LBB3_24
; AVX1-NEXT: LBB3_23: ## %cond.load41
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
-; AVX1-NEXT: vmovhps (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0,1],mem[0,1]
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; AVX1-NEXT: vbroadcastsd (%rdi), %ymm4
+; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7]
; AVX1-NEXT: addq $8, %rdi
; AVX1-NEXT: testl $4096, %eax ## imm = 0x1000
; AVX1-NEXT: je LBB3_26
@@ -855,16 +837,14 @@ define <16 x double> @expandload_v16f64_v16i32(double* %base, <16 x double> %src
; AVX1-NEXT: testl $16384, %eax ## imm = 0x4000
; AVX1-NEXT: je LBB3_30
; AVX1-NEXT: LBB3_29: ## %cond.load53
-; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
-; AVX1-NEXT: vmovlps (%rdi), %xmm4, %xmm4 ## xmm4 = mem[0,1],xmm4[2,3]
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT: vbroadcastsd (%rdi), %ymm4
+; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5],ymm3[6,7]
; AVX1-NEXT: addq $8, %rdi
; AVX1-NEXT: testl $32768, %eax ## imm = 0x8000
; AVX1-NEXT: je LBB3_32
; AVX1-NEXT: LBB3_31: ## %cond.load57
-; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
-; AVX1-NEXT: vmovhps (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0,1],mem[0,1]
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT: vbroadcastsd (%rdi), %ymm4
+; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: expandload_v16f64_v16i32:
@@ -939,16 +919,14 @@ define <16 x double> @expandload_v16f64_v16i32(double* %base, <16 x double> %src
; AVX2-NEXT: testb $4, %al
; AVX2-NEXT: je LBB3_6
; AVX2-NEXT: LBB3_5: ## %cond.load5
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4
-; AVX2-NEXT: vmovlps (%rdi), %xmm4, %xmm4 ## xmm4 = mem[0,1],xmm4[2,3]
-; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0
+; AVX2-NEXT: vpbroadcastq (%rdi), %ymm4
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5],ymm0[6,7]
; AVX2-NEXT: addq $8, %rdi
; AVX2-NEXT: testb $8, %al
; AVX2-NEXT: je LBB3_8
; AVX2-NEXT: LBB3_7: ## %cond.load9
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4
-; AVX2-NEXT: vmovhps (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0,1],mem[0,1]
-; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0
+; AVX2-NEXT: vpbroadcastq (%rdi), %ymm4
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7]
; AVX2-NEXT: addq $8, %rdi
; AVX2-NEXT: testb $16, %al
; AVX2-NEXT: je LBB3_10
@@ -965,16 +943,14 @@ define <16 x double> @expandload_v16f64_v16i32(double* %base, <16 x double> %src
; AVX2-NEXT: testb $64, %al
; AVX2-NEXT: je LBB3_14
; AVX2-NEXT: LBB3_13: ## %cond.load21
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm4
-; AVX2-NEXT: vmovlps (%rdi), %xmm4, %xmm4 ## xmm4 = mem[0,1],xmm4[2,3]
-; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1
+; AVX2-NEXT: vpbroadcastq (%rdi), %ymm4
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5],ymm1[6,7]
; AVX2-NEXT: addq $8, %rdi
; AVX2-NEXT: testb $-128, %al
; AVX2-NEXT: je LBB3_16
; AVX2-NEXT: LBB3_15: ## %cond.load25
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm4
-; AVX2-NEXT: vmovhps (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0,1],mem[0,1]
-; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1
+; AVX2-NEXT: vpbroadcastq (%rdi), %ymm4
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7]
; AVX2-NEXT: addq $8, %rdi
; AVX2-NEXT: testl $256, %eax ## imm = 0x100
; AVX2-NEXT: je LBB3_18
@@ -991,16 +967,14 @@ define <16 x double> @expandload_v16f64_v16i32(double* %base, <16 x double> %src
; AVX2-NEXT: testl $1024, %eax ## imm = 0x400
; AVX2-NEXT: je LBB3_22
; AVX2-NEXT: LBB3_21: ## %cond.load37
-; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm4
-; AVX2-NEXT: vmovlps (%rdi), %xmm4, %xmm4 ## xmm4 = mem[0,1],xmm4[2,3]
-; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2
+; AVX2-NEXT: vpbroadcastq (%rdi), %ymm4
+; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5],ymm2[6,7]
; AVX2-NEXT: addq $8, %rdi
; AVX2-NEXT: testl $2048, %eax ## imm = 0x800
; AVX2-NEXT: je LBB3_24
; AVX2-NEXT: LBB3_23: ## %cond.load41
-; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm4
-; AVX2-NEXT: vmovhps (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0,1],mem[0,1]
-; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2
+; AVX2-NEXT: vpbroadcastq (%rdi), %ymm4
+; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7]
; AVX2-NEXT: addq $8, %rdi
; AVX2-NEXT: testl $4096, %eax ## imm = 0x1000
; AVX2-NEXT: je LBB3_26
@@ -1017,16 +991,14 @@ define <16 x double> @expandload_v16f64_v16i32(double* %base, <16 x double> %src
; AVX2-NEXT: testl $16384, %eax ## imm = 0x4000
; AVX2-NEXT: je LBB3_30
; AVX2-NEXT: LBB3_29: ## %cond.load53
-; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
-; AVX2-NEXT: vmovlps (%rdi), %xmm4, %xmm4 ## xmm4 = mem[0,1],xmm4[2,3]
-; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
+; AVX2-NEXT: vpbroadcastq (%rdi), %ymm4
+; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5],ymm3[6,7]
; AVX2-NEXT: addq $8, %rdi
; AVX2-NEXT: testl $32768, %eax ## imm = 0x8000
; AVX2-NEXT: je LBB3_32
; AVX2-NEXT: LBB3_31: ## %cond.load57
-; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
-; AVX2-NEXT: vmovhps (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0,1],mem[0,1]
-; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
+; AVX2-NEXT: vpbroadcastq (%rdi), %ymm4
+; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7]
; AVX2-NEXT: retq
;
; AVX512F-LABEL: expandload_v16f64_v16i32:
@@ -2193,31 +2165,26 @@ define <32 x float> @expandload_v32f32_v32i32(float* %base, <32 x float> %src0,
; AVX1-NEXT: testb $16, %al
; AVX1-NEXT: je LBB8_10
; AVX1-NEXT: LBB8_9: ## %cond.load13
-; AVX1-NEXT: vmovd (%rdi), %xmm4 ## xmm4 = mem[0],zero,zero,zero
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
-; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3,4,5,6,7]
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX1-NEXT: vbroadcastss (%rdi), %ymm4
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4],ymm0[5,6,7]
; AVX1-NEXT: addq $4, %rdi
; AVX1-NEXT: testb $32, %al
; AVX1-NEXT: je LBB8_12
; AVX1-NEXT: LBB8_11: ## %cond.load17
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
-; AVX1-NEXT: vinsertps $16, (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0],mem[0],xmm4[2,3]
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX1-NEXT: vbroadcastss (%rdi), %ymm4
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6,7]
; AVX1-NEXT: addq $4, %rdi
; AVX1-NEXT: testb $64, %al
; AVX1-NEXT: je LBB8_14
; AVX1-NEXT: LBB8_13: ## %cond.load21
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
-; AVX1-NEXT: vinsertps $32, (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0,1],mem[0],xmm4[3]
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX1-NEXT: vbroadcastss (%rdi), %ymm4
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6],ymm0[7]
; AVX1-NEXT: addq $4, %rdi
; AVX1-NEXT: testb $-128, %al
; AVX1-NEXT: je LBB8_16
; AVX1-NEXT: LBB8_15: ## %cond.load25
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
-; AVX1-NEXT: vinsertps $48, (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0,1,2],mem[0]
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX1-NEXT: vbroadcastss (%rdi), %ymm4
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm4[7]
; AVX1-NEXT: addq $4, %rdi
; AVX1-NEXT: testl $256, %eax ## imm = 0x100
; AVX1-NEXT: je LBB8_18
@@ -2246,31 +2213,26 @@ define <32 x float> @expandload_v32f32_v32i32(float* %base, <32 x float> %src0,
; AVX1-NEXT: testl $4096, %eax ## imm = 0x1000
; AVX1-NEXT: je LBB8_26
; AVX1-NEXT: LBB8_25: ## %cond.load45
-; AVX1-NEXT: vmovd (%rdi), %xmm4 ## xmm4 = mem[0],zero,zero,zero
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
-; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3,4,5,6,7]
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
+; AVX1-NEXT: vbroadcastss (%rdi), %ymm4
+; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4],ymm1[5,6,7]
; AVX1-NEXT: addq $4, %rdi
; AVX1-NEXT: testl $8192, %eax ## imm = 0x2000
; AVX1-NEXT: je LBB8_28
; AVX1-NEXT: LBB8_27: ## %cond.load49
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
-; AVX1-NEXT: vinsertps $16, (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0],mem[0],xmm4[2,3]
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
+; AVX1-NEXT: vbroadcastss (%rdi), %ymm4
+; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm4[5],ymm1[6,7]
; AVX1-NEXT: addq $4, %rdi
; AVX1-NEXT: testl $16384, %eax ## imm = 0x4000
; AVX1-NEXT: je LBB8_30
; AVX1-NEXT: LBB8_29: ## %cond.load53
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
-; AVX1-NEXT: vinsertps $32, (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0,1],mem[0],xmm4[3]
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
+; AVX1-NEXT: vbroadcastss (%rdi), %ymm4
+; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6],ymm1[7]
; AVX1-NEXT: addq $4, %rdi
; AVX1-NEXT: testl $32768, %eax ## imm = 0x8000
; AVX1-NEXT: je LBB8_32
; AVX1-NEXT: LBB8_31: ## %cond.load57
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
-; AVX1-NEXT: vinsertps $48, (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0,1,2],mem[0]
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
+; AVX1-NEXT: vbroadcastss (%rdi), %ymm4
+; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm4[7]
; AVX1-NEXT: addq $4, %rdi
; AVX1-NEXT: testl $65536, %eax ## imm = 0x10000
; AVX1-NEXT: je LBB8_34
@@ -2299,31 +2261,26 @@ define <32 x float> @expandload_v32f32_v32i32(float* %base, <32 x float> %src0,
; AVX1-NEXT: testl $1048576, %eax ## imm = 0x100000
; AVX1-NEXT: je LBB8_42
; AVX1-NEXT: LBB8_41: ## %cond.load77
-; AVX1-NEXT: vmovd (%rdi), %xmm4 ## xmm4 = mem[0],zero,zero,zero
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
-; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3,4,5,6,7]
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; AVX1-NEXT: vbroadcastss (%rdi), %ymm4
+; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4],ymm2[5,6,7]
; AVX1-NEXT: addq $4, %rdi
; AVX1-NEXT: testl $2097152, %eax ## imm = 0x200000
; AVX1-NEXT: je LBB8_44
; AVX1-NEXT: LBB8_43: ## %cond.load81
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
-; AVX1-NEXT: vinsertps $16, (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0],mem[0],xmm4[2,3]
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; AVX1-NEXT: vbroadcastss (%rdi), %ymm4
+; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5],ymm2[6,7]
; AVX1-NEXT: addq $4, %rdi
; AVX1-NEXT: testl $4194304, %eax ## imm = 0x400000
; AVX1-NEXT: je LBB8_46
; AVX1-NEXT: LBB8_45: ## %cond.load85
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
-; AVX1-NEXT: vinsertps $32, (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0,1],mem[0],xmm4[3]
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; AVX1-NEXT: vbroadcastss (%rdi), %ymm4
+; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6],ymm2[7]
; AVX1-NEXT: addq $4, %rdi
; AVX1-NEXT: testl $8388608, %eax ## imm = 0x800000
; AVX1-NEXT: je LBB8_48
; AVX1-NEXT: LBB8_47: ## %cond.load89
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
-; AVX1-NEXT: vinsertps $48, (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0,1,2],mem[0]
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; AVX1-NEXT: vbroadcastss (%rdi), %ymm4
+; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm4[7]
; AVX1-NEXT: addq $4, %rdi
; AVX1-NEXT: testl $16777216, %eax ## imm = 0x1000000
; AVX1-NEXT: je LBB8_50
@@ -2352,31 +2309,26 @@ define <32 x float> @expandload_v32f32_v32i32(float* %base, <32 x float> %src0,
; AVX1-NEXT: testl $268435456, %eax ## imm = 0x10000000
; AVX1-NEXT: je LBB8_58
; AVX1-NEXT: LBB8_57: ## %cond.load109
-; AVX1-NEXT: vmovd (%rdi), %xmm4 ## xmm4 = mem[0],zero,zero,zero
-; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5
-; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3,4,5,6,7]
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT: vbroadcastss (%rdi), %ymm4
+; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4],ymm3[5,6,7]
; AVX1-NEXT: addq $4, %rdi
; AVX1-NEXT: testl $536870912, %eax ## imm = 0x20000000
; AVX1-NEXT: je LBB8_60
; AVX1-NEXT: LBB8_59: ## %cond.load113
-; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
-; AVX1-NEXT: vinsertps $16, (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0],mem[0],xmm4[2,3]
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT: vbroadcastss (%rdi), %ymm4
+; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6,7]
; AVX1-NEXT: addq $4, %rdi
; AVX1-NEXT: testl $1073741824, %eax ## imm = 0x40000000
; AVX1-NEXT: je LBB8_62
; AVX1-NEXT: LBB8_61: ## %cond.load117
-; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
-; AVX1-NEXT: vinsertps $32, (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0,1],mem[0],xmm4[3]
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT: vbroadcastss (%rdi), %ymm4
+; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6],ymm3[7]
; AVX1-NEXT: addq $4, %rdi
; AVX1-NEXT: testl $-2147483648, %eax ## imm = 0x80000000
; AVX1-NEXT: je LBB8_64
; AVX1-NEXT: LBB8_63: ## %cond.load121
-; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
-; AVX1-NEXT: vinsertps $48, (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0,1,2],mem[0]
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT: vbroadcastss (%rdi), %ymm4
+; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: expandload_v32f32_v32i32:
@@ -2515,31 +2467,26 @@ define <32 x float> @expandload_v32f32_v32i32(float* %base, <32 x float> %src0,
; AVX2-NEXT: testb $16, %al
; AVX2-NEXT: je LBB8_10
; AVX2-NEXT: LBB8_9: ## %cond.load13
-; AVX2-NEXT: vmovss (%rdi), %xmm4 ## xmm4 = mem[0],zero,zero,zero
-; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm5
-; AVX2-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0],xmm5[1,2,3]
-; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX2-NEXT: vbroadcastss (%rdi), %ymm4
+; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4],ymm0[5,6,7]
; AVX2-NEXT: addq $4, %rdi
; AVX2-NEXT: testb $32, %al
; AVX2-NEXT: je LBB8_12
; AVX2-NEXT: LBB8_11: ## %cond.load17
-; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm4
-; AVX2-NEXT: vinsertps $16, (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0],mem[0],xmm4[2,3]
-; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX2-NEXT: vbroadcastss (%rdi), %ymm4
+; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6,7]
; AVX2-NEXT: addq $4, %rdi
; AVX2-NEXT: testb $64, %al
; AVX2-NEXT: je LBB8_14
; AVX2-NEXT: LBB8_13: ## %cond.load21
-; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm4
-; AVX2-NEXT: vinsertps $32, (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0,1],mem[0],xmm4[3]
-; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX2-NEXT: vbroadcastss (%rdi), %ymm4
+; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6],ymm0[7]
; AVX2-NEXT: addq $4, %rdi
; AVX2-NEXT: testb $-128, %al
; AVX2-NEXT: je LBB8_16
; AVX2-NEXT: LBB8_15: ## %cond.load25
-; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm4
-; AVX2-NEXT: vinsertps $48, (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0,1,2],mem[0]
-; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX2-NEXT: vbroadcastss (%rdi), %ymm4
+; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm4[7]
; AVX2-NEXT: addq $4, %rdi
; AVX2-NEXT: testl $256, %eax ## imm = 0x100
; AVX2-NEXT: je LBB8_18
@@ -2568,31 +2515,26 @@ define <32 x float> @expandload_v32f32_v32i32(float* %base, <32 x float> %src0,
; AVX2-NEXT: testl $4096, %eax ## imm = 0x1000
; AVX2-NEXT: je LBB8_26
; AVX2-NEXT: LBB8_25: ## %cond.load45
-; AVX2-NEXT: vmovss (%rdi), %xmm4 ## xmm4 = mem[0],zero,zero,zero
-; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm5
-; AVX2-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0],xmm5[1,2,3]
-; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
+; AVX2-NEXT: vbroadcastss (%rdi), %ymm4
+; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4],ymm1[5,6,7]
; AVX2-NEXT: addq $4, %rdi
; AVX2-NEXT: testl $8192, %eax ## imm = 0x2000
; AVX2-NEXT: je LBB8_28
; AVX2-NEXT: LBB8_27: ## %cond.load49
-; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm4
-; AVX2-NEXT: vinsertps $16, (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0],mem[0],xmm4[2,3]
-; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
+; AVX2-NEXT: vbroadcastss (%rdi), %ymm4
+; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm4[5],ymm1[6,7]
; AVX2-NEXT: addq $4, %rdi
; AVX2-NEXT: testl $16384, %eax ## imm = 0x4000
; AVX2-NEXT: je LBB8_30
; AVX2-NEXT: LBB8_29: ## %cond.load53
-; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm4
-; AVX2-NEXT: vinsertps $32, (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0,1],mem[0],xmm4[3]
-; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
+; AVX2-NEXT: vbroadcastss (%rdi), %ymm4
+; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6],ymm1[7]
; AVX2-NEXT: addq $4, %rdi
; AVX2-NEXT: testl $32768, %eax ## imm = 0x8000
; AVX2-NEXT: je LBB8_32
; AVX2-NEXT: LBB8_31: ## %cond.load57
-; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm4
-; AVX2-NEXT: vinsertps $48, (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0,1,2],mem[0]
-; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
+; AVX2-NEXT: vbroadcastss (%rdi), %ymm4
+; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm4[7]
; AVX2-NEXT: addq $4, %rdi
; AVX2-NEXT: testl $65536, %eax ## imm = 0x10000
; AVX2-NEXT: je LBB8_34
@@ -2621,31 +2563,26 @@ define <32 x float> @expandload_v32f32_v32i32(float* %base, <32 x float> %src0,
; AVX2-NEXT: testl $1048576, %eax ## imm = 0x100000
; AVX2-NEXT: je LBB8_42
; AVX2-NEXT: LBB8_41: ## %cond.load77
-; AVX2-NEXT: vmovss (%rdi), %xmm4 ## xmm4 = mem[0],zero,zero,zero
-; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm5
-; AVX2-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0],xmm5[1,2,3]
-; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; AVX2-NEXT: vbroadcastss (%rdi), %ymm4
+; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4],ymm2[5,6,7]
; AVX2-NEXT: addq $4, %rdi
; AVX2-NEXT: testl $2097152, %eax ## imm = 0x200000
; AVX2-NEXT: je LBB8_44
; AVX2-NEXT: LBB8_43: ## %cond.load81
-; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm4
-; AVX2-NEXT: vinsertps $16, (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0],mem[0],xmm4[2,3]
-; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; AVX2-NEXT: vbroadcastss (%rdi), %ymm4
+; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5],ymm2[6,7]
; AVX2-NEXT: addq $4, %rdi
; AVX2-NEXT: testl $4194304, %eax ## imm = 0x400000
; AVX2-NEXT: je LBB8_46
; AVX2-NEXT: LBB8_45: ## %cond.load85
-; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm4
-; AVX2-NEXT: vinsertps $32, (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0,1],mem[0],xmm4[3]
-; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; AVX2-NEXT: vbroadcastss (%rdi), %ymm4
+; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6],ymm2[7]
; AVX2-NEXT: addq $4, %rdi
; AVX2-NEXT: testl $8388608, %eax ## imm = 0x800000
; AVX2-NEXT: je LBB8_48
; AVX2-NEXT: LBB8_47: ## %cond.load89
-; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm4
-; AVX2-NEXT: vinsertps $48, (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0,1,2],mem[0]
-; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; AVX2-NEXT: vbroadcastss (%rdi), %ymm4
+; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm4[7]
; AVX2-NEXT: addq $4, %rdi
; AVX2-NEXT: testl $16777216, %eax ## imm = 0x1000000
; AVX2-NEXT: je LBB8_50
@@ -2674,31 +2611,26 @@ define <32 x float> @expandload_v32f32_v32i32(float* %base, <32 x float> %src0,
; AVX2-NEXT: testl $268435456, %eax ## imm = 0x10000000
; AVX2-NEXT: je LBB8_58
; AVX2-NEXT: LBB8_57: ## %cond.load109
-; AVX2-NEXT: vmovss (%rdi), %xmm4 ## xmm4 = mem[0],zero,zero,zero
-; AVX2-NEXT: vextractf128 $1, %ymm3, %xmm5
-; AVX2-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0],xmm5[1,2,3]
-; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX2-NEXT: vbroadcastss (%rdi), %ymm4
+; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4],ymm3[5,6,7]
; AVX2-NEXT: addq $4, %rdi
; AVX2-NEXT: testl $536870912, %eax ## imm = 0x20000000
; AVX2-NEXT: je LBB8_60
; AVX2-NEXT: LBB8_59: ## %cond.load113
-; AVX2-NEXT: vextractf128 $1, %ymm3, %xmm4
-; AVX2-NEXT: vinsertps $16, (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0],mem[0],xmm4[2,3]
-; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX2-NEXT: vbroadcastss (%rdi), %ymm4
+; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6,7]
; AVX2-NEXT: addq $4, %rdi
; AVX2-NEXT: testl $1073741824, %eax ## imm = 0x40000000
; AVX2-NEXT: je LBB8_62
; AVX2-NEXT: LBB8_61: ## %cond.load117
-; AVX2-NEXT: vextractf128 $1, %ymm3, %xmm4
-; AVX2-NEXT: vinsertps $32, (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0,1],mem[0],xmm4[3]
-; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX2-NEXT: vbroadcastss (%rdi), %ymm4
+; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6],ymm3[7]
; AVX2-NEXT: addq $4, %rdi
; AVX2-NEXT: testl $-2147483648, %eax ## imm = 0x80000000
; AVX2-NEXT: je LBB8_64
; AVX2-NEXT: LBB8_63: ## %cond.load121
-; AVX2-NEXT: vextractf128 $1, %ymm3, %xmm4
-; AVX2-NEXT: vinsertps $48, (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0,1,2],mem[0]
-; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX2-NEXT: vbroadcastss (%rdi), %ymm4
+; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7]
; AVX2-NEXT: retq
;
; AVX512-LABEL: expandload_v32f32_v32i32:
diff --git a/llvm/test/CodeGen/X86/masked_gather.ll b/llvm/test/CodeGen/X86/masked_gather.ll
index a1ff2bf08782..2f00b80bb76b 100644
--- a/llvm/test/CodeGen/X86/masked_gather.ll
+++ b/llvm/test/CodeGen/X86/masked_gather.ll
@@ -1359,11 +1359,10 @@ define <8 x i32> @gather_v8i32_v8i32(<8 x i32> %trigger) {
; AVX1-NEXT: testb $-128, %al
; AVX1-NEXT: je .LBB4_16
; AVX1-NEXT: .LBB4_15: # %cond.load19
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT: vpinsrd $3, c+12(%rip), %xmm3, %xmm3
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX1-NEXT: vbroadcastss c+12(%rip), %ymm3
+; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7]
; AVX1-NEXT: .LBB4_16: # %else20
-; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vxorps %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm4
; AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm3
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
@@ -1393,11 +1392,10 @@ define <8 x i32> @gather_v8i32_v8i32(<8 x i32> %trigger) {
; AVX1-NEXT: testb $-128, %al
; AVX1-NEXT: je .LBB4_32
; AVX1-NEXT: .LBB4_31: # %cond.load58
-; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
-; AVX1-NEXT: vpinsrd $3, c+28(%rip), %xmm4, %xmm4
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT: vbroadcastss c+28(%rip), %ymm4
+; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7]
; AVX1-NEXT: .LBB4_32: # %else61
-; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX1-NEXT: vxorps %xmm4, %xmm4, %xmm4
; AVX1-NEXT: vpcmpeqd %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpcmpeqd %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
@@ -1418,9 +1416,8 @@ define <8 x i32> @gather_v8i32_v8i32(<8 x i32> %trigger) {
; AVX1-NEXT: testb $16, %al
; AVX1-NEXT: je .LBB4_42
; AVX1-NEXT: .LBB4_41: # %cond.load84
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vpinsrd $0, c+28(%rip), %xmm2, %xmm2
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vbroadcastss c+28(%rip), %ymm2
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4],ymm0[5,6,7]
; AVX1-NEXT: .LBB4_42: # %else87
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
@@ -1428,25 +1425,22 @@ define <8 x i32> @gather_v8i32_v8i32(<8 x i32> %trigger) {
; AVX1-NEXT: testb $32, %al
; AVX1-NEXT: je .LBB4_44
; AVX1-NEXT: # %bb.43: # %cond.load89
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT: vpinsrd $1, c+28(%rip), %xmm3, %xmm3
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
+; AVX1-NEXT: vbroadcastss c+28(%rip), %ymm3
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5],ymm0[6,7]
; AVX1-NEXT: .LBB4_44: # %else92
; AVX1-NEXT: vpaddd %xmm2, %xmm4, %xmm2
; AVX1-NEXT: testb $64, %al
; AVX1-NEXT: je .LBB4_46
; AVX1-NEXT: # %bb.45: # %cond.load94
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT: vpinsrd $2, c+28(%rip), %xmm3, %xmm3
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
+; AVX1-NEXT: vbroadcastss c+28(%rip), %ymm3
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6],ymm0[7]
; AVX1-NEXT: .LBB4_46: # %else97
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT: testb $-128, %al
; AVX1-NEXT: je .LBB4_48
; AVX1-NEXT: # %bb.47: # %cond.load99
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vpinsrd $3, c+28(%rip), %xmm2, %xmm2
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vbroadcastss c+28(%rip), %ymm2
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7]
; AVX1-NEXT: .LBB4_48: # %else102
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
@@ -1474,21 +1468,18 @@ define <8 x i32> @gather_v8i32_v8i32(<8 x i32> %trigger) {
; AVX1-NEXT: testb $16, %al
; AVX1-NEXT: je .LBB4_10
; AVX1-NEXT: .LBB4_9: # %cond.load10
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT: vpinsrd $0, c+12(%rip), %xmm3, %xmm3
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX1-NEXT: vbroadcastss c+12(%rip), %ymm3
+; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4],ymm1[5,6,7]
; AVX1-NEXT: testb $32, %al
; AVX1-NEXT: je .LBB4_12
; AVX1-NEXT: .LBB4_11: # %cond.load13
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT: vpinsrd $1, c+12(%rip), %xmm3, %xmm3
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX1-NEXT: vbroadcastss c+12(%rip), %ymm3
+; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5],ymm1[6,7]
; AVX1-NEXT: testb $64, %al
; AVX1-NEXT: je .LBB4_14
; AVX1-NEXT: .LBB4_13: # %cond.load16
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT: vpinsrd $2, c+12(%rip), %xmm3, %xmm3
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX1-NEXT: vbroadcastss c+12(%rip), %ymm3
+; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6],ymm1[7]
; AVX1-NEXT: testb $-128, %al
; AVX1-NEXT: jne .LBB4_15
; AVX1-NEXT: jmp .LBB4_16
@@ -1512,21 +1503,18 @@ define <8 x i32> @gather_v8i32_v8i32(<8 x i32> %trigger) {
; AVX1-NEXT: testb $16, %al
; AVX1-NEXT: je .LBB4_26
; AVX1-NEXT: .LBB4_25: # %cond.load43
-; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
-; AVX1-NEXT: vpinsrd $0, c+28(%rip), %xmm4, %xmm4
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT: vbroadcastss c+28(%rip), %ymm4
+; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4],ymm3[5,6,7]
; AVX1-NEXT: testb $32, %al
; AVX1-NEXT: je .LBB4_28
; AVX1-NEXT: .LBB4_27: # %cond.load48
-; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
-; AVX1-NEXT: vpinsrd $1, c+28(%rip), %xmm4, %xmm4
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT: vbroadcastss c+28(%rip), %ymm4
+; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6,7]
; AVX1-NEXT: testb $64, %al
; AVX1-NEXT: je .LBB4_30
; AVX1-NEXT: .LBB4_29: # %cond.load53
-; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
-; AVX1-NEXT: vpinsrd $2, c+28(%rip), %xmm4, %xmm4
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT: vbroadcastss c+28(%rip), %ymm4
+; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6],ymm3[7]
; AVX1-NEXT: testb $-128, %al
; AVX1-NEXT: jne .LBB4_31
; AVX1-NEXT: jmp .LBB4_32
@@ -1581,9 +1569,8 @@ define <8 x i32> @gather_v8i32_v8i32(<8 x i32> %trigger) {
; AVX2-NEXT: testb $-128, %al
; AVX2-NEXT: je .LBB4_16
; AVX2-NEXT: .LBB4_15: # %cond.load19
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX2-NEXT: vpinsrd $3, c+12(%rip), %xmm2, %xmm2
-; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX2-NEXT: vpbroadcastd c+12(%rip), %ymm2
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
; AVX2-NEXT: .LBB4_16: # %else20
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpcmpeqd %ymm2, %ymm0, %ymm2
@@ -1613,9 +1600,8 @@ define <8 x i32> @gather_v8i32_v8i32(<8 x i32> %trigger) {
; AVX2-NEXT: testb $-128, %al
; AVX2-NEXT: je .LBB4_32
; AVX2-NEXT: .LBB4_31: # %cond.load58
-; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
-; AVX2-NEXT: vpinsrd $3, c+28(%rip), %xmm3, %xmm3
-; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-NEXT: vpbroadcastd c+28(%rip), %ymm3
+; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
; AVX2-NEXT: .LBB4_32: # %else61
; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX2-NEXT: vpcmpeqd %ymm3, %ymm0, %ymm0
@@ -1642,17 +1628,15 @@ define <8 x i32> @gather_v8i32_v8i32(<8 x i32> %trigger) {
; AVX2-NEXT: testb $64, %al
; AVX2-NEXT: je .LBB4_46
; AVX2-NEXT: .LBB4_45: # %cond.load94
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
-; AVX2-NEXT: vpinsrd $2, c+28(%rip), %xmm3, %xmm3
-; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
+; AVX2-NEXT: vpbroadcastd c+28(%rip), %ymm3
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6],ymm0[7]
; AVX2-NEXT: .LBB4_46: # %else97
; AVX2-NEXT: vpaddd %ymm2, %ymm1, %ymm1
; AVX2-NEXT: testb $-128, %al
; AVX2-NEXT: je .LBB4_48
; AVX2-NEXT: # %bb.47: # %cond.load99
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX2-NEXT: vpinsrd $3, c+28(%rip), %xmm2, %xmm2
-; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX2-NEXT: vpbroadcastd c+28(%rip), %ymm2
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7]
; AVX2-NEXT: .LBB4_48: # %else102
; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
@@ -1676,21 +1660,18 @@ define <8 x i32> @gather_v8i32_v8i32(<8 x i32> %trigger) {
; AVX2-NEXT: testb $16, %al
; AVX2-NEXT: je .LBB4_10
; AVX2-NEXT: .LBB4_9: # %cond.load10
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX2-NEXT: vpinsrd $0, c+12(%rip), %xmm2, %xmm2
-; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX2-NEXT: vpbroadcastd c+12(%rip), %ymm2
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4],ymm1[5,6,7]
; AVX2-NEXT: testb $32, %al
; AVX2-NEXT: je .LBB4_12
; AVX2-NEXT: .LBB4_11: # %cond.load13
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX2-NEXT: vpinsrd $1, c+12(%rip), %xmm2, %xmm2
-; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX2-NEXT: vpbroadcastd c+12(%rip), %ymm2
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7]
; AVX2-NEXT: testb $64, %al
; AVX2-NEXT: je .LBB4_14
; AVX2-NEXT: .LBB4_13: # %cond.load16
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX2-NEXT: vpinsrd $2, c+12(%rip), %xmm2, %xmm2
-; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX2-NEXT: vpbroadcastd c+12(%rip), %ymm2
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7]
; AVX2-NEXT: testb $-128, %al
; AVX2-NEXT: jne .LBB4_15
; AVX2-NEXT: jmp .LBB4_16
@@ -1714,21 +1695,18 @@ define <8 x i32> @gather_v8i32_v8i32(<8 x i32> %trigger) {
; AVX2-NEXT: testb $16, %al
; AVX2-NEXT: je .LBB4_26
; AVX2-NEXT: .LBB4_25: # %cond.load43
-; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
-; AVX2-NEXT: vpinsrd $0, c+28(%rip), %xmm3, %xmm3
-; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-NEXT: vpbroadcastd c+28(%rip), %ymm3
+; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4],ymm2[5,6,7]
; AVX2-NEXT: testb $32, %al
; AVX2-NEXT: je .LBB4_28
; AVX2-NEXT: .LBB4_27: # %cond.load48
-; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
-; AVX2-NEXT: vpinsrd $1, c+28(%rip), %xmm3, %xmm3
-; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-NEXT: vpbroadcastd c+28(%rip), %ymm3
+; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7]
; AVX2-NEXT: testb $64, %al
; AVX2-NEXT: je .LBB4_30
; AVX2-NEXT: .LBB4_29: # %cond.load53
-; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
-; AVX2-NEXT: vpinsrd $2, c+28(%rip), %xmm3, %xmm3
-; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-NEXT: vpbroadcastd c+28(%rip), %ymm3
+; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6],ymm2[7]
; AVX2-NEXT: testb $-128, %al
; AVX2-NEXT: jne .LBB4_31
; AVX2-NEXT: jmp .LBB4_32
@@ -1752,15 +1730,13 @@ define <8 x i32> @gather_v8i32_v8i32(<8 x i32> %trigger) {
; AVX2-NEXT: testb $16, %al
; AVX2-NEXT: je .LBB4_42
; AVX2-NEXT: .LBB4_41: # %cond.load84
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
-; AVX2-NEXT: vpinsrd $0, c+28(%rip), %xmm3, %xmm3
-; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
+; AVX2-NEXT: vpbroadcastd c+28(%rip), %ymm3
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4],ymm0[5,6,7]
; AVX2-NEXT: testb $32, %al
; AVX2-NEXT: je .LBB4_44
; AVX2-NEXT: .LBB4_43: # %cond.load89
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
-; AVX2-NEXT: vpinsrd $1, c+28(%rip), %xmm3, %xmm3
-; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
+; AVX2-NEXT: vpbroadcastd c+28(%rip), %ymm3
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5],ymm0[6,7]
; AVX2-NEXT: testb $64, %al
; AVX2-NEXT: jne .LBB4_45
; AVX2-NEXT: jmp .LBB4_46
diff --git a/llvm/test/CodeGen/X86/masked_gather_scatter.ll b/llvm/test/CodeGen/X86/masked_gather_scatter.ll
index 771850f1f2ec..2a961299d177 100644
--- a/llvm/test/CodeGen/X86/masked_gather_scatter.ll
+++ b/llvm/test/CodeGen/X86/masked_gather_scatter.ll
@@ -965,16 +965,14 @@ define <4 x double> @test16(double* %base, <4 x i32> %ind, <4 x i1> %mask, <4 x
; KNL_64-NEXT: retq
; KNL_64-NEXT: .LBB15_5: # %cond.load4
; KNL_64-NEXT: vmovq %xmm0, %rcx
-; KNL_64-NEXT: vextracti128 $1, %ymm2, %xmm1
-; KNL_64-NEXT: vmovlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
-; KNL_64-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm2
+; KNL_64-NEXT: vpbroadcastq (%rcx), %ymm1
+; KNL_64-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm1[4,5],ymm2[6,7]
; KNL_64-NEXT: testb $8, %al
; KNL_64-NEXT: je .LBB15_8
; KNL_64-NEXT: .LBB15_7: # %cond.load7
; KNL_64-NEXT: vpextrq $1, %xmm0, %rax
-; KNL_64-NEXT: vextracti128 $1, %ymm2, %xmm0
-; KNL_64-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
-; KNL_64-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm2
+; KNL_64-NEXT: vpbroadcastq (%rax), %ymm0
+; KNL_64-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm0[6,7]
; KNL_64-NEXT: vmovdqa %ymm2, %ymm0
; KNL_64-NEXT: retq
;
@@ -1014,16 +1012,14 @@ define <4 x double> @test16(double* %base, <4 x i32> %ind, <4 x i1> %mask, <4 x
; KNL_32-NEXT: je .LBB15_6
; KNL_32-NEXT: .LBB15_5: # %cond.load4
; KNL_32-NEXT: vpextrd $2, %xmm0, %ecx
-; KNL_32-NEXT: vextracti128 $1, %ymm2, %xmm1
-; KNL_32-NEXT: vmovlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
-; KNL_32-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm2
+; KNL_32-NEXT: vpbroadcastq (%ecx), %ymm1
+; KNL_32-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm1[4,5],ymm2[6,7]
; KNL_32-NEXT: testb $8, %al
; KNL_32-NEXT: je .LBB15_8
; KNL_32-NEXT: .LBB15_7: # %cond.load7
; KNL_32-NEXT: vpextrd $3, %xmm0, %eax
-; KNL_32-NEXT: vextracti128 $1, %ymm2, %xmm0
-; KNL_32-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
-; KNL_32-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm2
+; KNL_32-NEXT: vpbroadcastq (%eax), %ymm0
+; KNL_32-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm0[6,7]
; KNL_32-NEXT: vmovdqa %ymm2, %ymm0
; KNL_32-NEXT: retl
;
@@ -3220,17 +3216,15 @@ define <4 x i64> @test_pr28312(<4 x i64*> %p1, <4 x i1> %k, <4 x i1> %k2,<4 x i6
; KNL_64-NEXT: je .LBB42_6
; KNL_64-NEXT: # %bb.5: # %cond.load4
; KNL_64-NEXT: vmovq %xmm2, %rcx
-; KNL_64-NEXT: vextracti128 $1, %ymm1, %xmm3
-; KNL_64-NEXT: vpinsrq $0, (%rcx), %xmm3, %xmm3
-; KNL_64-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; KNL_64-NEXT: vpbroadcastq (%rcx), %ymm3
+; KNL_64-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5],ymm1[6,7]
; KNL_64-NEXT: .LBB42_6: # %else5
; KNL_64-NEXT: testb $8, %al
; KNL_64-NEXT: je .LBB42_8
; KNL_64-NEXT: # %bb.7: # %cond.load7
; KNL_64-NEXT: vpextrq $1, %xmm2, %rax
-; KNL_64-NEXT: vextracti128 $1, %ymm1, %xmm3
-; KNL_64-NEXT: vpinsrq $1, (%rax), %xmm3, %xmm3
-; KNL_64-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; KNL_64-NEXT: vpbroadcastq (%rax), %ymm3
+; KNL_64-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7]
; KNL_64-NEXT: .LBB42_8: # %else8
; KNL_64-NEXT: kmovw %k0, %eax
; KNL_64-NEXT: testb $1, %al
@@ -3247,9 +3241,8 @@ define <4 x i64> @test_pr28312(<4 x i64*> %p1, <4 x i1> %k, <4 x i1> %k2,<4 x i6
; KNL_64-NEXT: je .LBB42_16
; KNL_64-NEXT: .LBB42_15: # %cond.load29
; KNL_64-NEXT: vpextrq $1, %xmm2, %rax
-; KNL_64-NEXT: vextracti128 $1, %ymm3, %xmm4
-; KNL_64-NEXT: vpinsrq $1, (%rax), %xmm4, %xmm4
-; KNL_64-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
+; KNL_64-NEXT: vpbroadcastq (%rax), %ymm4
+; KNL_64-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7]
; KNL_64-NEXT: .LBB42_16: # %else33
; KNL_64-NEXT: kmovw %k0, %eax
; KNL_64-NEXT: testb $1, %al
@@ -3266,9 +3259,8 @@ define <4 x i64> @test_pr28312(<4 x i64*> %p1, <4 x i1> %k, <4 x i1> %k2,<4 x i6
; KNL_64-NEXT: je .LBB42_24
; KNL_64-NEXT: .LBB42_23: # %cond.load54
; KNL_64-NEXT: vpextrq $1, %xmm2, %rax
-; KNL_64-NEXT: vextracti128 $1, %ymm4, %xmm0
-; KNL_64-NEXT: vpinsrq $1, (%rax), %xmm0, %xmm0
-; KNL_64-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm4
+; KNL_64-NEXT: vpbroadcastq (%rax), %ymm0
+; KNL_64-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm0[6,7]
; KNL_64-NEXT: .LBB42_24: # %else58
; KNL_64-NEXT: vpaddq %ymm3, %ymm1, %ymm0
; KNL_64-NEXT: vpaddq %ymm4, %ymm0, %ymm0
@@ -3286,9 +3278,8 @@ define <4 x i64> @test_pr28312(<4 x i64*> %p1, <4 x i1> %k, <4 x i1> %k2,<4 x i6
; KNL_64-NEXT: je .LBB42_14
; KNL_64-NEXT: .LBB42_13: # %cond.load23
; KNL_64-NEXT: vmovq %xmm2, %rcx
-; KNL_64-NEXT: vextracti128 $1, %ymm3, %xmm4
-; KNL_64-NEXT: vpinsrq $0, (%rcx), %xmm4, %xmm4
-; KNL_64-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
+; KNL_64-NEXT: vpbroadcastq (%rcx), %ymm4
+; KNL_64-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5],ymm3[6,7]
; KNL_64-NEXT: testb $8, %al
; KNL_64-NEXT: jne .LBB42_15
; KNL_64-NEXT: jmp .LBB42_16
@@ -3305,9 +3296,8 @@ define <4 x i64> @test_pr28312(<4 x i64*> %p1, <4 x i1> %k, <4 x i1> %k2,<4 x i6
; KNL_64-NEXT: je .LBB42_22
; KNL_64-NEXT: .LBB42_21: # %cond.load48
; KNL_64-NEXT: vmovq %xmm2, %rcx
-; KNL_64-NEXT: vextracti128 $1, %ymm4, %xmm0
-; KNL_64-NEXT: vpinsrq $0, (%rcx), %xmm0, %xmm0
-; KNL_64-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm4
+; KNL_64-NEXT: vpbroadcastq (%rcx), %ymm0
+; KNL_64-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm0[4,5],ymm4[6,7]
; KNL_64-NEXT: testb $8, %al
; KNL_64-NEXT: jne .LBB42_23
; KNL_64-NEXT: jmp .LBB42_24
@@ -3347,19 +3337,19 @@ define <4 x i64> @test_pr28312(<4 x i64*> %p1, <4 x i1> %k, <4 x i1> %k2,<4 x i6
; KNL_32-NEXT: vpextrd $2, %xmm0, %edx
; KNL_32-NEXT: je .LBB42_6
; KNL_32-NEXT: # %bb.5: # %cond.load4
-; KNL_32-NEXT: vextracti128 $1, %ymm1, %xmm2
-; KNL_32-NEXT: vpinsrd $0, (%edx), %xmm2, %xmm2
-; KNL_32-NEXT: vpinsrd $1, 4(%edx), %xmm2, %xmm2
-; KNL_32-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; KNL_32-NEXT: vpbroadcastd (%edx), %ymm2
+; KNL_32-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4],ymm1[5,6,7]
+; KNL_32-NEXT: vpbroadcastd 4(%edx), %ymm2
+; KNL_32-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7]
; KNL_32-NEXT: .LBB42_6: # %else5
; KNL_32-NEXT: testb $8, %bl
; KNL_32-NEXT: vpextrd $3, %xmm0, %esi
; KNL_32-NEXT: je .LBB42_8
; KNL_32-NEXT: # %bb.7: # %cond.load7
-; KNL_32-NEXT: vextracti128 $1, %ymm1, %xmm0
-; KNL_32-NEXT: vpinsrd $2, (%esi), %xmm0, %xmm0
-; KNL_32-NEXT: vpinsrd $3, 4(%esi), %xmm0, %xmm0
-; KNL_32-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
+; KNL_32-NEXT: vpbroadcastd (%esi), %ymm0
+; KNL_32-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7]
+; KNL_32-NEXT: vpbroadcastd 4(%esi), %ymm1
+; KNL_32-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5,6],ymm1[7]
; KNL_32-NEXT: .LBB42_8: # %else8
; KNL_32-NEXT: kmovw %k0, %ebx
; KNL_32-NEXT: testb $1, %bl
@@ -3375,10 +3365,10 @@ define <4 x i64> @test_pr28312(<4 x i64*> %p1, <4 x i1> %k, <4 x i1> %k2,<4 x i6
; KNL_32-NEXT: testb $8, %bl
; KNL_32-NEXT: je .LBB42_16
; KNL_32-NEXT: .LBB42_15: # %cond.load29
-; KNL_32-NEXT: vextracti128 $1, %ymm0, %xmm2
-; KNL_32-NEXT: vpinsrd $2, (%esi), %xmm2, %xmm2
-; KNL_32-NEXT: vpinsrd $3, 4(%esi), %xmm2, %xmm2
-; KNL_32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; KNL_32-NEXT: vpbroadcastd (%esi), %ymm2
+; KNL_32-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6],ymm0[7]
+; KNL_32-NEXT: vpbroadcastd 4(%esi), %ymm2
+; KNL_32-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7]
; KNL_32-NEXT: .LBB42_16: # %else33
; KNL_32-NEXT: kmovw %k0, %ebx
; KNL_32-NEXT: testb $1, %bl
@@ -3394,10 +3384,10 @@ define <4 x i64> @test_pr28312(<4 x i64*> %p1, <4 x i1> %k, <4 x i1> %k2,<4 x i6
; KNL_32-NEXT: testb $8, %bl
; KNL_32-NEXT: je .LBB42_24
; KNL_32-NEXT: .LBB42_23: # %cond.load54
-; KNL_32-NEXT: vextracti128 $1, %ymm2, %xmm3
-; KNL_32-NEXT: vpinsrd $2, (%esi), %xmm3, %xmm3
-; KNL_32-NEXT: vpinsrd $3, 4(%esi), %xmm3, %xmm3
-; KNL_32-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; KNL_32-NEXT: vpbroadcastd (%esi), %ymm3
+; KNL_32-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6],ymm2[7]
+; KNL_32-NEXT: vpbroadcastd 4(%esi), %ymm3
+; KNL_32-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
; KNL_32-NEXT: .LBB42_24: # %else58
; KNL_32-NEXT: vpaddq %ymm0, %ymm1, %ymm0
; KNL_32-NEXT: vpaddq %ymm2, %ymm0, %ymm0
@@ -3419,10 +3409,10 @@ define <4 x i64> @test_pr28312(<4 x i64*> %p1, <4 x i1> %k, <4 x i1> %k2,<4 x i6
; KNL_32-NEXT: testb $4, %bl
; KNL_32-NEXT: je .LBB42_14
; KNL_32-NEXT: .LBB42_13: # %cond.load23
-; KNL_32-NEXT: vextracti128 $1, %ymm0, %xmm2
-; KNL_32-NEXT: vpinsrd $0, (%edx), %xmm2, %xmm2
-; KNL_32-NEXT: vpinsrd $1, 4(%edx), %xmm2, %xmm2
-; KNL_32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; KNL_32-NEXT: vpbroadcastd (%edx), %ymm2
+; KNL_32-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4],ymm0[5,6,7]
+; KNL_32-NEXT: vpbroadcastd 4(%edx), %ymm2
+; KNL_32-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7]
; KNL_32-NEXT: testb $8, %bl
; KNL_32-NEXT: jne .LBB42_15
; KNL_32-NEXT: jmp .LBB42_16
@@ -3437,10 +3427,10 @@ define <4 x i64> @test_pr28312(<4 x i64*> %p1, <4 x i1> %k, <4 x i1> %k2,<4 x i6
; KNL_32-NEXT: testb $4, %bl
; KNL_32-NEXT: je .LBB42_22
; KNL_32-NEXT: .LBB42_21: # %cond.load48
-; KNL_32-NEXT: vextracti128 $1, %ymm2, %xmm3
-; KNL_32-NEXT: vpinsrd $0, (%edx), %xmm3, %xmm3
-; KNL_32-NEXT: vpinsrd $1, 4(%edx), %xmm3, %xmm3
-; KNL_32-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; KNL_32-NEXT: vpbroadcastd (%edx), %ymm3
+; KNL_32-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4],ymm2[5,6,7]
+; KNL_32-NEXT: vpbroadcastd 4(%edx), %ymm3
+; KNL_32-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7]
; KNL_32-NEXT: testb $8, %bl
; KNL_32-NEXT: jne .LBB42_23
; KNL_32-NEXT: jmp .LBB42_24
diff --git a/llvm/test/CodeGen/X86/masked_load.ll b/llvm/test/CodeGen/X86/masked_load.ll
index 2e55a372ab57..e8e45a156736 100644
--- a/llvm/test/CodeGen/X86/masked_load.ll
+++ b/llvm/test/CodeGen/X86/masked_load.ll
@@ -3463,51 +3463,51 @@ define <16 x i16> @load_v16i16_v16i16(<16 x i16> %trigger, <16 x i16>* %addr, <1
; AVX2-NEXT: testl $256, %eax ## imm = 0x100
; AVX2-NEXT: je LBB22_18
; AVX2-NEXT: LBB22_17: ## %cond.load22
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
-; AVX2-NEXT: vpinsrw $0, 16(%rdi), %xmm0, %xmm0
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
+; AVX2-NEXT: vpbroadcastw 16(%rdi), %ymm0
+; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: testl $512, %eax ## imm = 0x200
; AVX2-NEXT: je LBB22_20
; AVX2-NEXT: LBB22_19: ## %cond.load25
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
-; AVX2-NEXT: vpinsrw $1, 18(%rdi), %xmm0, %xmm0
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
+; AVX2-NEXT: vpbroadcastw 18(%rdi), %ymm0
+; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4,5,6,7,8],ymm0[9],ymm1[10,11,12,13,14,15]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: testl $1024, %eax ## imm = 0x400
; AVX2-NEXT: je LBB22_22
; AVX2-NEXT: LBB22_21: ## %cond.load28
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
-; AVX2-NEXT: vpinsrw $2, 20(%rdi), %xmm0, %xmm0
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
+; AVX2-NEXT: vpbroadcastw 20(%rdi), %ymm0
+; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4,5,6,7,8,9],ymm0[10],ymm1[11,12,13,14,15]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: testl $2048, %eax ## imm = 0x800
; AVX2-NEXT: je LBB22_24
; AVX2-NEXT: LBB22_23: ## %cond.load31
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
-; AVX2-NEXT: vpinsrw $3, 22(%rdi), %xmm0, %xmm0
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
+; AVX2-NEXT: vpbroadcastw 22(%rdi), %ymm0
+; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7,8,9,10],ymm0[11],ymm1[12,13,14,15]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: testl $4096, %eax ## imm = 0x1000
; AVX2-NEXT: je LBB22_26
; AVX2-NEXT: LBB22_25: ## %cond.load34
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
-; AVX2-NEXT: vpinsrw $4, 24(%rdi), %xmm0, %xmm0
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
+; AVX2-NEXT: vpbroadcastw 24(%rdi), %ymm0
+; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4],ymm1[5,6,7,8,9,10,11],ymm0[12],ymm1[13,14,15]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: testl $8192, %eax ## imm = 0x2000
; AVX2-NEXT: je LBB22_28
; AVX2-NEXT: LBB22_27: ## %cond.load37
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
-; AVX2-NEXT: vpinsrw $5, 26(%rdi), %xmm0, %xmm0
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
+; AVX2-NEXT: vpbroadcastw 26(%rdi), %ymm0
+; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7,8,9,10,11,12],ymm0[13],ymm1[14,15]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: testl $16384, %eax ## imm = 0x4000
; AVX2-NEXT: je LBB22_30
; AVX2-NEXT: LBB22_29: ## %cond.load40
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
-; AVX2-NEXT: vpinsrw $6, 28(%rdi), %xmm0, %xmm0
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
+; AVX2-NEXT: vpbroadcastw 28(%rdi), %ymm0
+; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7,8,9,10,11,12,13],ymm0[14],ymm1[15]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: testl $32768, %eax ## imm = 0x8000
; AVX2-NEXT: je LBB22_32
; AVX2-NEXT: LBB22_31: ## %cond.load43
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
-; AVX2-NEXT: vpinsrw $7, 30(%rdi), %xmm0, %xmm0
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
+; AVX2-NEXT: vpbroadcastw 30(%rdi), %ymm0
+; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7],ymm1[8,9,10,11,12,13,14],ymm0[15]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vmovdqa %ymm1, %ymm0
; AVX2-NEXT: retq
;
@@ -3609,51 +3609,51 @@ define <16 x i16> @load_v16i16_v16i16(<16 x i16> %trigger, <16 x i16>* %addr, <1
; AVX512F-NEXT: testl $256, %eax ## imm = 0x100
; AVX512F-NEXT: je LBB22_18
; AVX512F-NEXT: LBB22_17: ## %cond.load22
-; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0
-; AVX512F-NEXT: vpinsrw $0, 16(%rdi), %xmm0, %xmm0
-; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
+; AVX512F-NEXT: vpbroadcastw 16(%rdi), %ymm0
+; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX512F-NEXT: testl $512, %eax ## imm = 0x200
; AVX512F-NEXT: je LBB22_20
; AVX512F-NEXT: LBB22_19: ## %cond.load25
-; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0
-; AVX512F-NEXT: vpinsrw $1, 18(%rdi), %xmm0, %xmm0
-; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
+; AVX512F-NEXT: vpbroadcastw 18(%rdi), %ymm0
+; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4,5,6,7,8],ymm0[9],ymm1[10,11,12,13,14,15]
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX512F-NEXT: testl $1024, %eax ## imm = 0x400
; AVX512F-NEXT: je LBB22_22
; AVX512F-NEXT: LBB22_21: ## %cond.load28
-; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0
-; AVX512F-NEXT: vpinsrw $2, 20(%rdi), %xmm0, %xmm0
-; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
+; AVX512F-NEXT: vpbroadcastw 20(%rdi), %ymm0
+; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4,5,6,7,8,9],ymm0[10],ymm1[11,12,13,14,15]
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX512F-NEXT: testl $2048, %eax ## imm = 0x800
; AVX512F-NEXT: je LBB22_24
; AVX512F-NEXT: LBB22_23: ## %cond.load31
-; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0
-; AVX512F-NEXT: vpinsrw $3, 22(%rdi), %xmm0, %xmm0
-; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
+; AVX512F-NEXT: vpbroadcastw 22(%rdi), %ymm0
+; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7,8,9,10],ymm0[11],ymm1[12,13,14,15]
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX512F-NEXT: testl $4096, %eax ## imm = 0x1000
; AVX512F-NEXT: je LBB22_26
; AVX512F-NEXT: LBB22_25: ## %cond.load34
-; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0
-; AVX512F-NEXT: vpinsrw $4, 24(%rdi), %xmm0, %xmm0
-; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
+; AVX512F-NEXT: vpbroadcastw 24(%rdi), %ymm0
+; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4],ymm1[5,6,7,8,9,10,11],ymm0[12],ymm1[13,14,15]
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX512F-NEXT: testl $8192, %eax ## imm = 0x2000
; AVX512F-NEXT: je LBB22_28
; AVX512F-NEXT: LBB22_27: ## %cond.load37
-; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0
-; AVX512F-NEXT: vpinsrw $5, 26(%rdi), %xmm0, %xmm0
-; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
+; AVX512F-NEXT: vpbroadcastw 26(%rdi), %ymm0
+; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7,8,9,10,11,12],ymm0[13],ymm1[14,15]
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX512F-NEXT: testl $16384, %eax ## imm = 0x4000
; AVX512F-NEXT: je LBB22_30
; AVX512F-NEXT: LBB22_29: ## %cond.load40
-; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0
-; AVX512F-NEXT: vpinsrw $6, 28(%rdi), %xmm0, %xmm0
-; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
+; AVX512F-NEXT: vpbroadcastw 28(%rdi), %ymm0
+; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7,8,9,10,11,12,13],ymm0[14],ymm1[15]
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX512F-NEXT: testl $32768, %eax ## imm = 0x8000
; AVX512F-NEXT: je LBB22_32
; AVX512F-NEXT: LBB22_31: ## %cond.load43
-; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0
-; AVX512F-NEXT: vpinsrw $7, 30(%rdi), %xmm0, %xmm0
-; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
+; AVX512F-NEXT: vpbroadcastw 30(%rdi), %ymm0
+; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7],ymm1[8,9,10,11,12,13,14],ymm0[15]
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX512F-NEXT: vmovdqa %ymm1, %ymm0
; AVX512F-NEXT: retq
;
@@ -3755,51 +3755,51 @@ define <16 x i16> @load_v16i16_v16i16(<16 x i16> %trigger, <16 x i16>* %addr, <1
; AVX512VLDQ-NEXT: testl $256, %eax ## imm = 0x100
; AVX512VLDQ-NEXT: je LBB22_18
; AVX512VLDQ-NEXT: LBB22_17: ## %cond.load22
-; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0
-; AVX512VLDQ-NEXT: vpinsrw $0, 16(%rdi), %xmm0, %xmm0
-; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
+; AVX512VLDQ-NEXT: vpbroadcastw 16(%rdi), %ymm0
+; AVX512VLDQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
+; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX512VLDQ-NEXT: testl $512, %eax ## imm = 0x200
; AVX512VLDQ-NEXT: je LBB22_20
; AVX512VLDQ-NEXT: LBB22_19: ## %cond.load25
-; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0
-; AVX512VLDQ-NEXT: vpinsrw $1, 18(%rdi), %xmm0, %xmm0
-; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
+; AVX512VLDQ-NEXT: vpbroadcastw 18(%rdi), %ymm0
+; AVX512VLDQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4,5,6,7,8],ymm0[9],ymm1[10,11,12,13,14,15]
+; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX512VLDQ-NEXT: testl $1024, %eax ## imm = 0x400
; AVX512VLDQ-NEXT: je LBB22_22
; AVX512VLDQ-NEXT: LBB22_21: ## %cond.load28
-; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0
-; AVX512VLDQ-NEXT: vpinsrw $2, 20(%rdi), %xmm0, %xmm0
-; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
+; AVX512VLDQ-NEXT: vpbroadcastw 20(%rdi), %ymm0
+; AVX512VLDQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4,5,6,7,8,9],ymm0[10],ymm1[11,12,13,14,15]
+; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX512VLDQ-NEXT: testl $2048, %eax ## imm = 0x800
; AVX512VLDQ-NEXT: je LBB22_24
; AVX512VLDQ-NEXT: LBB22_23: ## %cond.load31
-; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0
-; AVX512VLDQ-NEXT: vpinsrw $3, 22(%rdi), %xmm0, %xmm0
-; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
+; AVX512VLDQ-NEXT: vpbroadcastw 22(%rdi), %ymm0
+; AVX512VLDQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7,8,9,10],ymm0[11],ymm1[12,13,14,15]
+; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX512VLDQ-NEXT: testl $4096, %eax ## imm = 0x1000
; AVX512VLDQ-NEXT: je LBB22_26
; AVX512VLDQ-NEXT: LBB22_25: ## %cond.load34
-; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0
-; AVX512VLDQ-NEXT: vpinsrw $4, 24(%rdi), %xmm0, %xmm0
-; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
+; AVX512VLDQ-NEXT: vpbroadcastw 24(%rdi), %ymm0
+; AVX512VLDQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4],ymm1[5,6,7,8,9,10,11],ymm0[12],ymm1[13,14,15]
+; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX512VLDQ-NEXT: testl $8192, %eax ## imm = 0x2000
; AVX512VLDQ-NEXT: je LBB22_28
; AVX512VLDQ-NEXT: LBB22_27: ## %cond.load37
-; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0
-; AVX512VLDQ-NEXT: vpinsrw $5, 26(%rdi), %xmm0, %xmm0
-; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
+; AVX512VLDQ-NEXT: vpbroadcastw 26(%rdi), %ymm0
+; AVX512VLDQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7,8,9,10,11,12],ymm0[13],ymm1[14,15]
+; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX512VLDQ-NEXT: testl $16384, %eax ## imm = 0x4000
; AVX512VLDQ-NEXT: je LBB22_30
; AVX512VLDQ-NEXT: LBB22_29: ## %cond.load40
-; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0
-; AVX512VLDQ-NEXT: vpinsrw $6, 28(%rdi), %xmm0, %xmm0
-; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
+; AVX512VLDQ-NEXT: vpbroadcastw 28(%rdi), %ymm0
+; AVX512VLDQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7,8,9,10,11,12,13],ymm0[14],ymm1[15]
+; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX512VLDQ-NEXT: testl $32768, %eax ## imm = 0x8000
; AVX512VLDQ-NEXT: je LBB22_32
; AVX512VLDQ-NEXT: LBB22_31: ## %cond.load43
-; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0
-; AVX512VLDQ-NEXT: vpinsrw $7, 30(%rdi), %xmm0, %xmm0
-; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
+; AVX512VLDQ-NEXT: vpbroadcastw 30(%rdi), %ymm0
+; AVX512VLDQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7],ymm1[8,9,10,11,12,13,14],ymm0[15]
+; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX512VLDQ-NEXT: vmovdqa %ymm1, %ymm0
; AVX512VLDQ-NEXT: retq
;
@@ -7084,33 +7084,17 @@ define <4 x i64> @load_one_mask_bit_set3(<4 x i64>* %addr, <4 x i64> %val) {
; SSE42-NEXT: pinsrq $0, 16(%rdi), %xmm1
; SSE42-NEXT: retq
;
-; AVX1-LABEL: load_one_mask_bit_set3:
-; AVX1: ## %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpinsrq $0, 16(%rdi), %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: load_one_mask_bit_set3:
-; AVX2: ## %bb.0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpinsrq $0, 16(%rdi), %xmm1, %xmm1
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: load_one_mask_bit_set3:
-; AVX512: ## %bb.0:
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vpinsrq $0, 16(%rdi), %xmm1, %xmm1
-; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512-NEXT: retq
+; AVX-LABEL: load_one_mask_bit_set3:
+; AVX: ## %bb.0:
+; AVX-NEXT: vbroadcastsd 16(%rdi), %ymm1
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
+; AVX-NEXT: retq
;
; X86-AVX512-LABEL: load_one_mask_bit_set3:
; X86-AVX512: ## %bb.0:
; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
-; X86-AVX512-NEXT: vmovlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
-; X86-AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X86-AVX512-NEXT: vbroadcastsd 16(%eax), %ymm1
+; X86-AVX512-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
; X86-AVX512-NEXT: retl
%res = call <4 x i64> @llvm.masked.load.v4i64.p0v4i64(<4 x i64>* %addr, i32 4, <4 x i1> <i1 false, i1 false, i1 true, i1 false>, <4 x i64> %val)
ret <4 x i64> %res
@@ -7126,17 +7110,15 @@ define <4 x double> @load_one_mask_bit_set4(<4 x double>* %addr, <4 x double> %v
;
; AVX-LABEL: load_one_mask_bit_set4:
; AVX: ## %bb.0:
-; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX-NEXT: vmovhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
-; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-NEXT: vbroadcastsd 24(%rdi), %ymm1
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX-NEXT: retq
;
; X86-AVX512-LABEL: load_one_mask_bit_set4:
; X86-AVX512: ## %bb.0:
; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
-; X86-AVX512-NEXT: vmovhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
-; X86-AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X86-AVX512-NEXT: vbroadcastsd 24(%eax), %ymm1
+; X86-AVX512-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; X86-AVX512-NEXT: retl
%res = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* %addr, i32 4, <4 x i1> <i1 false, i1 false, i1 false, i1 true>, <4 x double> %val)
ret <4 x double> %res
@@ -7152,24 +7134,37 @@ define <8 x double> @load_one_mask_bit_set5(<8 x double>* %addr, <8 x double> %v
;
; AVX1OR2-LABEL: load_one_mask_bit_set5:
; AVX1OR2: ## %bb.0:
-; AVX1OR2-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1OR2-NEXT: vmovhps {{.*#+}} xmm2 = xmm2[0,1],mem[0,1]
-; AVX1OR2-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1OR2-NEXT: vbroadcastsd 56(%rdi), %ymm2
+; AVX1OR2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
; AVX1OR2-NEXT: retq
;
-; AVX512-LABEL: load_one_mask_bit_set5:
-; AVX512: ## %bb.0:
-; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm1
-; AVX512-NEXT: vmovhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
-; AVX512-NEXT: vinsertf32x4 $3, %xmm1, %zmm0, %zmm0
-; AVX512-NEXT: retq
+; AVX512F-LABEL: load_one_mask_bit_set5:
+; AVX512F: ## %bb.0:
+; AVX512F-NEXT: movb $-128, %al
+; AVX512F-NEXT: kmovw %eax, %k1
+; AVX512F-NEXT: vbroadcastsd 56(%rdi), %zmm0 {%k1}
+; AVX512F-NEXT: retq
+;
+; AVX512VLDQ-LABEL: load_one_mask_bit_set5:
+; AVX512VLDQ: ## %bb.0:
+; AVX512VLDQ-NEXT: movb $-128, %al
+; AVX512VLDQ-NEXT: kmovw %eax, %k1
+; AVX512VLDQ-NEXT: vbroadcastsd 56(%rdi), %zmm0 {%k1}
+; AVX512VLDQ-NEXT: retq
+;
+; AVX512VLBW-LABEL: load_one_mask_bit_set5:
+; AVX512VLBW: ## %bb.0:
+; AVX512VLBW-NEXT: movb $-128, %al
+; AVX512VLBW-NEXT: kmovd %eax, %k1
+; AVX512VLBW-NEXT: vbroadcastsd 56(%rdi), %zmm0 {%k1}
+; AVX512VLBW-NEXT: retq
;
; X86-AVX512-LABEL: load_one_mask_bit_set5:
; X86-AVX512: ## %bb.0:
; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm1
-; X86-AVX512-NEXT: vmovhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
-; X86-AVX512-NEXT: vinsertf32x4 $3, %xmm1, %zmm0, %zmm0
+; X86-AVX512-NEXT: movb $-128, %cl
+; X86-AVX512-NEXT: kmovd %ecx, %k1
+; X86-AVX512-NEXT: vbroadcastsd 56(%eax), %zmm0 {%k1}
; X86-AVX512-NEXT: retl
%res = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* %addr, i32 4, <8 x i1> <i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true>, <8 x double> %val)
ret <8 x double> %res
@@ -7235,43 +7230,43 @@ define <16 x i64> @load_one_mask_bit_set6(<16 x i64>* %addr, <16 x i64> %val) {
;
; AVX512F-LABEL: load_one_mask_bit_set6:
; AVX512F: ## %bb.0:
+; AVX512F-NEXT: movb $4, %al
+; AVX512F-NEXT: kmovw %eax, %k1
+; AVX512F-NEXT: vpbroadcastq 16(%rdi), %zmm0 {%k1}
; AVX512F-NEXT: movb $36, %al
; AVX512F-NEXT: kmovw %eax, %k1
; AVX512F-NEXT: vmovdqu64 64(%rdi), %zmm1 {%k1}
-; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX512F-NEXT: vpinsrq $0, 16(%rdi), %xmm2, %xmm2
-; AVX512F-NEXT: vinserti32x4 $1, %xmm2, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VLDQ-LABEL: load_one_mask_bit_set6:
; AVX512VLDQ: ## %bb.0:
+; AVX512VLDQ-NEXT: movb $4, %al
+; AVX512VLDQ-NEXT: kmovw %eax, %k1
+; AVX512VLDQ-NEXT: vpbroadcastq 16(%rdi), %zmm0 {%k1}
; AVX512VLDQ-NEXT: movb $36, %al
; AVX512VLDQ-NEXT: kmovw %eax, %k1
; AVX512VLDQ-NEXT: vmovdqu64 64(%rdi), %zmm1 {%k1}
-; AVX512VLDQ-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX512VLDQ-NEXT: vpinsrq $0, 16(%rdi), %xmm2, %xmm2
-; AVX512VLDQ-NEXT: vinserti32x4 $1, %xmm2, %zmm0, %zmm0
; AVX512VLDQ-NEXT: retq
;
; AVX512VLBW-LABEL: load_one_mask_bit_set6:
; AVX512VLBW: ## %bb.0:
+; AVX512VLBW-NEXT: movb $4, %al
+; AVX512VLBW-NEXT: kmovd %eax, %k1
+; AVX512VLBW-NEXT: vpbroadcastq 16(%rdi), %zmm0 {%k1}
; AVX512VLBW-NEXT: movb $36, %al
; AVX512VLBW-NEXT: kmovd %eax, %k1
; AVX512VLBW-NEXT: vmovdqu64 64(%rdi), %zmm1 {%k1}
-; AVX512VLBW-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX512VLBW-NEXT: vpinsrq $0, 16(%rdi), %xmm2, %xmm2
-; AVX512VLBW-NEXT: vinserti32x4 $1, %xmm2, %zmm0, %zmm0
; AVX512VLBW-NEXT: retq
;
; X86-AVX512-LABEL: load_one_mask_bit_set6:
; X86-AVX512: ## %bb.0:
; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512-NEXT: movb $4, %cl
+; X86-AVX512-NEXT: kmovd %ecx, %k1
+; X86-AVX512-NEXT: vbroadcastsd 16(%eax), %zmm0 {%k1}
; X86-AVX512-NEXT: movb $36, %cl
; X86-AVX512-NEXT: kmovd %ecx, %k1
; X86-AVX512-NEXT: vmovdqu64 64(%eax), %zmm1 {%k1}
-; X86-AVX512-NEXT: vextractf128 $1, %ymm0, %xmm2
-; X86-AVX512-NEXT: vmovlps {{.*#+}} xmm2 = mem[0,1],xmm2[2,3]
-; X86-AVX512-NEXT: vinsertf32x4 $1, %xmm2, %zmm0, %zmm0
; X86-AVX512-NEXT: retl
%res = call <16 x i64> @llvm.masked.load.v16i64.p0v16i64(<16 x i64>* %addr, i32 4, <16 x i1> <i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false>, <16 x i64> %val)
ret <16 x i64> %res