[llvm] b16ce8f - [X86] getFauxShuffleMask - match 256-bit CONCAT(SUB0, SUB1) 64-bit elt patterns as well as 512-bit (#127392)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Feb 17 00:20:29 PST 2025
Author: Simon Pilgrim
Date: 2025-02-17T08:20:25Z
New Revision: b16ce8fc24f32aa0614562de0a2d0916118398fb
URL: https://github.com/llvm/llvm-project/commit/b16ce8fc24f32aa0614562de0a2d0916118398fb
DIFF: https://github.com/llvm/llvm-project/commit/b16ce8fc24f32aa0614562de0a2d0916118398fb.diff
LOG: [X86] getFauxShuffleMask - match 256-bit CONCAT(SUB0, SUB1) 64-bit elt patterns as well as 512-bit (#127392)
The 512-bit filter was to prevent AVX1/2 regressions, but most of that is now handled by canonicalizeShuffleWithOp.
Ideally we need to support smaller element widths as well.
Noticed while triaging #116931.
Added:
Modified:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll
llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-4.ll
llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll
llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-8.ll
llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-3.ll
llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-4.ll
llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll
llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll
llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll
llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll
llvm/test/CodeGen/X86/widen_fadd.ll
llvm/test/CodeGen/X86/widen_fdiv.ll
llvm/test/CodeGen/X86/widen_fmul.ll
llvm/test/CodeGen/X86/widen_fsub.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 9592137b34842..21b08a4a93fc7 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -6130,11 +6130,9 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
return true;
}
// Handle CONCAT(SUB0, SUB1).
- // Limit this to vXi64 512-bit vector cases to make the most of AVX512
- // cross lane shuffles.
+ // Limit this to vXi64 vector cases to make the most of cross lane shuffles.
if (Depth > 0 && InsertIdx == NumSubElts && NumElts == (2 * NumSubElts) &&
- NumBitsPerElt == 64 && NumSizeInBits == 512 &&
- Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
+ NumBitsPerElt == 64 && Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
Src.getOperand(0).isUndef() &&
Src.getOperand(1).getValueType() == SubVT &&
Src.getConstantOperandVal(2) == 0) {
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll
index 0beb304a5673d..4a2e7d55d3e88 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll
@@ -211,10 +211,10 @@ define void @store_i16_stride3_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX2-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,8,9],zero,zero,ymm0[2,3,10,11],zero,zero,ymm0[4,5,12,13,20,21],zero,zero,zero,zero,ymm0[22,23],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm2
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0,1],zero,zero,zero,zero,ymm0[2,3],zero,zero,zero,zero,zero,zero,ymm0[22,23,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[0,1,8,9],zero,zero,ymm2[2,3,10,11],zero,zero,ymm2[4,5,12,13,20,21],zero,zero,zero,zero,ymm2[22,23],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vmovq %xmm1, 16(%rcx)
@@ -228,10 +228,10 @@ define void @store_i16_stride3_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-FP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX2-FP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX2-FP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,8,9],zero,zero,ymm0[2,3,10,11],zero,zero,ymm0[4,5,12,13,20,21],zero,zero,zero,zero,ymm0[22,23],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm2
+; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0,1],zero,zero,zero,zero,ymm0[2,3],zero,zero,zero,zero,zero,zero,ymm0[22,23,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[0,1,8,9],zero,zero,ymm2[2,3,10,11],zero,zero,ymm2[4,5,12,13,20,21],zero,zero,zero,zero,ymm2[22,23],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-FP-NEXT: vpor %ymm0, %ymm1, %ymm0
; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-FP-NEXT: vmovq %xmm1, 16(%rcx)
@@ -245,10 +245,10 @@ define void @store_i16_stride3_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX2-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,8,9],zero,zero,ymm0[2,3,10,11],zero,zero,ymm0[4,5,12,13,20,21],zero,zero,zero,zero,ymm0[22,23],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm2
+; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0,1],zero,zero,zero,zero,ymm0[2,3],zero,zero,zero,zero,zero,zero,ymm0[22,23,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[0,1,8,9],zero,zero,ymm2[2,3,10,11],zero,zero,ymm2[4,5,12,13,20,21],zero,zero,zero,zero,ymm2[22,23],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0
; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-FCP-NEXT: vmovq %xmm1, 16(%rcx)
@@ -262,10 +262,10 @@ define void @store_i16_stride3_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,8,9],zero,zero,ymm0[2,3,10,11],zero,zero,ymm0[4,5,12,13,20,21],zero,zero,zero,zero,ymm0[22,23,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm2
+; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0,1],zero,zero,zero,zero,ymm0[2,3],zero,zero,zero,zero,zero,zero,ymm0[22,23,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[0,1,8,9],zero,zero,ymm2[2,3,10,11],zero,zero,ymm2[4,5,12,13,20,21],zero,zero,zero,zero,ymm2[22,23,u,u,u,u,u,u,u,u]
; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vmovq %xmm1, 16(%rcx)
@@ -279,10 +279,10 @@ define void @store_i16_stride3_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,8,9],zero,zero,ymm0[2,3,10,11],zero,zero,ymm0[4,5,12,13,20,21],zero,zero,zero,zero,ymm0[22,23,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm2
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0,1],zero,zero,zero,zero,ymm0[2,3],zero,zero,zero,zero,zero,zero,ymm0[22,23,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[0,1,8,9],zero,zero,ymm2[2,3,10,11],zero,zero,ymm2[4,5,12,13,20,21],zero,zero,zero,zero,ymm2[22,23,u,u,u,u,u,u,u,u]
; AVX512-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0
; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-FCP-NEXT: vmovq %xmm1, 16(%rcx)
@@ -296,10 +296,10 @@ define void @store_i16_stride3_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512DQ-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,8,9],zero,zero,ymm0[2,3,10,11],zero,zero,ymm0[4,5,12,13,20,21],zero,zero,zero,zero,ymm0[22,23,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm2
+; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0,1],zero,zero,zero,zero,ymm0[2,3],zero,zero,zero,zero,zero,zero,ymm0[22,23,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[0,1,8,9],zero,zero,ymm2[2,3,10,11],zero,zero,ymm2[4,5,12,13,20,21],zero,zero,zero,zero,ymm2[22,23,u,u,u,u,u,u,u,u]
; AVX512DQ-NEXT: vpor %ymm0, %ymm1, %ymm0
; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512DQ-NEXT: vmovq %xmm1, 16(%rcx)
@@ -313,10 +313,10 @@ define void @store_i16_stride3_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,8,9],zero,zero,ymm0[2,3,10,11],zero,zero,ymm0[4,5,12,13,20,21],zero,zero,zero,zero,ymm0[22,23,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm2
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0,1],zero,zero,zero,zero,ymm0[2,3],zero,zero,zero,zero,zero,zero,ymm0[22,23,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[0,1,8,9],zero,zero,ymm2[2,3,10,11],zero,zero,ymm2[4,5,12,13,20,21],zero,zero,zero,zero,ymm2[22,23,u,u,u,u,u,u,u,u]
; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0
; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512DQ-FCP-NEXT: vmovq %xmm1, 16(%rcx)
@@ -330,12 +330,11 @@ define void @store_i16_stride3_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512BW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512BW-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,4,8,1,5,9,2,6,10,3,7,11,0,0,0,0]
-; AVX512BW-NEXT: vpermw %ymm0, %ymm1, %ymm0
-; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512BW-NEXT: vmovq %xmm1, 16(%rcx)
-; AVX512BW-NEXT: vmovdqa %xmm0, (%rcx)
+; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,4,16,1,5,17,2,6,18,3,7,19,0,0,0,0]
+; AVX512BW-NEXT: vpermi2w %ymm0, %ymm1, %ymm2
+; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm0
+; AVX512BW-NEXT: vmovq %xmm0, 16(%rcx)
+; AVX512BW-NEXT: vmovdqa %xmm2, (%rcx)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
@@ -345,12 +344,11 @@ define void @store_i16_stride3_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,4,8,1,5,9,2,6,10,3,7,11,0,0,0,0]
-; AVX512BW-FCP-NEXT: vpermw %ymm0, %ymm1, %ymm0
-; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512BW-FCP-NEXT: vmovq %xmm1, 16(%rcx)
-; AVX512BW-FCP-NEXT: vmovdqa %xmm0, (%rcx)
+; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,4,16,1,5,17,2,6,18,3,7,19,0,0,0,0]
+; AVX512BW-FCP-NEXT: vpermi2w %ymm0, %ymm1, %ymm2
+; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm0
+; AVX512BW-FCP-NEXT: vmovq %xmm0, 16(%rcx)
+; AVX512BW-FCP-NEXT: vmovdqa %xmm2, (%rcx)
; AVX512BW-FCP-NEXT: vzeroupper
; AVX512BW-FCP-NEXT: retq
;
@@ -360,12 +358,11 @@ define void @store_i16_stride3_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,4,8,1,5,9,2,6,10,3,7,11,0,0,0,0]
-; AVX512DQ-BW-NEXT: vpermw %ymm0, %ymm1, %ymm0
-; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512DQ-BW-NEXT: vmovq %xmm1, 16(%rcx)
-; AVX512DQ-BW-NEXT: vmovdqa %xmm0, (%rcx)
+; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,4,16,1,5,17,2,6,18,3,7,19,0,0,0,0]
+; AVX512DQ-BW-NEXT: vpermi2w %ymm0, %ymm1, %ymm2
+; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm2, %xmm0
+; AVX512DQ-BW-NEXT: vmovq %xmm0, 16(%rcx)
+; AVX512DQ-BW-NEXT: vmovdqa %xmm2, (%rcx)
; AVX512DQ-BW-NEXT: vzeroupper
; AVX512DQ-BW-NEXT: retq
;
@@ -375,12 +372,11 @@ define void @store_i16_stride3_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,4,8,1,5,9,2,6,10,3,7,11,0,0,0,0]
-; AVX512DQ-BW-FCP-NEXT: vpermw %ymm0, %ymm1, %ymm0
-; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm1, 16(%rcx)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm0, (%rcx)
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,4,16,1,5,17,2,6,18,3,7,19,0,0,0,0]
+; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm0, %ymm1, %ymm2
+; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm0
+; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, 16(%rcx)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm2, (%rcx)
; AVX512DQ-BW-FCP-NEXT: vzeroupper
; AVX512DQ-BW-FCP-NEXT: retq
%in.vec0 = load <4 x i16>, ptr %in.vecptr0, align 64
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-4.ll
index 704c92924abfb..71eb606a8665d 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-4.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-4.ll
@@ -217,12 +217,12 @@ define void @store_i16_stride4_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX2-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm2
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,8,9,0,1,8,9,2,3,10,11,2,3,10,11,20,21,28,29,20,21,28,29,22,23,30,31,22,23,30,31]
-; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6],ymm2[7]
+; AVX2-NEXT: vpshufb %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6],ymm1[7]
; AVX2-NEXT: vmovdqa %ymm0, (%r8)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
@@ -235,12 +235,12 @@ define void @store_i16_stride4_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-FP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX2-FP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX2-FP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm2
+; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,8,9,0,1,8,9,2,3,10,11,2,3,10,11,20,21,28,29,20,21,28,29,22,23,30,31,22,23,30,31]
-; AVX2-FP-NEXT: vpshufb %ymm1, %ymm0, %ymm2
-; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX2-FP-NEXT: vpshufb %ymm1, %ymm0, %ymm0
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6],ymm2[7]
+; AVX2-FP-NEXT: vpshufb %ymm1, %ymm2, %ymm1
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6],ymm1[7]
; AVX2-FP-NEXT: vmovdqa %ymm0, (%r8)
; AVX2-FP-NEXT: vzeroupper
; AVX2-FP-NEXT: retq
@@ -269,12 +269,12 @@ define void @store_i16_stride4_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm2
+; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,8,9,0,1,8,9,2,3,10,11,2,3,10,11,20,21,28,29,20,21,28,29,22,23,30,31,22,23,30,31]
-; AVX512-NEXT: vpshufb %ymm1, %ymm0, %ymm2
-; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX512-NEXT: vpshufb %ymm1, %ymm0, %ymm0
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6],ymm2[7]
+; AVX512-NEXT: vpshufb %ymm1, %ymm2, %ymm1
+; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6],ymm1[7]
; AVX512-NEXT: vmovdqa %ymm0, (%r8)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
@@ -287,10 +287,9 @@ define void @store_i16_stride4_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,4,6,1,3,5,7]
-; AVX512-FCP-NEXT: vpermd %ymm0, %ymm1, %ymm0
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,2,3,6,7,10,11,14,15,16,17,20,21,24,25,28,29,18,19,22,23,26,27,30,31]
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,2,8,10,1,3,9,11]
+; AVX512-FCP-NEXT: vpermi2d %ymm1, %ymm0, %ymm2
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[0,1,4,5,8,9,12,13,2,3,6,7,10,11,14,15,16,17,20,21,24,25,28,29,18,19,22,23,26,27,30,31]
; AVX512-FCP-NEXT: vmovdqa %ymm0, (%r8)
; AVX512-FCP-NEXT: vzeroupper
; AVX512-FCP-NEXT: retq
@@ -303,12 +302,12 @@ define void @store_i16_stride4_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512DQ-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm2
+; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,8,9,0,1,8,9,2,3,10,11,2,3,10,11,20,21,28,29,20,21,28,29,22,23,30,31,22,23,30,31]
-; AVX512DQ-NEXT: vpshufb %ymm1, %ymm0, %ymm2
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX512DQ-NEXT: vpshufb %ymm1, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6],ymm2[7]
+; AVX512DQ-NEXT: vpshufb %ymm1, %ymm2, %ymm1
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6],ymm1[7]
; AVX512DQ-NEXT: vmovdqa %ymm0, (%r8)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
@@ -321,10 +320,9 @@ define void @store_i16_stride4_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,4,6,1,3,5,7]
-; AVX512DQ-FCP-NEXT: vpermd %ymm0, %ymm1, %ymm0
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,2,3,6,7,10,11,14,15,16,17,20,21,24,25,28,29,18,19,22,23,26,27,30,31]
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,2,8,10,1,3,9,11]
+; AVX512DQ-FCP-NEXT: vpermi2d %ymm1, %ymm0, %ymm2
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[0,1,4,5,8,9,12,13,2,3,6,7,10,11,14,15,16,17,20,21,24,25,28,29,18,19,22,23,26,27,30,31]
; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, (%r8)
; AVX512DQ-FCP-NEXT: vzeroupper
; AVX512DQ-FCP-NEXT: retq
@@ -337,10 +335,9 @@ define void @store_i16_stride4_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512BW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512BW-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15]
-; AVX512BW-NEXT: vpermw %ymm0, %ymm1, %ymm0
-; AVX512BW-NEXT: vmovdqa %ymm0, (%r8)
+; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,4,16,20,1,5,17,21,2,6,18,22,3,7,19,23]
+; AVX512BW-NEXT: vpermi2w %ymm1, %ymm0, %ymm2
+; AVX512BW-NEXT: vmovdqa %ymm2, (%r8)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
@@ -352,10 +349,9 @@ define void @store_i16_stride4_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15]
-; AVX512BW-FCP-NEXT: vpermw %ymm0, %ymm1, %ymm0
-; AVX512BW-FCP-NEXT: vmovdqa %ymm0, (%r8)
+; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,4,16,20,1,5,17,21,2,6,18,22,3,7,19,23]
+; AVX512BW-FCP-NEXT: vpermi2w %ymm1, %ymm0, %ymm2
+; AVX512BW-FCP-NEXT: vmovdqa %ymm2, (%r8)
; AVX512BW-FCP-NEXT: vzeroupper
; AVX512BW-FCP-NEXT: retq
;
@@ -367,10 +363,9 @@ define void @store_i16_stride4_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15]
-; AVX512DQ-BW-NEXT: vpermw %ymm0, %ymm1, %ymm0
-; AVX512DQ-BW-NEXT: vmovdqa %ymm0, (%r8)
+; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,4,16,20,1,5,17,21,2,6,18,22,3,7,19,23]
+; AVX512DQ-BW-NEXT: vpermi2w %ymm1, %ymm0, %ymm2
+; AVX512DQ-BW-NEXT: vmovdqa %ymm2, (%r8)
; AVX512DQ-BW-NEXT: vzeroupper
; AVX512DQ-BW-NEXT: retq
;
@@ -382,10 +377,9 @@ define void @store_i16_stride4_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15]
-; AVX512DQ-BW-FCP-NEXT: vpermw %ymm0, %ymm1, %ymm0
-; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm0, (%r8)
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,4,16,20,1,5,17,21,2,6,18,22,3,7,19,23]
+; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm1, %ymm0, %ymm2
+; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm2, (%r8)
; AVX512DQ-BW-FCP-NEXT: vzeroupper
; AVX512DQ-BW-FCP-NEXT: retq
%in.vec0 = load <4 x i16>, ptr %in.vecptr0, align 64
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll
index f135b2f1577ec..351d98540c2a5 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll
@@ -584,22 +584,22 @@ define void @store_i16_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero
; AVX2-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
-; AVX2-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
-; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
+; AVX2-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm4
+; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm5 = mem[0],zero
; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
+; AVX2-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[2],ymm5[2]
; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [5,7,1,3,7,0,0,0]
-; AVX2-FCP-NEXT: vpermd %ymm2, %ymm1, %ymm1
+; AVX2-FCP-NEXT: vpermd %ymm4, %ymm1, %ymm1
; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5],zero,zero,zero,zero,zero,zero,ymm1[10,11,14,15,2,3,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [1,3,5,7,1,3,5,7]
-; AVX2-FCP-NEXT: # ymm3 = mem[0,1,0,1]
-; AVX2-FCP-NEXT: vpermd %ymm0, %ymm3, %ymm3
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,ymm3[0,1,4,5,8,9],zero,zero,zero,zero,zero,zero,zero,zero,ymm3[18,19,22,23,26,27],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX2-FCP-NEXT: vpor %ymm3, %ymm1, %ymm1
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm2[0,1,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[2,3],zero,zero,ymm2[18,19,26,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1]
+; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,3,5,7,1,3,5,7]
+; AVX2-FCP-NEXT: # ymm5 = mem[0,1,0,1]
+; AVX2-FCP-NEXT: vpermd %ymm0, %ymm5, %ymm5
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,ymm5[0,1,4,5,8,9],zero,zero,zero,zero,zero,zero,zero,zero,ymm5[18,19,22,23,26,27],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-FCP-NEXT: vpor %ymm5, %ymm1, %ymm1
+; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2
; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,ymm2[0,1,8,9,4,5,6,7,4,5],zero,zero,ymm2[26,27],zero,zero,zero,zero,ymm2[24,25,20,21,22,23,20,21,28,29]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm4[0,1,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[2,3],zero,zero,ymm4[18,19,26,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-FCP-NEXT: vpor %ymm2, %ymm3, %ymm2
; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,2,4,6,0,2,4,6]
; AVX2-FCP-NEXT: # ymm3 = mem[0,1,0,1]
@@ -667,28 +667,28 @@ define void @store_i16_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero
; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
-; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm4
+; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm5 = mem[0],zero
; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,2,4,0]
-; AVX512-FCP-NEXT: vpermi2q %ymm3, %ymm0, %ymm1
+; AVX512-FCP-NEXT: vpermi2q %ymm5, %ymm0, %ymm1
; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,2,4,6,0,2,4,6]
; AVX512-FCP-NEXT: # ymm0 = mem[0,1,0,1]
; AVX512-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm0
; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1,4,5,8,9],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[18,19,22,23,26,27],zero,zero,zero,zero
-; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [1,3,5,7,1,3,5,7]
-; AVX512-FCP-NEXT: # ymm3 = mem[0,1,0,1]
-; AVX512-FCP-NEXT: vpermd %ymm1, %ymm3, %ymm1
+; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,3,5,7,1,3,5,7]
+; AVX512-FCP-NEXT: # ymm5 = mem[0,1,0,1]
+; AVX512-FCP-NEXT: vpermd %ymm1, %ymm5, %ymm1
; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,ymm1[0,1,4,5,8,9],zero,zero,zero,zero,zero,zero,zero,zero,ymm1[18,19,22,23,26,27],zero,zero,zero,zero,zero,zero,zero,zero
; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[0,1,8,9],zero,zero,zero,zero,ymm2[u,u,u,u,u,u,2,3],zero,zero,ymm2[18,19,26,27,u,u,u,u,u,u],zero,zero,zero,zero
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,ymm3[0,1,8,9,u,u,u,u,u,u],zero,zero,ymm3[26,27],zero,zero,zero,zero,ymm3[u,u,u,u,u,u,20,21,28,29]
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = mem & (ymm3 | ymm1)
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[0,1,8,9],zero,zero,zero,zero,ymm4[u,u,u,u,u,u,2,3],zero,zero,ymm4[18,19,26,27,u,u,u,u,u,u],zero,zero,zero,zero
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,ymm2[0,1,8,9,u,u,u,u,u,u],zero,zero,ymm2[26,27],zero,zero,zero,zero,ymm2[u,u,u,u,u,u,20,21,28,29]
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = mem & (ymm2 | ymm1)
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [5,7,1,3,7,0,0,0]
-; AVX512-FCP-NEXT: vpermd %ymm2, %ymm1, %ymm1
+; AVX512-FCP-NEXT: vpermd %ymm4, %ymm1, %ymm1
; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5],zero,zero,zero,zero,zero,zero,ymm1[10,11,14,15,2,3,18,19],zero,zero,zero,zero,zero,zero,ymm1[u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1
; AVX512-FCP-NEXT: vporq %zmm0, %zmm1, %zmm0
; AVX512-FCP-NEXT: vextracti32x4 $2, %zmm0, 32(%rax)
; AVX512-FCP-NEXT: vextracti32x4 $3, %zmm0, %xmm1
@@ -750,28 +750,28 @@ define void @store_i16_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero
; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
-; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm4
+; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm5 = mem[0],zero
; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,2,4,0]
-; AVX512DQ-FCP-NEXT: vpermi2q %ymm3, %ymm0, %ymm1
+; AVX512DQ-FCP-NEXT: vpermi2q %ymm5, %ymm0, %ymm1
; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,2,4,6,0,2,4,6]
; AVX512DQ-FCP-NEXT: # ymm0 = mem[0,1,0,1]
; AVX512DQ-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm0
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1,4,5,8,9],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[18,19,22,23,26,27],zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [1,3,5,7,1,3,5,7]
-; AVX512DQ-FCP-NEXT: # ymm3 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT: vpermd %ymm1, %ymm3, %ymm1
+; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,3,5,7,1,3,5,7]
+; AVX512DQ-FCP-NEXT: # ymm5 = mem[0,1,0,1]
+; AVX512DQ-FCP-NEXT: vpermd %ymm1, %ymm5, %ymm1
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,ymm1[0,1,4,5,8,9],zero,zero,zero,zero,zero,zero,zero,zero,ymm1[18,19,22,23,26,27],zero,zero,zero,zero,zero,zero,zero,zero
; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[0,1,8,9],zero,zero,zero,zero,ymm2[u,u,u,u,u,u,2,3],zero,zero,ymm2[18,19,26,27,u,u,u,u,u,u],zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,ymm3[0,1,8,9,u,u,u,u,u,u],zero,zero,ymm3[26,27],zero,zero,zero,zero,ymm3[u,u,u,u,u,u,20,21,28,29]
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = mem & (ymm3 | ymm1)
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[0,1,8,9],zero,zero,zero,zero,ymm4[u,u,u,u,u,u,2,3],zero,zero,ymm4[18,19,26,27,u,u,u,u,u,u],zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,ymm2[0,1,8,9,u,u,u,u,u,u],zero,zero,ymm2[26,27],zero,zero,zero,zero,ymm2[u,u,u,u,u,u,20,21,28,29]
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = mem & (ymm2 | ymm1)
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [5,7,1,3,7,0,0,0]
-; AVX512DQ-FCP-NEXT: vpermd %ymm2, %ymm1, %ymm1
+; AVX512DQ-FCP-NEXT: vpermd %ymm4, %ymm1, %ymm1
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5],zero,zero,zero,zero,zero,zero,ymm1[10,11,14,15,2,3,18,19],zero,zero,zero,zero,zero,zero,ymm1[u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1
; AVX512DQ-FCP-NEXT: vporq %zmm0, %zmm1, %zmm0
; AVX512DQ-FCP-NEXT: vextracti32x4 $2, %zmm0, 32(%rax)
; AVX512DQ-FCP-NEXT: vextracti32x4 $3, %zmm0, %xmm1
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-8.ll
index ccd2d58702de0..4be6ccd2e3575 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-8.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-8.ll
@@ -456,34 +456,34 @@ define void @store_i16_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
; AVX2-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm1
-; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[0,1,2,0,4,5,6,4]
-; AVX2-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,6,4,7,5,8,9,10,11,14,12,15,13]
-; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm1[2,3,0,1]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm4 = ymm3[0,1,2,0,4,5,6,4]
-; AVX2-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,7,5,6,4,8,9,10,11,15,13,14,12]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3],ymm2[4,5],ymm4[6],ymm2[7]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm4 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[0,2,1,3,4,5,6,7,8,10,9,11,12,13,14,15]
-; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm0[2,3,0,1]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm6 = ymm5[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpshuflw {{.*#+}} ymm6 = ymm6[1,3,0,2,4,5,6,7,9,11,8,10,12,13,14,15]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm6[1],ymm4[2,3],ymm6[4],ymm4[5,6,7]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5],ymm2[6,7]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,1,3,4,5,5,7]
-; AVX2-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,6,5,7,8,9,10,11,12,14,13,15]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,1,1,3,4,5,5,7]
-; AVX2-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,5,7,4,6,8,9,10,11,13,15,12,14]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3],ymm1[4,5],ymm3[6],ymm1[7]
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm4
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm5
+; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2
+; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm2[0,1,2,0,4,5,6,4]
+; AVX2-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,7,5,6,4,8,9,10,11,15,13,14,12]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm6 = ymm5[0,1,2,0,4,5,6,4]
+; AVX2-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,6,4,7,5,8,9,10,11,14,12,15,13]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2],ymm3[3],ymm6[4,5],ymm3[6],ymm6[7]
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[1,3,0,2,4,5,6,7,9,11,8,10,12,13,14,15]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm6 = ymm4[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpshuflw {{.*#+}} ymm6 = ymm6[0,2,1,3,4,5,6,7,8,10,9,11,12,13,14,15]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0],ymm1[1],ymm6[2,3],ymm1[4],ymm6[5,6,7]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5],ymm3[6,7]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,1,3,4,5,5,7]
+; AVX2-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,5,7,4,6,8,9,10,11,13,15,12,14]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm5[0,1,1,3,4,5,5,7]
+; AVX2-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,6,5,7,8,9,10,11,12,14,13,15]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7]
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,1,2,3,7,5,6,7]
-; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[2,0,3,1,4,5,6,7,10,8,11,9,12,13,14,15]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm5[3,1,2,3,7,5,6,7]
-; AVX2-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[3,1,2,0,4,5,6,7,11,9,10,8,12,13,14,15]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3],ymm3[4],ymm0[5,6,7]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
+; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[3,1,2,0,4,5,6,7,11,9,10,8,12,13,14,15]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm4[3,1,2,3,7,5,6,7]
+; AVX2-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[2,0,3,1,4,5,6,7,10,8,11,9,12,13,14,15]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2,3],ymm0[4],ymm3[5,6,7]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7]
; AVX2-NEXT: vmovdqa %ymm0, 32(%rax)
-; AVX2-NEXT: vmovdqa %ymm2, (%rax)
+; AVX2-NEXT: vmovdqa %ymm1, (%rax)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
@@ -504,30 +504,30 @@ define void @store_i16_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-FP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
; AVX2-FP-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero
; AVX2-FP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
-; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm1
-; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,1,8,9,0,1,8,9,16,17,18,19,20,21,22,23,18,19,26,27,18,19,26,27]
-; AVX2-FP-NEXT: vpshufb %ymm2, %ymm1, %ymm3
-; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm1[2,3,0,1]
-; AVX2-FP-NEXT: vpshufb %ymm2, %ymm4, %ymm2
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7]
-; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,8,9,0,1,8,9,8,9,10,11,12,13,14,15,18,19,26,27,18,19,26,27,24,25,26,27,28,29,30,31]
-; AVX2-FP-NEXT: vpshufb %ymm3, %ymm0, %ymm5
-; AVX2-FP-NEXT: vpermq {{.*#+}} ymm6 = ymm0[2,3,0,1]
-; AVX2-FP-NEXT: vpshufb %ymm3, %ymm6, %ymm3
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3],ymm3[4],ymm5[5,6,7]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7]
+; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm4
+; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm5
+; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2
+; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,2,3,4,5,6,7,0,1,8,9,0,1,8,9,16,17,18,19,20,21,22,23,18,19,26,27,18,19,26,27]
+; AVX2-FP-NEXT: vpshufb %ymm3, %ymm2, %ymm6
+; AVX2-FP-NEXT: vpshufb %ymm3, %ymm5, %ymm3
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm6[3],ymm3[4,5],ymm6[6],ymm3[7]
+; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,8,9,0,1,8,9,8,9,10,11,12,13,14,15,18,19,26,27,18,19,26,27,24,25,26,27,28,29,30,31]
+; AVX2-FP-NEXT: vpshufb %ymm1, %ymm0, %ymm6
+; AVX2-FP-NEXT: vpshufb %ymm1, %ymm4, %ymm1
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm6[1],ymm1[2,3],ymm6[4],ymm1[5,6,7]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5],ymm3[6,7]
; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,2,3,4,5,6,7,4,5,12,13,4,5,12,13,16,17,18,19,20,21,22,23,22,23,30,31,22,23,30,31]
-; AVX2-FP-NEXT: vpshufb %ymm3, %ymm1, %ymm1
-; AVX2-FP-NEXT: vpshufb %ymm3, %ymm4, %ymm3
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3],ymm1[4,5],ymm3[6],ymm1[7]
+; AVX2-FP-NEXT: vpshufb %ymm3, %ymm2, %ymm2
+; AVX2-FP-NEXT: vpshufb %ymm3, %ymm5, %ymm3
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7]
; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm3 = [4,5,12,13,4,5,12,13,8,9,10,11,12,13,14,15,22,23,30,31,22,23,30,31,24,25,26,27,28,29,30,31]
; AVX2-FP-NEXT: vpshufb %ymm3, %ymm0, %ymm0
-; AVX2-FP-NEXT: vpshufb %ymm3, %ymm6, %ymm3
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3],ymm3[4],ymm0[5,6,7]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
+; AVX2-FP-NEXT: vpshufb %ymm3, %ymm4, %ymm3
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2,3],ymm0[4],ymm3[5,6,7]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7]
; AVX2-FP-NEXT: vmovdqa %ymm0, 32(%rax)
-; AVX2-FP-NEXT: vmovdqa %ymm2, (%rax)
+; AVX2-FP-NEXT: vmovdqa %ymm1, (%rax)
; AVX2-FP-NEXT: vzeroupper
; AVX2-FP-NEXT: retq
;
@@ -588,33 +588,33 @@ define void @store_i16_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
; AVX512-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero
; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
-; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm1
-; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[0,1,1,3,4,5,5,7]
-; AVX512-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,6,5,7,8,9,10,11,12,14,13,15]
-; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm1[2,3,0,1]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm3[0,1,1,3,4,5,5,7]
-; AVX512-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,5,7,4,6,8,9,10,11,13,15,12,14]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3],ymm2[4,5],ymm4[6],ymm2[7]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm0[3,1,2,3,7,5,6,7]
-; AVX512-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[2,0,3,1,4,5,6,7,10,8,11,9,12,13,14,15]
-; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm0[2,3,0,1]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm6 = ymm5[3,1,2,3,7,5,6,7]
-; AVX512-NEXT: vpshuflw {{.*#+}} ymm6 = ymm6[3,1,2,0,4,5,6,7,11,9,10,8,12,13,14,15]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm6[1],ymm4[2,3],ymm6[4],ymm4[5,6,7]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5],ymm2[6,7]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
-; AVX512-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,6,4,7,5,8,9,10,11,14,12,15,13]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,1,2,0,4,5,6,4]
-; AVX512-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,7,5,6,4,8,9,10,11,15,13,14,12]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3],ymm1[4,5],ymm3[6],ymm1[7]
+; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm4
+; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm5
+; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2
+; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm2[0,1,1,3,4,5,5,7]
+; AVX512-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,5,7,4,6,8,9,10,11,13,15,12,14]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm6 = ymm5[0,1,1,3,4,5,5,7]
+; AVX512-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,6,5,7,8,9,10,11,12,14,13,15]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2],ymm3[3],ymm6[4,5],ymm3[6],ymm6[7]
+; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[3,1,2,3,7,5,6,7]
+; AVX512-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[3,1,2,0,4,5,6,7,11,9,10,8,12,13,14,15]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm6 = ymm4[3,1,2,3,7,5,6,7]
+; AVX512-NEXT: vpshuflw {{.*#+}} ymm6 = ymm6[2,0,3,1,4,5,6,7,10,8,11,9,12,13,14,15]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0],ymm1[1],ymm6[2,3],ymm1[4],ymm6[5,6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5],ymm3[6,7]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4]
+; AVX512-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,7,5,6,4,8,9,10,11,15,13,14,12]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm5[0,1,2,0,4,5,6,4]
+; AVX512-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,6,4,7,5,8,9,10,11,14,12,15,13]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7]
; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX512-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,2,1,3,4,5,6,7,8,10,9,11,12,13,14,15]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm5[0,2,2,3,4,6,6,7]
-; AVX512-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[1,3,0,2,4,5,6,7,9,11,8,10,12,13,14,15]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3],ymm3[4],ymm0[5,6,7]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
-; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,3,0,2,4,5,6,7,9,11,8,10,12,13,14,15]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm4[0,2,2,3,4,6,6,7]
+; AVX512-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,2,1,3,4,5,6,7,8,10,9,11,12,13,14,15]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2,3],ymm0[4],ymm3[5,6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7]
+; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512-NEXT: vmovdqa64 %zmm0, (%rax)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
@@ -676,33 +676,33 @@ define void @store_i16_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
; AVX512DQ-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero
; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm1
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[0,1,1,3,4,5,5,7]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,6,5,7,8,9,10,11,12,14,13,15]
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm1[2,3,0,1]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm3[0,1,1,3,4,5,5,7]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,5,7,4,6,8,9,10,11,13,15,12,14]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3],ymm2[4,5],ymm4[6],ymm2[7]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm0[3,1,2,3,7,5,6,7]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[2,0,3,1,4,5,6,7,10,8,11,9,12,13,14,15]
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm0[2,3,0,1]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm6 = ymm5[3,1,2,3,7,5,6,7]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm6 = ymm6[3,1,2,0,4,5,6,7,11,9,10,8,12,13,14,15]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm6[1],ymm4[2,3],ymm6[4],ymm4[5,6,7]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5],ymm2[6,7]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,6,4,7,5,8,9,10,11,14,12,15,13]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,1,2,0,4,5,6,4]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,7,5,6,4,8,9,10,11,15,13,14,12]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3],ymm1[4,5],ymm3[6],ymm1[7]
+; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm4
+; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm5
+; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm2[0,1,1,3,4,5,5,7]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,5,7,4,6,8,9,10,11,13,15,12,14]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm6 = ymm5[0,1,1,3,4,5,5,7]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,6,5,7,8,9,10,11,12,14,13,15]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2],ymm3[3],ymm6[4,5],ymm3[6],ymm6[7]
+; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[3,1,2,3,7,5,6,7]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[3,1,2,0,4,5,6,7,11,9,10,8,12,13,14,15]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm6 = ymm4[3,1,2,3,7,5,6,7]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm6 = ymm6[2,0,3,1,4,5,6,7,10,8,11,9,12,13,14,15]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0],ymm1[1],ymm6[2,3],ymm1[4],ymm6[5,6,7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5],ymm3[6,7]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,7,5,6,4,8,9,10,11,15,13,14,12]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm5[0,1,2,0,4,5,6,4]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,6,4,7,5,8,9,10,11,14,12,15,13]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7]
; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,2,1,3,4,5,6,7,8,10,9,11,12,13,14,15]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm5[0,2,2,3,4,6,6,7]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[1,3,0,2,4,5,6,7,9,11,8,10,12,13,14,15]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3],ymm3[4],ymm0[5,6,7]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,3,0,2,4,5,6,7,9,11,8,10,12,13,14,15]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm4[0,2,2,3,4,6,6,7]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,2,1,3,4,5,6,7,8,10,9,11,12,13,14,15]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2,3],ymm0[4],ymm3[5,6,7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7]
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rax)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-3.ll
index f9228707182f7..a2ebecd3e0f87 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-3.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-3.ll
@@ -95,121 +95,113 @@ define void @store_i32_stride3_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
;
; AVX512-LABEL: store_i32_stride3_vf2:
; AVX512: # %bb.0:
-; AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX512-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX512-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
-; AVX512-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,4,1,3,5,0,0]
-; AVX512-NEXT: vpermps %ymm0, %ymm1, %ymm0
-; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vmovlps %xmm1, 16(%rcx)
-; AVX512-NEXT: vmovaps %xmm0, (%rcx)
+; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX512-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
+; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,2,8,1,3,9,0,0]
+; AVX512-NEXT: vpermi2d %ymm0, %ymm1, %ymm2
+; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm0
+; AVX512-NEXT: vmovq %xmm0, 16(%rcx)
+; AVX512-NEXT: vmovdqa %xmm2, (%rcx)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; AVX512-FCP-LABEL: store_i32_stride3_vf2:
; AVX512-FCP: # %bb.0:
-; AVX512-FCP-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX512-FCP-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX512-FCP-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
-; AVX512-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512-FCP-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,4,1,3,5,0,0]
-; AVX512-FCP-NEXT: vpermps %ymm0, %ymm1, %ymm0
-; AVX512-FCP-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX512-FCP-NEXT: vmovlps %xmm1, 16(%rcx)
-; AVX512-FCP-NEXT: vmovaps %xmm0, (%rcx)
+; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
+; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,2,8,1,3,9,0,0]
+; AVX512-FCP-NEXT: vpermi2d %ymm0, %ymm1, %ymm2
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm0
+; AVX512-FCP-NEXT: vmovq %xmm0, 16(%rcx)
+; AVX512-FCP-NEXT: vmovdqa %xmm2, (%rcx)
; AVX512-FCP-NEXT: vzeroupper
; AVX512-FCP-NEXT: retq
;
; AVX512DQ-LABEL: store_i32_stride3_vf2:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX512DQ-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX512DQ-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
-; AVX512DQ-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512DQ-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,4,1,3,5,0,0]
-; AVX512DQ-NEXT: vpermps %ymm0, %ymm1, %ymm0
-; AVX512DQ-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX512DQ-NEXT: vmovlps %xmm1, 16(%rcx)
-; AVX512DQ-NEXT: vmovaps %xmm0, (%rcx)
+; AVX512DQ-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX512DQ-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX512DQ-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
+; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,2,8,1,3,9,0,0]
+; AVX512DQ-NEXT: vpermi2d %ymm0, %ymm1, %ymm2
+; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm0
+; AVX512DQ-NEXT: vmovq %xmm0, 16(%rcx)
+; AVX512DQ-NEXT: vmovdqa %xmm2, (%rcx)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512DQ-FCP-LABEL: store_i32_stride3_vf2:
; AVX512DQ-FCP: # %bb.0:
-; AVX512DQ-FCP-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX512DQ-FCP-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX512DQ-FCP-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
-; AVX512DQ-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512DQ-FCP-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,4,1,3,5,0,0]
-; AVX512DQ-FCP-NEXT: vpermps %ymm0, %ymm1, %ymm0
-; AVX512DQ-FCP-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX512DQ-FCP-NEXT: vmovlps %xmm1, 16(%rcx)
-; AVX512DQ-FCP-NEXT: vmovaps %xmm0, (%rcx)
+; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
+; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,2,8,1,3,9,0,0]
+; AVX512DQ-FCP-NEXT: vpermi2d %ymm0, %ymm1, %ymm2
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm0
+; AVX512DQ-FCP-NEXT: vmovq %xmm0, 16(%rcx)
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, (%rcx)
; AVX512DQ-FCP-NEXT: vzeroupper
; AVX512DQ-FCP-NEXT: retq
;
; AVX512BW-LABEL: store_i32_stride3_vf2:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX512BW-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX512BW-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
-; AVX512BW-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512BW-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,4,1,3,5,0,0]
-; AVX512BW-NEXT: vpermps %ymm0, %ymm1, %ymm0
-; AVX512BW-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX512BW-NEXT: vmovlps %xmm1, 16(%rcx)
-; AVX512BW-NEXT: vmovaps %xmm0, (%rcx)
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
+; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,2,8,1,3,9,0,0]
+; AVX512BW-NEXT: vpermi2d %ymm0, %ymm1, %ymm2
+; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm0
+; AVX512BW-NEXT: vmovq %xmm0, 16(%rcx)
+; AVX512BW-NEXT: vmovdqa %xmm2, (%rcx)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BW-FCP-LABEL: store_i32_stride3_vf2:
; AVX512BW-FCP: # %bb.0:
-; AVX512BW-FCP-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX512BW-FCP-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX512BW-FCP-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
-; AVX512BW-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512BW-FCP-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,4,1,3,5,0,0]
-; AVX512BW-FCP-NEXT: vpermps %ymm0, %ymm1, %ymm0
-; AVX512BW-FCP-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX512BW-FCP-NEXT: vmovlps %xmm1, 16(%rcx)
-; AVX512BW-FCP-NEXT: vmovaps %xmm0, (%rcx)
+; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
+; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,2,8,1,3,9,0,0]
+; AVX512BW-FCP-NEXT: vpermi2d %ymm0, %ymm1, %ymm2
+; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm0
+; AVX512BW-FCP-NEXT: vmovq %xmm0, 16(%rcx)
+; AVX512BW-FCP-NEXT: vmovdqa %xmm2, (%rcx)
; AVX512BW-FCP-NEXT: vzeroupper
; AVX512BW-FCP-NEXT: retq
;
; AVX512DQ-BW-LABEL: store_i32_stride3_vf2:
; AVX512DQ-BW: # %bb.0:
-; AVX512DQ-BW-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX512DQ-BW-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX512DQ-BW-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
-; AVX512DQ-BW-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512DQ-BW-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,4,1,3,5,0,0]
-; AVX512DQ-BW-NEXT: vpermps %ymm0, %ymm1, %ymm0
-; AVX512DQ-BW-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX512DQ-BW-NEXT: vmovlps %xmm1, 16(%rcx)
-; AVX512DQ-BW-NEXT: vmovaps %xmm0, (%rcx)
+; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
+; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,2,8,1,3,9,0,0]
+; AVX512DQ-BW-NEXT: vpermi2d %ymm0, %ymm1, %ymm2
+; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm2, %xmm0
+; AVX512DQ-BW-NEXT: vmovq %xmm0, 16(%rcx)
+; AVX512DQ-BW-NEXT: vmovdqa %xmm2, (%rcx)
; AVX512DQ-BW-NEXT: vzeroupper
; AVX512DQ-BW-NEXT: retq
;
; AVX512DQ-BW-FCP-LABEL: store_i32_stride3_vf2:
; AVX512DQ-BW-FCP: # %bb.0:
-; AVX512DQ-BW-FCP-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX512DQ-BW-FCP-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX512DQ-BW-FCP-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
-; AVX512DQ-BW-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512DQ-BW-FCP-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,4,1,3,5,0,0]
-; AVX512DQ-BW-FCP-NEXT: vpermps %ymm0, %ymm1, %ymm0
-; AVX512DQ-BW-FCP-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm1, 16(%rcx)
-; AVX512DQ-BW-FCP-NEXT: vmovaps %xmm0, (%rcx)
+; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
+; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,2,8,1,3,9,0,0]
+; AVX512DQ-BW-FCP-NEXT: vpermi2d %ymm0, %ymm1, %ymm2
+; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm0
+; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, 16(%rcx)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm2, (%rcx)
; AVX512DQ-BW-FCP-NEXT: vzeroupper
; AVX512DQ-BW-FCP-NEXT: retq
%in.vec0 = load <2 x i32>, ptr %in.vecptr0, align 64
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-4.ll
index 22040e0cdb791..c58352e503ae1 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-4.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-4.ll
@@ -110,16 +110,15 @@ define void @store_i32_stride4_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
;
; AVX512-FCP-LABEL: store_i32_stride4_vf2:
; AVX512-FCP: # %bb.0:
-; AVX512-FCP-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX512-FCP-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX512-FCP-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512-FCP-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX512-FCP-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
-; AVX512-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,4,6,1,3,5,7]
-; AVX512-FCP-NEXT: vpermps %ymm0, %ymm1, %ymm0
-; AVX512-FCP-NEXT: vmovaps %ymm0, (%r8)
+; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
+; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,2,8,10,1,3,9,11]
+; AVX512-FCP-NEXT: vpermi2d %ymm1, %ymm0, %ymm2
+; AVX512-FCP-NEXT: vmovdqa %ymm2, (%r8)
; AVX512-FCP-NEXT: vzeroupper
; AVX512-FCP-NEXT: retq
;
@@ -140,16 +139,15 @@ define void @store_i32_stride4_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
;
; AVX512DQ-FCP-LABEL: store_i32_stride4_vf2:
; AVX512DQ-FCP: # %bb.0:
-; AVX512DQ-FCP-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX512DQ-FCP-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX512DQ-FCP-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512DQ-FCP-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX512DQ-FCP-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
-; AVX512DQ-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512DQ-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,4,6,1,3,5,7]
-; AVX512DQ-FCP-NEXT: vpermps %ymm0, %ymm1, %ymm0
-; AVX512DQ-FCP-NEXT: vmovaps %ymm0, (%r8)
+; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
+; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,2,8,10,1,3,9,11]
+; AVX512DQ-FCP-NEXT: vpermi2d %ymm1, %ymm0, %ymm2
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, (%r8)
; AVX512DQ-FCP-NEXT: vzeroupper
; AVX512DQ-FCP-NEXT: retq
;
@@ -170,16 +168,15 @@ define void @store_i32_stride4_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
;
; AVX512BW-FCP-LABEL: store_i32_stride4_vf2:
; AVX512BW-FCP: # %bb.0:
-; AVX512BW-FCP-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX512BW-FCP-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX512BW-FCP-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512BW-FCP-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX512BW-FCP-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
-; AVX512BW-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512BW-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,4,6,1,3,5,7]
-; AVX512BW-FCP-NEXT: vpermps %ymm0, %ymm1, %ymm0
-; AVX512BW-FCP-NEXT: vmovaps %ymm0, (%r8)
+; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
+; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,2,8,10,1,3,9,11]
+; AVX512BW-FCP-NEXT: vpermi2d %ymm1, %ymm0, %ymm2
+; AVX512BW-FCP-NEXT: vmovdqa %ymm2, (%r8)
; AVX512BW-FCP-NEXT: vzeroupper
; AVX512BW-FCP-NEXT: retq
;
@@ -200,16 +197,15 @@ define void @store_i32_stride4_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
;
; AVX512DQ-BW-FCP-LABEL: store_i32_stride4_vf2:
; AVX512DQ-BW-FCP: # %bb.0:
-; AVX512DQ-BW-FCP-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX512DQ-BW-FCP-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX512DQ-BW-FCP-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512DQ-BW-FCP-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX512DQ-BW-FCP-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
-; AVX512DQ-BW-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512DQ-BW-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,4,6,1,3,5,7]
-; AVX512DQ-BW-FCP-NEXT: vpermps %ymm0, %ymm1, %ymm0
-; AVX512DQ-BW-FCP-NEXT: vmovaps %ymm0, (%r8)
+; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
+; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,2,8,10,1,3,9,11]
+; AVX512DQ-BW-FCP-NEXT: vpermi2d %ymm1, %ymm0, %ymm2
+; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm2, (%r8)
; AVX512DQ-BW-FCP-NEXT: vzeroupper
; AVX512DQ-BW-FCP-NEXT: retq
%in.vec0 = load <2 x i32>, ptr %in.vecptr0, align 64
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll
index 3d26171054f2e..ba1621c67f480 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll
@@ -384,10 +384,10 @@ define void @store_i8_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX2-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,8],zero,ymm0[1,9],zero,ymm0[2,10],zero,ymm0[3,11],zero,ymm0[4,12],zero,ymm0[5],zero,ymm0[21],zero,zero,ymm0[22],zero,zero,ymm0[23],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm2
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,ymm0[0],zero,zero,ymm0[1],zero,zero,ymm0[2],zero,zero,ymm0[3],zero,zero,ymm0[4],zero,ymm0[29],zero,ymm0[22,30],zero,ymm0[23,31],zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[0,8],zero,ymm2[1,9],zero,ymm2[2,10],zero,ymm2[3,11],zero,ymm2[4,12],zero,ymm2[5],zero,ymm2[21],zero,zero,ymm2[22],zero,zero,ymm2[23],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vmovq %xmm1, 16(%rcx)
@@ -401,10 +401,10 @@ define void @store_i8_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
; AVX2-FP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX2-FP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX2-FP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,8],zero,ymm0[1,9],zero,ymm0[2,10],zero,ymm0[3,11],zero,ymm0[4,12],zero,ymm0[5],zero,ymm0[21],zero,zero,ymm0[22],zero,zero,ymm0[23],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm2
+; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,ymm0[0],zero,zero,ymm0[1],zero,zero,ymm0[2],zero,zero,ymm0[3],zero,zero,ymm0[4],zero,ymm0[29],zero,ymm0[22,30],zero,ymm0[23,31],zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[0,8],zero,ymm2[1,9],zero,ymm2[2,10],zero,ymm2[3,11],zero,ymm2[4,12],zero,ymm2[5],zero,ymm2[21],zero,zero,ymm2[22],zero,zero,ymm2[23],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-FP-NEXT: vpor %ymm0, %ymm1, %ymm0
; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-FP-NEXT: vmovq %xmm1, 16(%rcx)
@@ -418,10 +418,10 @@ define void @store_i8_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX2-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,8],zero,ymm0[1,9],zero,ymm0[2,10],zero,ymm0[3,11],zero,ymm0[4,12],zero,ymm0[5],zero,ymm0[21],zero,zero,ymm0[22],zero,zero,ymm0[23],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm2
+; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,ymm0[0],zero,zero,ymm0[1],zero,zero,ymm0[2],zero,zero,ymm0[3],zero,zero,ymm0[4],zero,ymm0[29],zero,ymm0[22,30],zero,ymm0[23,31],zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[0,8],zero,ymm2[1,9],zero,ymm2[2,10],zero,ymm2[3,11],zero,ymm2[4,12],zero,ymm2[5],zero,ymm2[21],zero,zero,ymm2[22],zero,zero,ymm2[23],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0
; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-FCP-NEXT: vmovq %xmm1, 16(%rcx)
@@ -435,10 +435,10 @@ define void @store_i8_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,8],zero,ymm0[1,9],zero,ymm0[2,10],zero,ymm0[3,11],zero,ymm0[4,12],zero,ymm0[5],zero,ymm0[21],zero,zero,ymm0[22],zero,zero,ymm0[23,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm2
+; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,ymm0[0],zero,zero,ymm0[1],zero,zero,ymm0[2],zero,zero,ymm0[3],zero,zero,ymm0[4],zero,ymm0[29],zero,ymm0[22,30],zero,ymm0[23,31],zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[0,8],zero,ymm2[1,9],zero,ymm2[2,10],zero,ymm2[3,11],zero,ymm2[4,12],zero,ymm2[5],zero,ymm2[21],zero,zero,ymm2[22],zero,zero,ymm2[23,u,u,u,u,u,u,u,u]
; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vmovq %xmm1, 16(%rcx)
@@ -452,10 +452,10 @@ define void @store_i8_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,8],zero,ymm0[1,9],zero,ymm0[2,10],zero,ymm0[3,11],zero,ymm0[4,12],zero,ymm0[5],zero,ymm0[21],zero,zero,ymm0[22],zero,zero,ymm0[23,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm2
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,ymm0[0],zero,zero,ymm0[1],zero,zero,ymm0[2],zero,zero,ymm0[3],zero,zero,ymm0[4],zero,ymm0[29],zero,ymm0[22,30],zero,ymm0[23,31],zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[0,8],zero,ymm2[1,9],zero,ymm2[2,10],zero,ymm2[3,11],zero,ymm2[4,12],zero,ymm2[5],zero,ymm2[21],zero,zero,ymm2[22],zero,zero,ymm2[23,u,u,u,u,u,u,u,u]
; AVX512-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0
; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-FCP-NEXT: vmovq %xmm1, 16(%rcx)
@@ -469,10 +469,10 @@ define void @store_i8_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
; AVX512DQ-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512DQ-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,8],zero,ymm0[1,9],zero,ymm0[2,10],zero,ymm0[3,11],zero,ymm0[4,12],zero,ymm0[5],zero,ymm0[21],zero,zero,ymm0[22],zero,zero,ymm0[23,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm2
+; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,ymm0[0],zero,zero,ymm0[1],zero,zero,ymm0[2],zero,zero,ymm0[3],zero,zero,ymm0[4],zero,ymm0[29],zero,ymm0[22,30],zero,ymm0[23,31],zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[0,8],zero,ymm2[1,9],zero,ymm2[2,10],zero,ymm2[3,11],zero,ymm2[4,12],zero,ymm2[5],zero,ymm2[21],zero,zero,ymm2[22],zero,zero,ymm2[23,u,u,u,u,u,u,u,u]
; AVX512DQ-NEXT: vpor %ymm0, %ymm1, %ymm0
; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512DQ-NEXT: vmovq %xmm1, 16(%rcx)
@@ -486,10 +486,10 @@ define void @store_i8_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,8],zero,ymm0[1,9],zero,ymm0[2,10],zero,ymm0[3,11],zero,ymm0[4,12],zero,ymm0[5],zero,ymm0[21],zero,zero,ymm0[22],zero,zero,ymm0[23,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm2
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,ymm0[0],zero,zero,ymm0[1],zero,zero,ymm0[2],zero,zero,ymm0[3],zero,zero,ymm0[4],zero,ymm0[29],zero,ymm0[22,30],zero,ymm0[23,31],zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[0,8],zero,ymm2[1,9],zero,ymm2[2,10],zero,ymm2[3,11],zero,ymm2[4,12],zero,ymm2[5],zero,ymm2[21],zero,zero,ymm2[22],zero,zero,ymm2[23,u,u,u,u,u,u,u,u]
; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0
; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512DQ-FCP-NEXT: vmovq %xmm1, 16(%rcx)
@@ -503,10 +503,10 @@ define void @store_i8_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
; AVX512BW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512BW-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,8],zero,ymm0[1,9],zero,ymm0[2,10],zero,ymm0[3,11],zero,ymm0[4,12],zero,ymm0[5],zero,ymm0[21],zero,zero,ymm0[22],zero,zero,ymm0[23],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm2
+; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,ymm0[0],zero,zero,ymm0[1],zero,zero,ymm0[2],zero,zero,ymm0[3],zero,zero,ymm0[4],zero,ymm0[29],zero,ymm0[22,30],zero,ymm0[23,31],zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[0,8],zero,ymm2[1,9],zero,ymm2[2,10],zero,ymm2[3,11],zero,ymm2[4,12],zero,ymm2[5],zero,ymm2[21],zero,zero,ymm2[22],zero,zero,ymm2[23],zero,zero,zero,zero,zero,zero,zero,zero
; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT: vmovq %xmm1, 16(%rcx)
@@ -520,10 +520,10 @@ define void @store_i8_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,8],zero,ymm0[1,9],zero,ymm0[2,10],zero,ymm0[3,11],zero,ymm0[4,12],zero,ymm0[5],zero,ymm0[21],zero,zero,ymm0[22],zero,zero,ymm0[23],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm2
+; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,ymm0[0],zero,zero,ymm0[1],zero,zero,ymm0[2],zero,zero,ymm0[3],zero,zero,ymm0[4],zero,ymm0[29],zero,ymm0[22,30],zero,ymm0[23,31],zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[0,8],zero,ymm2[1,9],zero,ymm2[2,10],zero,ymm2[3,11],zero,ymm2[4,12],zero,ymm2[5],zero,ymm2[21],zero,zero,ymm2[22],zero,zero,ymm2[23],zero,zero,zero,zero,zero,zero,zero,zero
; AVX512BW-FCP-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BW-FCP-NEXT: vmovq %xmm1, 16(%rcx)
@@ -537,10 +537,10 @@ define void @store_i8_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,8],zero,ymm0[1,9],zero,ymm0[2,10],zero,ymm0[3,11],zero,ymm0[4,12],zero,ymm0[5],zero,ymm0[21],zero,zero,ymm0[22],zero,zero,ymm0[23],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm2
+; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,ymm0[0],zero,zero,ymm0[1],zero,zero,ymm0[2],zero,zero,ymm0[3],zero,zero,ymm0[4],zero,ymm0[29],zero,ymm0[22,30],zero,ymm0[23,31],zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[0,8],zero,ymm2[1,9],zero,ymm2[2,10],zero,ymm2[3,11],zero,ymm2[4,12],zero,ymm2[5],zero,ymm2[21],zero,zero,ymm2[22],zero,zero,ymm2[23],zero,zero,zero,zero,zero,zero,zero,zero
; AVX512DQ-BW-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512DQ-BW-NEXT: vmovq %xmm1, 16(%rcx)
@@ -554,10 +554,10 @@ define void @store_i8_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,8],zero,ymm0[1,9],zero,ymm0[2,10],zero,ymm0[3,11],zero,ymm0[4,12],zero,ymm0[5],zero,ymm0[21],zero,zero,ymm0[22],zero,zero,ymm0[23],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm2
+; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,ymm0[0],zero,zero,ymm0[1],zero,zero,ymm0[2],zero,zero,ymm0[3],zero,zero,ymm0[4],zero,ymm0[29],zero,ymm0[22,30],zero,ymm0[23,31],zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[0,8],zero,ymm2[1,9],zero,ymm2[2,10],zero,ymm2[3,11],zero,ymm2[4,12],zero,ymm2[5],zero,ymm2[21],zero,zero,ymm2[22],zero,zero,ymm2[23],zero,zero,zero,zero,zero,zero,zero,zero
; AVX512DQ-BW-FCP-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512DQ-BW-FCP-NEXT: vmovq %xmm1, 16(%rcx)
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll
index ab968b91153a9..be83db26aa7ed 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll
@@ -912,24 +912,24 @@ define void @store_i8_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
; AVX2-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
; AVX2-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
-; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
-; AVX2-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm4
+; AVX2-NEXT: vmovq {{.*#+}} xmm5 = mem[0],zero
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[2],ymm5[2]
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,2,1,3]
-; AVX2-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,ymm2[5,13],zero,zero,zero,zero,zero,ymm2[6,14],zero,zero,zero,zero,zero,zero,zero,ymm2[23,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1]
-; AVX2-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,ymm4[5,13],zero,zero,zero,zero,zero,ymm4[6,14],zero,zero,zero,ymm4[23,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX2-NEXT: vpor %ymm5, %ymm3, %ymm3
+; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2
+; AVX2-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,ymm2[5,13],zero,zero,zero,zero,zero,ymm2[6,14],zero,zero,zero,ymm2[23,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,ymm4[5,13],zero,zero,zero,zero,zero,ymm4[6,14],zero,zero,zero,zero,zero,zero,zero,ymm4[23,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpor %ymm3, %ymm5, %ymm3
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,0,2]
; AVX2-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,ymm0[4],zero,zero,zero,zero,zero,zero,ymm0[5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[23,31],zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vpshufb {{.*#+}} ymm6 = ymm1[4,12],zero,zero,zero,zero,zero,ymm1[5,13],zero,zero,zero,zero,zero,ymm1[6,14,22],zero,zero,zero,zero,zero,zero,ymm1[23],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vpor %ymm5, %ymm6, %ymm5
; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpblendvb %ymm6, %ymm3, %ymm5, %ymm3
-; AVX2-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,8],zero,zero,zero,zero,zero,ymm2[1,9],zero,zero,zero,zero,zero,ymm2[2,10,18,26],zero,zero,zero,zero,zero,ymm2[19,27],zero,zero,zero,zero,zero,ymm2[20,28]
-; AVX2-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,ymm4[0,8],zero,zero,zero,zero,zero,ymm4[1,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[19,27],zero,zero,zero,zero,zero,ymm4[20,28],zero,zero
-; AVX2-NEXT: vpor %ymm4, %ymm2, %ymm2
+; AVX2-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8],zero,zero,zero,zero,zero,ymm2[1,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[19,27],zero,zero,zero,zero,zero,ymm2[20,28],zero,zero
+; AVX2-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[0,8],zero,zero,zero,zero,zero,ymm4[1,9],zero,zero,zero,zero,zero,ymm4[2,10,18,26],zero,zero,zero,zero,zero,ymm4[19,27],zero,zero,zero,zero,zero,ymm4[20,28]
+; AVX2-NEXT: vpor %ymm2, %ymm4, %ymm2
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,ymm0[0],zero,zero,zero,zero,zero,zero,ymm0[1],zero,zero,zero,zero,ymm0[18,26],zero,zero,zero,zero,zero,ymm0[19,27],zero,zero,zero,zero,zero
; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,ymm1[0,8],zero,zero,zero,zero,zero,ymm1[1,9],zero,zero,zero,zero,zero,zero,zero,ymm1[18],zero,zero,zero,zero,zero,zero,ymm1[19],zero,zero,zero,zero
; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0
@@ -954,24 +954,24 @@ define void @store_i8_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
; AVX2-FP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
; AVX2-FP-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero
; AVX2-FP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
-; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
-; AVX2-FP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
+; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm4
+; AVX2-FP-NEXT: vmovq {{.*#+}} xmm5 = mem[0],zero
; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-FP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
+; AVX2-FP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[2],ymm5[2]
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,2,1,3]
-; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,ymm2[5,13],zero,zero,zero,zero,zero,ymm2[6,14],zero,zero,zero,zero,zero,zero,zero,ymm2[23,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1]
-; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,ymm4[5,13],zero,zero,zero,zero,zero,ymm4[6,14],zero,zero,zero,ymm4[23,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX2-FP-NEXT: vpor %ymm5, %ymm3, %ymm3
+; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2
+; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,ymm2[5,13],zero,zero,zero,zero,zero,ymm2[6,14],zero,zero,zero,ymm2[23,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,ymm4[5,13],zero,zero,zero,zero,zero,ymm4[6,14],zero,zero,zero,zero,zero,zero,zero,ymm4[23,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-FP-NEXT: vpor %ymm3, %ymm5, %ymm3
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,0,2]
; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,ymm0[4],zero,zero,zero,zero,zero,zero,ymm0[5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[23,31],zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm6 = ymm1[4,12],zero,zero,zero,zero,zero,ymm1[5,13],zero,zero,zero,zero,zero,ymm1[6,14,22],zero,zero,zero,zero,zero,zero,ymm1[23],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-FP-NEXT: vpor %ymm5, %ymm6, %ymm5
; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,u,u,u,u,u,u,u,u]
; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm3, %ymm5, %ymm3
-; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,8],zero,zero,zero,zero,zero,ymm2[1,9],zero,zero,zero,zero,zero,ymm2[2,10,18,26],zero,zero,zero,zero,zero,ymm2[19,27],zero,zero,zero,zero,zero,ymm2[20,28]
-; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,ymm4[0,8],zero,zero,zero,zero,zero,ymm4[1,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[19,27],zero,zero,zero,zero,zero,ymm4[20,28],zero,zero
-; AVX2-FP-NEXT: vpor %ymm4, %ymm2, %ymm2
+; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8],zero,zero,zero,zero,zero,ymm2[1,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[19,27],zero,zero,zero,zero,zero,ymm2[20,28],zero,zero
+; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[0,8],zero,zero,zero,zero,zero,ymm4[1,9],zero,zero,zero,zero,zero,ymm4[2,10,18,26],zero,zero,zero,zero,zero,ymm4[19,27],zero,zero,zero,zero,zero,ymm4[20,28]
+; AVX2-FP-NEXT: vpor %ymm2, %ymm4, %ymm2
; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,ymm0[0],zero,zero,zero,zero,zero,zero,ymm0[1],zero,zero,zero,zero,ymm0[18,26],zero,zero,zero,zero,zero,ymm0[19,27],zero,zero,zero,zero,zero
; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,ymm1[0,8],zero,zero,zero,zero,zero,ymm1[1,9],zero,zero,zero,zero,zero,zero,zero,ymm1[18],zero,zero,zero,zero,zero,zero,ymm1[19],zero,zero,zero,zero
; AVX2-FP-NEXT: vpor %ymm0, %ymm1, %ymm0
@@ -996,22 +996,22 @@ define void @store_i8_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero
; AVX2-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
-; AVX2-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
-; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
+; AVX2-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm4
+; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm5 = mem[0],zero
; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
+; AVX2-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[2],ymm5[2]
; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [1,3,5,7,1,3,5,7]
; AVX2-FCP-NEXT: # ymm1 = mem[0,1,0,1]
-; AVX2-FCP-NEXT: vpermd %ymm2, %ymm1, %ymm1
+; AVX2-FCP-NEXT: vpermd %ymm4, %ymm1, %ymm1
; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,ymm1[1,5,9,13],zero,zero,zero,ymm1[2,6,10,14],zero,zero,zero,ymm1[19,23,27,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,3,5,0,5,1,3,0]
-; AVX2-FCP-NEXT: vpermd %ymm0, %ymm3, %ymm3
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,4,8],zero,zero,zero,zero,ymm3[1,5,9],zero,zero,zero,zero,ymm3[2,6,18],zero,zero,zero,zero,ymm3[23,27,19],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX2-FCP-NEXT: vpor %ymm3, %ymm1, %ymm1
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm2[0,8],zero,zero,zero,zero,zero,ymm2[1,9],zero,zero,zero,zero,zero,ymm2[2,10,18,26],zero,zero,zero,zero,zero,ymm2[19,27],zero,zero,zero,zero,zero,ymm2[20,28]
-; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1]
+; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [1,3,5,0,5,1,3,0]
+; AVX2-FCP-NEXT: vpermd %ymm0, %ymm5, %ymm5
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[0,4,8],zero,zero,zero,zero,ymm5[1,5,9],zero,zero,zero,zero,ymm5[2,6,18],zero,zero,zero,zero,ymm5[23,27,19],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-FCP-NEXT: vpor %ymm5, %ymm1, %ymm1
+; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2
; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8],zero,zero,zero,zero,zero,ymm2[1,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[19,27],zero,zero,zero,zero,zero,ymm2[20,28],zero,zero
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm4[0,8],zero,zero,zero,zero,zero,ymm4[1,9],zero,zero,zero,zero,zero,ymm4[2,10,18,26],zero,zero,zero,zero,zero,ymm4[19,27],zero,zero,zero,zero,zero,ymm4[20,28]
; AVX2-FCP-NEXT: vpor %ymm2, %ymm3, %ymm2
; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,2,4,6,0,2,4,6]
; AVX2-FCP-NEXT: # ymm3 = mem[0,1,0,1]
@@ -1038,18 +1038,18 @@ define void @store_i8_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
; AVX512-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
; AVX512-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero
; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
-; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
-; AVX512-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
+; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm4
+; AVX512-NEXT: vmovq {{.*#+}} xmm5 = mem[0],zero
; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
+; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[2],ymm5[2]
; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,2,1,3]
-; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = ymm2[0,8],zero,zero,ymm2[u,u,u,1,9],zero,zero,ymm2[u,u,u,2,10,18,26,u,u,u],zero,zero,ymm2[19,27,u,u,u],zero,zero,ymm2[20,28]
-; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[u,u,u,5,13],zero,zero,ymm2[u,u,u,6,14],zero,zero,ymm2[u,u,u],zero,zero,ymm2[23,31,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3
-; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1]
-; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,ymm2[0,8,u,u,u],zero,zero,ymm2[1,9,u,u,u],zero,zero,zero,zero,ymm2[u,u,u,19,27],zero,zero,ymm2[u,u,u,20,28],zero,zero
+; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2
+; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,ymm2[0,8,u,u,u],zero,zero,ymm2[1,9,u,u,u],zero,zero,zero,zero,ymm2[u,u,u,19,27],zero,zero,ymm2[u,u,u,20,28],zero,zero
; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u],zero,zero,ymm2[5,13,u,u,u],zero,zero,ymm2[6,14,u,u,u,23,31],zero,zero,ymm2[u,u,u],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm4, %zmm2
+; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = ymm4[0,8],zero,zero,ymm4[u,u,u,1,9],zero,zero,ymm4[u,u,u,2,10,18,26,u,u,u],zero,zero,ymm4[19,27,u,u,u],zero,zero,ymm4[20,28]
+; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,5,13],zero,zero,ymm4[u,u,u,6,14],zero,zero,ymm4[u,u,u],zero,zero,ymm4[23,31,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3
; AVX512-NEXT: vporq %zmm2, %zmm3, %zmm2
; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,0,2]
; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = ymm0[u,u,u,u],zero,zero,ymm0[0,u,u,u,u],zero,zero,ymm0[1,u,u,u,u,18,26],zero,ymm0[u,u,u,u,19,27],zero,ymm0[u,u,u,u]
@@ -1079,28 +1079,28 @@ define void @store_i8_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero
; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
-; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm4
+; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm5 = mem[0],zero
; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,2,4,0]
-; AVX512-FCP-NEXT: vpermi2q %ymm3, %ymm0, %ymm1
+; AVX512-FCP-NEXT: vpermi2q %ymm5, %ymm0, %ymm1
; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,2,4,6,0,2,4,6]
; AVX512-FCP-NEXT: # ymm0 = mem[0,1,0,1]
; AVX512-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm0
; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0,4,8],zero,zero,zero,zero,ymm0[1,5,9],zero,zero,zero,zero,ymm0[18,22,26],zero,zero,zero,zero,ymm0[19,23,27],zero,zero,zero,zero
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,3,5,0,5,1,3,0]
-; AVX512-FCP-NEXT: vpermd %ymm1, %ymm3, %ymm1
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [1,3,5,0,5,1,3,0]
+; AVX512-FCP-NEXT: vpermd %ymm1, %ymm5, %ymm1
; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,4,8],zero,zero,zero,zero,ymm1[1,5,9],zero,zero,zero,zero,ymm1[2,6,18],zero,zero,zero,zero,ymm1[23,27,19],zero,zero,zero,zero,zero,zero,zero,zero
; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[0,8],zero,zero,ymm2[u,u,u,1,9],zero,zero,ymm2[u,u,u,2,10,18,26,u,u,u],zero,zero,ymm2[19,27,u,u,u],zero,zero,ymm2[20,28]
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,ymm3[0,8,u,u,u],zero,zero,ymm3[1,9,u,u,u],zero,zero,zero,zero,ymm3[u,u,u,19,27],zero,zero,ymm3[u,u,u,20,28],zero,zero
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = mem & (ymm3 | ymm1)
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[0,8],zero,zero,ymm4[u,u,u,1,9],zero,zero,ymm4[u,u,u,2,10,18,26,u,u,u],zero,zero,ymm4[19,27,u,u,u],zero,zero,ymm4[20,28]
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8,u,u,u],zero,zero,ymm2[1,9,u,u,u],zero,zero,zero,zero,ymm2[u,u,u,19,27],zero,zero,ymm2[u,u,u,20,28],zero,zero
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = mem & (ymm2 | ymm1)
; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [1,3,5,7,1,3,5,7]
; AVX512-FCP-NEXT: # ymm1 = mem[0,1,0,1]
-; AVX512-FCP-NEXT: vpermd %ymm2, %ymm1, %ymm1
+; AVX512-FCP-NEXT: vpermd %ymm4, %ymm1, %ymm1
; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,ymm1[1,5,9,13],zero,zero,zero,ymm1[2,6,10,14],zero,zero,zero,ymm1[19,23,27,31],zero,zero,zero,ymm1[u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1
; AVX512-FCP-NEXT: vporq %zmm0, %zmm1, %zmm0
; AVX512-FCP-NEXT: vextracti32x4 $2, %zmm0, 32(%rax)
; AVX512-FCP-NEXT: vextracti32x4 $3, %zmm0, %xmm1
@@ -1121,18 +1121,18 @@ define void @store_i8_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
; AVX512DQ-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
; AVX512DQ-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero
; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
-; AVX512DQ-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
+; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm4
+; AVX512DQ-NEXT: vmovq {{.*#+}} xmm5 = mem[0],zero
; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
+; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[2],ymm5[2]
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,2,1,3]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = ymm2[0,8],zero,zero,ymm2[u,u,u,1,9],zero,zero,ymm2[u,u,u,2,10,18,26,u,u,u],zero,zero,ymm2[19,27,u,u,u],zero,zero,ymm2[20,28]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[u,u,u,5,13],zero,zero,ymm2[u,u,u,6,14],zero,zero,ymm2[u,u,u],zero,zero,ymm2[23,31,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,ymm2[0,8,u,u,u],zero,zero,ymm2[1,9,u,u,u],zero,zero,zero,zero,ymm2[u,u,u,19,27],zero,zero,ymm2[u,u,u,20,28],zero,zero
+; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,ymm2[0,8,u,u,u],zero,zero,ymm2[1,9,u,u,u],zero,zero,zero,zero,ymm2[u,u,u,19,27],zero,zero,ymm2[u,u,u,20,28],zero,zero
; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u],zero,zero,ymm2[5,13,u,u,u],zero,zero,ymm2[6,14,u,u,u,23,31],zero,zero,ymm2[u,u,u],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm4, %zmm2
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = ymm4[0,8],zero,zero,ymm4[u,u,u,1,9],zero,zero,ymm4[u,u,u,2,10,18,26,u,u,u],zero,zero,ymm4[19,27,u,u,u],zero,zero,ymm4[20,28]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,5,13],zero,zero,ymm4[u,u,u,6,14],zero,zero,ymm4[u,u,u],zero,zero,ymm4[23,31,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3
; AVX512DQ-NEXT: vporq %zmm2, %zmm3, %zmm2
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,0,2]
; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = ymm0[u,u,u,u],zero,zero,ymm0[0,u,u,u,u],zero,zero,ymm0[1,u,u,u,u,18,26],zero,ymm0[u,u,u,u,19,27],zero,ymm0[u,u,u,u]
@@ -1162,28 +1162,28 @@ define void @store_i8_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero
; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
-; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm4
+; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm5 = mem[0],zero
; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,2,4,0]
-; AVX512DQ-FCP-NEXT: vpermi2q %ymm3, %ymm0, %ymm1
+; AVX512DQ-FCP-NEXT: vpermi2q %ymm5, %ymm0, %ymm1
; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,2,4,6,0,2,4,6]
; AVX512DQ-FCP-NEXT: # ymm0 = mem[0,1,0,1]
; AVX512DQ-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm0
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0,4,8],zero,zero,zero,zero,ymm0[1,5,9],zero,zero,zero,zero,ymm0[18,22,26],zero,zero,zero,zero,ymm0[19,23,27],zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,3,5,0,5,1,3,0]
-; AVX512DQ-FCP-NEXT: vpermd %ymm1, %ymm3, %ymm1
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [1,3,5,0,5,1,3,0]
+; AVX512DQ-FCP-NEXT: vpermd %ymm1, %ymm5, %ymm1
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,4,8],zero,zero,zero,zero,ymm1[1,5,9],zero,zero,zero,zero,ymm1[2,6,18],zero,zero,zero,zero,ymm1[23,27,19],zero,zero,zero,zero,zero,zero,zero,zero
; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[0,8],zero,zero,ymm2[u,u,u,1,9],zero,zero,ymm2[u,u,u,2,10,18,26,u,u,u],zero,zero,ymm2[19,27,u,u,u],zero,zero,ymm2[20,28]
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,ymm3[0,8,u,u,u],zero,zero,ymm3[1,9,u,u,u],zero,zero,zero,zero,ymm3[u,u,u,19,27],zero,zero,ymm3[u,u,u,20,28],zero,zero
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = mem & (ymm3 | ymm1)
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[0,8],zero,zero,ymm4[u,u,u,1,9],zero,zero,ymm4[u,u,u,2,10,18,26,u,u,u],zero,zero,ymm4[19,27,u,u,u],zero,zero,ymm4[20,28]
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8,u,u,u],zero,zero,ymm2[1,9,u,u,u],zero,zero,zero,zero,ymm2[u,u,u,19,27],zero,zero,ymm2[u,u,u,20,28],zero,zero
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = mem & (ymm2 | ymm1)
; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [1,3,5,7,1,3,5,7]
; AVX512DQ-FCP-NEXT: # ymm1 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT: vpermd %ymm2, %ymm1, %ymm1
+; AVX512DQ-FCP-NEXT: vpermd %ymm4, %ymm1, %ymm1
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,ymm1[1,5,9,13],zero,zero,zero,ymm1[2,6,10,14],zero,zero,zero,ymm1[19,23,27,31],zero,zero,zero,ymm1[u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1
; AVX512DQ-FCP-NEXT: vporq %zmm0, %zmm1, %zmm0
; AVX512DQ-FCP-NEXT: vextracti32x4 $2, %zmm0, 32(%rax)
; AVX512DQ-FCP-NEXT: vextracti32x4 $3, %zmm0, %xmm1
@@ -1241,33 +1241,33 @@ define void @store_i8_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero
; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
-; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
-; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
+; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm4
+; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = mem[0],zero
; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,2,4,0]
-; AVX512BW-FCP-NEXT: vpermi2q %ymm3, %ymm0, %ymm1
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[0,8],zero,zero,zero,zero,zero,ymm2[1,9],zero,zero,zero,zero,zero,ymm2[2,10,18,26],zero,zero,zero,zero,zero,ymm2[19,27],zero,zero,zero,zero,zero,ymm2[20,28]
-; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,ymm3[0,8],zero,zero,zero,zero,zero,ymm3[1,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[19,27],zero,zero,zero,zero,zero,ymm3[20,28],zero,zero
-; AVX512BW-FCP-NEXT: vpor %ymm0, %ymm3, %ymm0
+; AVX512BW-FCP-NEXT: vpermi2q %ymm5, %ymm0, %ymm1
+; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [1,3,5,7,1,3,5,7]
+; AVX512BW-FCP-NEXT: # ymm0 = mem[0,1,0,1]
+; AVX512BW-FCP-NEXT: vpermd %ymm4, %ymm0, %ymm0
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm0[1,5,9,13],zero,zero,zero,ymm0[2,6,10,14],zero,zero,zero,ymm0[19,23,27,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [1,3,5,0,5,1,3,0]
+; AVX512BW-FCP-NEXT: vpermd %ymm1, %ymm5, %ymm5
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[0,4,8],zero,zero,zero,zero,ymm5[1,5,9],zero,zero,zero,zero,ymm5[2,6,18],zero,zero,zero,zero,ymm5[23,27,19],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-FCP-NEXT: vpor %ymm0, %ymm5, %ymm0
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[0,8],zero,zero,zero,zero,zero,ymm4[1,9],zero,zero,zero,zero,zero,ymm4[2,10,18,26],zero,zero,zero,zero,zero,ymm4[19,27],zero,zero,zero,zero,zero,ymm4[20,28]
+; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8],zero,zero,zero,zero,zero,ymm2[1,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[19,27],zero,zero,zero,zero,zero,ymm2[20,28],zero,zero
+; AVX512BW-FCP-NEXT: vpor %ymm4, %ymm2, %ymm2
; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,2,4,6,0,2,4,6]
; AVX512BW-FCP-NEXT: # ymm3 = mem[0,1,0,1]
-; AVX512BW-FCP-NEXT: vpermd %ymm1, %ymm3, %ymm3
+; AVX512BW-FCP-NEXT: vpermd %ymm1, %ymm3, %ymm1
; AVX512BW-FCP-NEXT: movl $236730480, %ecx # imm = 0xE1C3870
; AVX512BW-FCP-NEXT: kmovd %ecx, %k1
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm3[u,u,u,u,0,4,8,u,u,u,u,1,5,9,u,u,u,u,18,22,26,u,u,u,u,19,23,27,u,u,u,u]
-; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [1,3,5,7,1,3,5,7]
-; AVX512BW-FCP-NEXT: # ymm3 = mem[0,1,0,1]
-; AVX512BW-FCP-NEXT: vpermd %ymm2, %ymm3, %ymm2
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,ymm2[1,5,9,13],zero,zero,zero,ymm2[2,6,10,14],zero,zero,zero,ymm2[19,23,27,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,3,5,0,5,1,3,0]
-; AVX512BW-FCP-NEXT: vpermd %ymm1, %ymm3, %ymm1
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,4,8],zero,zero,zero,zero,ymm1[1,5,9],zero,zero,zero,zero,ymm1[2,6,18],zero,zero,zero,zero,ymm1[23,27,19],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512BW-FCP-NEXT: vpor %ymm2, %ymm1, %ymm1
-; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX512BW-FCP-NEXT: vmovq %xmm2, 48(%rax)
-; AVX512BW-FCP-NEXT: vmovdqa %xmm1, 32(%rax)
-; AVX512BW-FCP-NEXT: vmovdqa %ymm0, (%rax)
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 {%k1} = ymm1[u,u,u,u,0,4,8,u,u,u,u,1,5,9,u,u,u,u,18,22,26,u,u,u,u,19,23,27,u,u,u,u]
+; AVX512BW-FCP-NEXT: vmovdqa %ymm2, (%rax)
+; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512BW-FCP-NEXT: vmovq %xmm1, 48(%rax)
+; AVX512BW-FCP-NEXT: vmovdqa %xmm0, 32(%rax)
; AVX512BW-FCP-NEXT: vzeroupper
; AVX512BW-FCP-NEXT: retq
;
@@ -1320,33 +1320,33 @@ define void @store_i8_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero
; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
-; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
-; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
+; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm4
+; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = mem[0],zero
; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,2,4,0]
-; AVX512DQ-BW-FCP-NEXT: vpermi2q %ymm3, %ymm0, %ymm1
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[0,8],zero,zero,zero,zero,zero,ymm2[1,9],zero,zero,zero,zero,zero,ymm2[2,10,18,26],zero,zero,zero,zero,zero,ymm2[19,27],zero,zero,zero,zero,zero,ymm2[20,28]
-; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,ymm3[0,8],zero,zero,zero,zero,zero,ymm3[1,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[19,27],zero,zero,zero,zero,zero,ymm3[20,28],zero,zero
-; AVX512DQ-BW-FCP-NEXT: vpor %ymm0, %ymm3, %ymm0
+; AVX512DQ-BW-FCP-NEXT: vpermi2q %ymm5, %ymm0, %ymm1
+; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [1,3,5,7,1,3,5,7]
+; AVX512DQ-BW-FCP-NEXT: # ymm0 = mem[0,1,0,1]
+; AVX512DQ-BW-FCP-NEXT: vpermd %ymm4, %ymm0, %ymm0
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm0[1,5,9,13],zero,zero,zero,ymm0[2,6,10,14],zero,zero,zero,ymm0[19,23,27,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [1,3,5,0,5,1,3,0]
+; AVX512DQ-BW-FCP-NEXT: vpermd %ymm1, %ymm5, %ymm5
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[0,4,8],zero,zero,zero,zero,ymm5[1,5,9],zero,zero,zero,zero,ymm5[2,6,18],zero,zero,zero,zero,ymm5[23,27,19],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-BW-FCP-NEXT: vpor %ymm0, %ymm5, %ymm0
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[0,8],zero,zero,zero,zero,zero,ymm4[1,9],zero,zero,zero,zero,zero,ymm4[2,10,18,26],zero,zero,zero,zero,zero,ymm4[19,27],zero,zero,zero,zero,zero,ymm4[20,28]
+; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8],zero,zero,zero,zero,zero,ymm2[1,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[19,27],zero,zero,zero,zero,zero,ymm2[20,28],zero,zero
+; AVX512DQ-BW-FCP-NEXT: vpor %ymm4, %ymm2, %ymm2
; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,2,4,6,0,2,4,6]
; AVX512DQ-BW-FCP-NEXT: # ymm3 = mem[0,1,0,1]
-; AVX512DQ-BW-FCP-NEXT: vpermd %ymm1, %ymm3, %ymm3
+; AVX512DQ-BW-FCP-NEXT: vpermd %ymm1, %ymm3, %ymm1
; AVX512DQ-BW-FCP-NEXT: movl $236730480, %ecx # imm = 0xE1C3870
; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm3[u,u,u,u,0,4,8,u,u,u,u,1,5,9,u,u,u,u,18,22,26,u,u,u,u,19,23,27,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [1,3,5,7,1,3,5,7]
-; AVX512DQ-BW-FCP-NEXT: # ymm3 = mem[0,1,0,1]
-; AVX512DQ-BW-FCP-NEXT: vpermd %ymm2, %ymm3, %ymm2
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,ymm2[1,5,9,13],zero,zero,zero,ymm2[2,6,10,14],zero,zero,zero,ymm2[19,23,27,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,3,5,0,5,1,3,0]
-; AVX512DQ-BW-FCP-NEXT: vpermd %ymm1, %ymm3, %ymm1
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,4,8],zero,zero,zero,zero,ymm1[1,5,9],zero,zero,zero,zero,ymm1[2,6,18],zero,zero,zero,zero,ymm1[23,27,19],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512DQ-BW-FCP-NEXT: vpor %ymm2, %ymm1, %ymm1
-; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, 48(%rax)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm1, 32(%rax)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm0, (%rax)
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 {%k1} = ymm1[u,u,u,u,0,4,8,u,u,u,u,1,5,9,u,u,u,u,18,22,26,u,u,u,u,19,23,27,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm2, (%rax)
+; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512DQ-BW-FCP-NEXT: vmovq %xmm1, 48(%rax)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm0, 32(%rax)
; AVX512DQ-BW-FCP-NEXT: vzeroupper
; AVX512DQ-BW-FCP-NEXT: retq
%in.vec0 = load <8 x i8>, ptr %in.vecptr0, align 64
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll
index 3a70df7617f18..675412defbb24 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll
@@ -869,26 +869,26 @@ define void @store_i8_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
; AVX2-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
; AVX2-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm1
-; AVX2-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,ymm1[0,8],zero,zero,zero,zero,zero,zero,ymm1[1,9],zero,zero,zero,zero,zero,zero,zero,zero,ymm1[18,26],zero,zero,zero,zero,zero,zero,ymm1[19,27]
-; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm1[2,3,0,1]
-; AVX2-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,ymm3[0,8],zero,zero,zero,zero,zero,zero,ymm3[1,9],zero,zero,zero,zero,ymm3[18,26],zero,zero,zero,zero,zero,zero,ymm3[19,27],zero,zero
-; AVX2-NEXT: vpor %ymm4, %ymm2, %ymm2
-; AVX2-NEXT: vpshufb {{.*#+}} ymm4 = ymm0[0,8],zero,zero,zero,zero,zero,zero,ymm0[1,9],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[18,26],zero,zero,zero,zero,zero,zero,ymm0[19,27],zero,zero,zero,zero
-; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm0[2,3,0,1]
-; AVX2-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,ymm5[0,8],zero,zero,zero,zero,zero,zero,ymm5[1,9],zero,zero,zero,zero,ymm5[18,26],zero,zero,zero,zero,zero,zero,ymm5[19,27],zero,zero,zero,zero,zero,zero
-; AVX2-NEXT: vpor %ymm6, %ymm4, %ymm4
-; AVX2-NEXT: vpor %ymm2, %ymm4, %ymm2
-; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,ymm1[4,12],zero,zero,zero,zero,zero,zero,ymm1[5,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm1[22,30],zero,zero,zero,zero,zero,zero,ymm1[23,31]
-; AVX2-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,ymm3[4,12],zero,zero,zero,zero,zero,zero,ymm3[5,13],zero,zero,zero,zero,ymm3[22,30],zero,zero,zero,zero,zero,zero,ymm3[23,31],zero,zero
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm4
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm5
+; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2
+; AVX2-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,ymm2[0,8],zero,zero,zero,zero,zero,zero,ymm2[1,9],zero,zero,zero,zero,ymm2[18,26],zero,zero,zero,zero,zero,zero,ymm2[19,27],zero,zero
+; AVX2-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,ymm5[0,8],zero,zero,zero,zero,zero,zero,ymm5[1,9],zero,zero,zero,zero,zero,zero,zero,zero,ymm5[18,26],zero,zero,zero,zero,zero,zero,ymm5[19,27]
+; AVX2-NEXT: vpor %ymm3, %ymm6, %ymm3
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,ymm0[0,8],zero,zero,zero,zero,zero,zero,ymm0[1,9],zero,zero,zero,zero,ymm0[18,26],zero,zero,zero,zero,zero,zero,ymm0[19,27],zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpshufb {{.*#+}} ymm6 = ymm4[0,8],zero,zero,zero,zero,zero,zero,ymm4[1,9],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[18,26],zero,zero,zero,zero,zero,zero,ymm4[19,27],zero,zero,zero,zero
+; AVX2-NEXT: vpor %ymm1, %ymm6, %ymm1
; AVX2-NEXT: vpor %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[4,12],zero,zero,zero,zero,zero,zero,ymm0[5,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[22,30],zero,zero,zero,zero,zero,zero,ymm0[23,31],zero,zero,zero,zero
-; AVX2-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,ymm5[4,12],zero,zero,zero,zero,zero,zero,ymm5[5,13],zero,zero,zero,zero,ymm5[22,30],zero,zero,zero,zero,zero,zero,ymm5[23,31],zero,zero,zero,zero,zero,zero
-; AVX2-NEXT: vpor %ymm3, %ymm0, %ymm0
-; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,ymm2[4,12],zero,zero,zero,zero,zero,zero,ymm2[5,13],zero,zero,zero,zero,ymm2[22,30],zero,zero,zero,zero,zero,zero,ymm2[23,31],zero,zero
+; AVX2-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,ymm5[4,12],zero,zero,zero,zero,zero,zero,ymm5[5,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm5[22,30],zero,zero,zero,zero,zero,zero,ymm5[23,31]
+; AVX2-NEXT: vpor %ymm2, %ymm3, %ymm2
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,ymm0[4,12],zero,zero,zero,zero,zero,zero,ymm0[5,13],zero,zero,zero,zero,ymm0[22,30],zero,zero,zero,zero,zero,zero,ymm0[23,31],zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpshufb {{.*#+}} ymm3 = ymm4[4,12],zero,zero,zero,zero,zero,zero,ymm4[5,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[22,30],zero,zero,zero,zero,zero,zero,ymm4[23,31],zero,zero,zero,zero
+; AVX2-NEXT: vpor %ymm0, %ymm3, %ymm0
+; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vmovdqa %ymm0, 32(%rax)
-; AVX2-NEXT: vmovdqa %ymm2, (%rax)
+; AVX2-NEXT: vmovdqa %ymm1, (%rax)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
@@ -909,26 +909,26 @@ define void @store_i8_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
; AVX2-FP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
; AVX2-FP-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero
; AVX2-FP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
-; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm1
-; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,ymm1[0,8],zero,zero,zero,zero,zero,zero,ymm1[1,9],zero,zero,zero,zero,zero,zero,zero,zero,ymm1[18,26],zero,zero,zero,zero,zero,zero,ymm1[19,27]
-; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm1[2,3,0,1]
-; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,ymm3[0,8],zero,zero,zero,zero,zero,zero,ymm3[1,9],zero,zero,zero,zero,ymm3[18,26],zero,zero,zero,zero,zero,zero,ymm3[19,27],zero,zero
-; AVX2-FP-NEXT: vpor %ymm4, %ymm2, %ymm2
-; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm4 = ymm0[0,8],zero,zero,zero,zero,zero,zero,ymm0[1,9],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[18,26],zero,zero,zero,zero,zero,zero,ymm0[19,27],zero,zero,zero,zero
-; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm0[2,3,0,1]
-; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,ymm5[0,8],zero,zero,zero,zero,zero,zero,ymm5[1,9],zero,zero,zero,zero,ymm5[18,26],zero,zero,zero,zero,zero,zero,ymm5[19,27],zero,zero,zero,zero,zero,zero
-; AVX2-FP-NEXT: vpor %ymm6, %ymm4, %ymm4
-; AVX2-FP-NEXT: vpor %ymm2, %ymm4, %ymm2
-; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,ymm1[4,12],zero,zero,zero,zero,zero,zero,ymm1[5,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm1[22,30],zero,zero,zero,zero,zero,zero,ymm1[23,31]
-; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,ymm3[4,12],zero,zero,zero,zero,zero,zero,ymm3[5,13],zero,zero,zero,zero,ymm3[22,30],zero,zero,zero,zero,zero,zero,ymm3[23,31],zero,zero
+; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm4
+; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm5
+; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2
+; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,ymm2[0,8],zero,zero,zero,zero,zero,zero,ymm2[1,9],zero,zero,zero,zero,ymm2[18,26],zero,zero,zero,zero,zero,zero,ymm2[19,27],zero,zero
+; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,ymm5[0,8],zero,zero,zero,zero,zero,zero,ymm5[1,9],zero,zero,zero,zero,zero,zero,zero,zero,ymm5[18,26],zero,zero,zero,zero,zero,zero,ymm5[19,27]
+; AVX2-FP-NEXT: vpor %ymm3, %ymm6, %ymm3
+; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,ymm0[0,8],zero,zero,zero,zero,zero,zero,ymm0[1,9],zero,zero,zero,zero,ymm0[18,26],zero,zero,zero,zero,zero,zero,ymm0[19,27],zero,zero,zero,zero,zero,zero
+; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm6 = ymm4[0,8],zero,zero,zero,zero,zero,zero,ymm4[1,9],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[18,26],zero,zero,zero,zero,zero,zero,ymm4[19,27],zero,zero,zero,zero
+; AVX2-FP-NEXT: vpor %ymm1, %ymm6, %ymm1
; AVX2-FP-NEXT: vpor %ymm3, %ymm1, %ymm1
-; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[4,12],zero,zero,zero,zero,zero,zero,ymm0[5,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[22,30],zero,zero,zero,zero,zero,zero,ymm0[23,31],zero,zero,zero,zero
-; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,ymm5[4,12],zero,zero,zero,zero,zero,zero,ymm5[5,13],zero,zero,zero,zero,ymm5[22,30],zero,zero,zero,zero,zero,zero,ymm5[23,31],zero,zero,zero,zero,zero,zero
-; AVX2-FP-NEXT: vpor %ymm3, %ymm0, %ymm0
-; AVX2-FP-NEXT: vpor %ymm1, %ymm0, %ymm0
+; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,ymm2[4,12],zero,zero,zero,zero,zero,zero,ymm2[5,13],zero,zero,zero,zero,ymm2[22,30],zero,zero,zero,zero,zero,zero,ymm2[23,31],zero,zero
+; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,ymm5[4,12],zero,zero,zero,zero,zero,zero,ymm5[5,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm5[22,30],zero,zero,zero,zero,zero,zero,ymm5[23,31]
+; AVX2-FP-NEXT: vpor %ymm2, %ymm3, %ymm2
+; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,ymm0[4,12],zero,zero,zero,zero,zero,zero,ymm0[5,13],zero,zero,zero,zero,ymm0[22,30],zero,zero,zero,zero,zero,zero,ymm0[23,31],zero,zero,zero,zero,zero,zero
+; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm3 = ymm4[4,12],zero,zero,zero,zero,zero,zero,ymm4[5,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[22,30],zero,zero,zero,zero,zero,zero,ymm4[23,31],zero,zero,zero,zero
+; AVX2-FP-NEXT: vpor %ymm0, %ymm3, %ymm0
+; AVX2-FP-NEXT: vpor %ymm2, %ymm0, %ymm0
; AVX2-FP-NEXT: vmovdqa %ymm0, 32(%rax)
-; AVX2-FP-NEXT: vmovdqa %ymm2, (%rax)
+; AVX2-FP-NEXT: vmovdqa %ymm1, (%rax)
; AVX2-FP-NEXT: vzeroupper
; AVX2-FP-NEXT: retq
;
@@ -989,29 +989,29 @@ define void @store_i8_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
; AVX512-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
; AVX512-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero
; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
-; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm1
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [4,12,128,128,4,12,128,128,5,13,128,128,5,13,128,128,128,128,22,30,128,128,22,30,128,128,23,31,128,128,23,31]
-; AVX512-NEXT: vpshufb %ymm2, %ymm1, %ymm3
-; AVX512-NEXT: vpshufb %ymm2, %ymm0, %ymm2
-; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4],ymm3[5],ymm2[6],ymm3[7]
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [0,8,128,128,0,8,128,128,1,9,128,128,1,9,128,128,128,128,18,26,128,128,18,26,128,128,19,27,128,128,19,27]
-; AVX512-NEXT: vpshufb %ymm3, %ymm1, %ymm4
-; AVX512-NEXT: vpshufb %ymm3, %ymm0, %ymm3
-; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7]
-; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1]
+; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm4
+; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm5
+; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2
; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,4,12,128,128,4,12,128,128,5,13,128,128,5,13,22,30,128,128,22,30,128,128,23,31,128,128,23,31,128,128]
-; AVX512-NEXT: vpshufb %ymm3, %ymm1, %ymm4
-; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
-; AVX512-NEXT: vpshufb %ymm3, %ymm0, %ymm3
-; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7]
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,0,8,128,128,0,8,128,128,1,9,128,128,1,9,18,26,128,128,18,26,128,128,19,27,128,128,19,27,128,128]
-; AVX512-NEXT: vpshufb %ymm4, %ymm1, %ymm1
-; AVX512-NEXT: vpshufb %ymm4, %ymm0, %ymm0
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
-; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
-; AVX512-NEXT: vpord %zmm0, %zmm2, %zmm0
+; AVX512-NEXT: vpshufb %ymm3, %ymm2, %ymm6
+; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512-NEXT: vpshufb %ymm3, %ymm0, %ymm1
+; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm6[1],ymm1[2],ymm6[3],ymm1[4],ymm6[5],ymm1[6],ymm6[7]
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,0,8,128,128,0,8,128,128,1,9,128,128,1,9,18,26,128,128,18,26,128,128,19,27,128,128,19,27,128,128]
+; AVX512-NEXT: vpshufb %ymm3, %ymm2, %ymm2
+; AVX512-NEXT: vpshufb %ymm3, %ymm0, %ymm0
+; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7]
+; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [4,12,128,128,4,12,128,128,5,13,128,128,5,13,128,128,128,128,22,30,128,128,22,30,128,128,23,31,128,128,23,31]
+; AVX512-NEXT: vpshufb %ymm1, %ymm5, %ymm2
+; AVX512-NEXT: vpshufb %ymm1, %ymm4, %ymm1
+; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [0,8,128,128,0,8,128,128,1,9,128,128,1,9,128,128,128,128,18,26,128,128,18,26,128,128,19,27,128,128,19,27]
+; AVX512-NEXT: vpshufb %ymm2, %ymm5, %ymm3
+; AVX512-NEXT: vpshufb %ymm2, %ymm4, %ymm2
+; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4],ymm3[5],ymm2[6],ymm3[7]
+; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1
+; AVX512-NEXT: vpord %zmm0, %zmm1, %zmm0
; AVX512-NEXT: vmovdqa64 %zmm0, (%rax)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
@@ -1073,29 +1073,29 @@ define void @store_i8_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
; AVX512DQ-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
; AVX512DQ-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero
; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm1
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [4,12,128,128,4,12,128,128,5,13,128,128,5,13,128,128,128,128,22,30,128,128,22,30,128,128,23,31,128,128,23,31]
-; AVX512DQ-NEXT: vpshufb %ymm2, %ymm1, %ymm3
-; AVX512DQ-NEXT: vpshufb %ymm2, %ymm0, %ymm2
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4],ymm3[5],ymm2[6],ymm3[7]
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,8,128,128,0,8,128,128,1,9,128,128,1,9,128,128,128,128,18,26,128,128,18,26,128,128,19,27,128,128,19,27]
-; AVX512DQ-NEXT: vpshufb %ymm3, %ymm1, %ymm4
-; AVX512DQ-NEXT: vpshufb %ymm3, %ymm0, %ymm3
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7]
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1]
+; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm4
+; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm5
+; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,4,12,128,128,4,12,128,128,5,13,128,128,5,13,22,30,128,128,22,30,128,128,23,31,128,128,23,31,128,128]
-; AVX512DQ-NEXT: vpshufb %ymm3, %ymm1, %ymm4
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
-; AVX512DQ-NEXT: vpshufb %ymm3, %ymm0, %ymm3
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7]
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,0,8,128,128,0,8,128,128,1,9,128,128,1,9,18,26,128,128,18,26,128,128,19,27,128,128,19,27,128,128]
-; AVX512DQ-NEXT: vpshufb %ymm4, %ymm1, %ymm1
-; AVX512DQ-NEXT: vpshufb %ymm4, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
-; AVX512DQ-NEXT: vpord %zmm0, %zmm2, %zmm0
+; AVX512DQ-NEXT: vpshufb %ymm3, %ymm2, %ymm6
+; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512DQ-NEXT: vpshufb %ymm3, %ymm0, %ymm1
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm6[1],ymm1[2],ymm6[3],ymm1[4],ymm6[5],ymm1[6],ymm6[7]
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,0,8,128,128,0,8,128,128,1,9,128,128,1,9,18,26,128,128,18,26,128,128,19,27,128,128,19,27,128,128]
+; AVX512DQ-NEXT: vpshufb %ymm3, %ymm2, %ymm2
+; AVX512DQ-NEXT: vpshufb %ymm3, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7]
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [4,12,128,128,4,12,128,128,5,13,128,128,5,13,128,128,128,128,22,30,128,128,22,30,128,128,23,31,128,128,23,31]
+; AVX512DQ-NEXT: vpshufb %ymm1, %ymm5, %ymm2
+; AVX512DQ-NEXT: vpshufb %ymm1, %ymm4, %ymm1
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [0,8,128,128,0,8,128,128,1,9,128,128,1,9,128,128,128,128,18,26,128,128,18,26,128,128,19,27,128,128,19,27]
+; AVX512DQ-NEXT: vpshufb %ymm2, %ymm5, %ymm3
+; AVX512DQ-NEXT: vpshufb %ymm2, %ymm4, %ymm2
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4],ymm3[5],ymm2[6],ymm3[7]
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1
+; AVX512DQ-NEXT: vpord %zmm0, %zmm1, %zmm0
; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rax)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
@@ -1198,18 +1198,14 @@ define void @store_i8_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero
; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
-; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm1
-; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1
-; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [8,10,12,14,8,10,12,14,9,11,13,15,9,11,13,15]
-; AVX512BW-FCP-NEXT: vpermd %zmm1, %zmm2, %zmm1
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[u,u,u,u,0,4,8,12,u,u,u,u,1,5,9,13,u,u,u,u,18,22,26,30,u,u,u,u,19,23,27,31,u,u,u,u,32,36,40,44,u,u,u,u,33,37,41,45,u,u,u,u,50,54,58,62,u,u,u,u,51,55,59,63]
-; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,2,4,6,0,2,4,6,1,3,5,7,1,3,5,7]
-; AVX512BW-FCP-NEXT: vpermd %zmm0, %zmm2, %zmm0
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,2,16,18,0,2,16,18,1,3,17,19,1,3,17,19]
+; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm4, %zmm2
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm2 = zmm2[u,u,u,u,0,4,8,12,u,u,u,u,1,5,9,13,u,u,u,u,18,22,26,30,u,u,u,u,19,23,27,31,u,u,u,u,32,36,40,44,u,u,u,u,33,37,41,45,u,u,u,u,50,54,58,62,u,u,u,u,51,55,59,63]
+; AVX512BW-FCP-NEXT: vpermt2d %zmm1, %zmm4, %zmm0
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,4,8,12,u,u,u,u,1,5,9,13,u,u,u,u,18,22,26,30,u,u,u,u,19,23,27,31,u,u,u,u,32,36,40,44,u,u,u,u,33,37,41,45,u,u,u,u,50,54,58,62,u,u,u,u,51,55,59,63,u,u,u,u]
; AVX512BW-FCP-NEXT: movw $-21846, %cx # imm = 0xAAAA
; AVX512BW-FCP-NEXT: kmovd %ecx, %k1
-; AVX512BW-FCP-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
+; AVX512BW-FCP-NEXT: vmovdqa32 %zmm2, %zmm0 {%k1}
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, (%rax)
; AVX512BW-FCP-NEXT: vzeroupper
; AVX512BW-FCP-NEXT: retq
@@ -1272,18 +1268,14 @@ define void @store_i8_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero
; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
-; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm1
-; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [8,10,12,14,8,10,12,14,9,11,13,15,9,11,13,15]
-; AVX512DQ-BW-FCP-NEXT: vpermd %zmm1, %zmm2, %zmm1
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[u,u,u,u,0,4,8,12,u,u,u,u,1,5,9,13,u,u,u,u,18,22,26,30,u,u,u,u,19,23,27,31,u,u,u,u,32,36,40,44,u,u,u,u,33,37,41,45,u,u,u,u,50,54,58,62,u,u,u,u,51,55,59,63]
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,2,4,6,0,2,4,6,1,3,5,7,1,3,5,7]
-; AVX512DQ-BW-FCP-NEXT: vpermd %zmm0, %zmm2, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,2,16,18,0,2,16,18,1,3,17,19,1,3,17,19]
+; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm4, %zmm2
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm2 = zmm2[u,u,u,u,0,4,8,12,u,u,u,u,1,5,9,13,u,u,u,u,18,22,26,30,u,u,u,u,19,23,27,31,u,u,u,u,32,36,40,44,u,u,u,u,33,37,41,45,u,u,u,u,50,54,58,62,u,u,u,u,51,55,59,63]
+; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm1, %zmm4, %zmm0
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,4,8,12,u,u,u,u,1,5,9,13,u,u,u,u,18,22,26,30,u,u,u,u,19,23,27,31,u,u,u,u,32,36,40,44,u,u,u,u,33,37,41,45,u,u,u,u,50,54,58,62,u,u,u,u,51,55,59,63,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT: movw $-21846, %cx # imm = 0xAAAA
; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1
-; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm2, %zmm0 {%k1}
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, (%rax)
; AVX512DQ-BW-FCP-NEXT: vzeroupper
; AVX512DQ-BW-FCP-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll
index 81f79f3b1399a..da65fecba773b 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll
@@ -776,12 +776,34 @@ define <4 x double> @shuffle_v4f64_0044(<4 x double> %a, <4 x double> %b) {
}
define <4 x double> @shuffle_v4f64_0044_v2f64(<2 x double> %a, <2 x double> %b) {
-; ALL-LABEL: shuffle_v4f64_0044_v2f64:
-; ALL: # %bb.0:
-; ALL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
-; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; ALL-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
-; ALL-NEXT: retq
+; AVX1OR2-LABEL: shuffle_v4f64_0044_v2f64:
+; AVX1OR2: # %bb.0:
+; AVX1OR2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX1OR2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1OR2-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; AVX1OR2-NEXT: retq
+;
+; AVX512VL-SLOW-LABEL: shuffle_v4f64_0044_v2f64:
+; AVX512VL-SLOW: # %bb.0:
+; AVX512VL-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512VL-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX512VL-SLOW-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; AVX512VL-SLOW-NEXT: retq
+;
+; AVX512VL-FAST-ALL-LABEL: shuffle_v4f64_0044_v2f64:
+; AVX512VL-FAST-ALL: # %bb.0:
+; AVX512VL-FAST-ALL-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1
+; AVX512VL-FAST-ALL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512VL-FAST-ALL-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,0,4,4]
+; AVX512VL-FAST-ALL-NEXT: vpermt2pd %ymm1, %ymm2, %ymm0
+; AVX512VL-FAST-ALL-NEXT: retq
+;
+; AVX512VL-FAST-PERLANE-LABEL: shuffle_v4f64_0044_v2f64:
+; AVX512VL-FAST-PERLANE: # %bb.0:
+; AVX512VL-FAST-PERLANE-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512VL-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX512VL-FAST-PERLANE-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; AVX512VL-FAST-PERLANE-NEXT: retq
%1 = shufflevector <2 x double> %a, <2 x double> poison, <2 x i32> <i32 0, i32 0>
%2 = shufflevector <2 x double> %b, <2 x double> poison, <2 x i32> <i32 0, i32 0>
%3 = shufflevector <2 x double> %1, <2 x double> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -789,12 +811,34 @@ define <4 x double> @shuffle_v4f64_0044_v2f64(<2 x double> %a, <2 x double> %b)
}
define <4 x double> @shuffle_v4f64_1032_v2f64(<2 x double> %a, <2 x double> %b) {
-; ALL-LABEL: shuffle_v4f64_1032_v2f64:
-; ALL: # %bb.0:
-; ALL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
-; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; ALL-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
-; ALL-NEXT: retq
+; AVX1OR2-LABEL: shuffle_v4f64_1032_v2f64:
+; AVX1OR2: # %bb.0:
+; AVX1OR2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX1OR2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1OR2-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
+; AVX1OR2-NEXT: retq
+;
+; AVX512VL-SLOW-LABEL: shuffle_v4f64_1032_v2f64:
+; AVX512VL-SLOW: # %bb.0:
+; AVX512VL-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512VL-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX512VL-SLOW-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
+; AVX512VL-SLOW-NEXT: retq
+;
+; AVX512VL-FAST-ALL-LABEL: shuffle_v4f64_1032_v2f64:
+; AVX512VL-FAST-ALL: # %bb.0:
+; AVX512VL-FAST-ALL-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1
+; AVX512VL-FAST-ALL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512VL-FAST-ALL-NEXT: vpmovsxbq {{.*#+}} ymm2 = [1,0,5,4]
+; AVX512VL-FAST-ALL-NEXT: vpermt2pd %ymm1, %ymm2, %ymm0
+; AVX512VL-FAST-ALL-NEXT: retq
+;
+; AVX512VL-FAST-PERLANE-LABEL: shuffle_v4f64_1032_v2f64:
+; AVX512VL-FAST-PERLANE: # %bb.0:
+; AVX512VL-FAST-PERLANE-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512VL-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX512VL-FAST-PERLANE-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
+; AVX512VL-FAST-PERLANE-NEXT: retq
%1 = shufflevector <2 x double> %a, <2 x double> poison, <2 x i32> <i32 1, i32 0>
%2 = shufflevector <2 x double> %b, <2 x double> poison, <2 x i32> <i32 1, i32 0>
%3 = shufflevector <2 x double> %1, <2 x double> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
diff --git a/llvm/test/CodeGen/X86/widen_fadd.ll b/llvm/test/CodeGen/X86/widen_fadd.ll
index e2c36393da2f6..825ee34561c79 100644
--- a/llvm/test/CodeGen/X86/widen_fadd.ll
+++ b/llvm/test/CodeGen/X86/widen_fadd.ll
@@ -65,70 +65,26 @@ define void @widen_fadd_v2f32_v8f32(ptr %a0, ptr %b0, ptr %c0) {
; SSE-NEXT: movlps %xmm2, 24(%rdx)
; SSE-NEXT: retq
;
-; AVX1OR2-LABEL: widen_fadd_v2f32_v8f32:
-; AVX1OR2: # %bb.0:
-; AVX1OR2-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX1OR2-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX1OR2-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
-; AVX1OR2-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero
-; AVX1OR2-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
-; AVX1OR2-NEXT: vaddps %xmm4, %xmm0, %xmm0
-; AVX1OR2-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
-; AVX1OR2-NEXT: vaddps %xmm4, %xmm1, %xmm1
-; AVX1OR2-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
-; AVX1OR2-NEXT: vaddps %xmm4, %xmm2, %xmm2
-; AVX1OR2-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
-; AVX1OR2-NEXT: vaddps %xmm4, %xmm3, %xmm3
-; AVX1OR2-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
-; AVX1OR2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1OR2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
-; AVX1OR2-NEXT: vmovups %ymm0, (%rdx)
-; AVX1OR2-NEXT: vzeroupper
-; AVX1OR2-NEXT: retq
-;
-; AVX512F-LABEL: widen_fadd_v2f32_v8f32:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX512F-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX512F-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
-; AVX512F-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero
-; AVX512F-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
-; AVX512F-NEXT: vaddps %xmm4, %xmm0, %xmm0
-; AVX512F-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
-; AVX512F-NEXT: vaddps %xmm4, %xmm1, %xmm1
-; AVX512F-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
-; AVX512F-NEXT: vaddps %xmm4, %xmm2, %xmm2
-; AVX512F-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
-; AVX512F-NEXT: vaddps %xmm4, %xmm3, %xmm3
-; AVX512F-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
-; AVX512F-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX512F-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
-; AVX512F-NEXT: vmovups %ymm0, (%rdx)
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: widen_fadd_v2f32_v8f32:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX512VL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX512VL-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
-; AVX512VL-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero
-; AVX512VL-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
-; AVX512VL-NEXT: vaddps %xmm4, %xmm0, %xmm0
-; AVX512VL-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
-; AVX512VL-NEXT: vaddps %xmm4, %xmm1, %xmm1
-; AVX512VL-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
-; AVX512VL-NEXT: vaddps %xmm4, %xmm2, %xmm2
-; AVX512VL-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
-; AVX512VL-NEXT: vaddps %xmm4, %xmm3, %xmm3
-; AVX512VL-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
-; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX512VL-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
-; AVX512VL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX512VL-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
-; AVX512VL-NEXT: vmovups %ymm0, (%rdx)
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
+; AVX-LABEL: widen_fadd_v2f32_v8f32:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
+; AVX-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero
+; AVX-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
+; AVX-NEXT: vaddps %xmm4, %xmm0, %xmm0
+; AVX-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
+; AVX-NEXT: vaddps %xmm4, %xmm1, %xmm1
+; AVX-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
+; AVX-NEXT: vaddps %xmm4, %xmm2, %xmm2
+; AVX-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
+; AVX-NEXT: vaddps %xmm4, %xmm3, %xmm3
+; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; AVX-NEXT: vmovups %ymm0, (%rdx)
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
%a2 = getelementptr inbounds i8, ptr %a0, i64 8
%b2 = getelementptr inbounds i8, ptr %b0, i64 8
%c2 = getelementptr inbounds i8, ptr %c0, i64 8
diff --git a/llvm/test/CodeGen/X86/widen_fdiv.ll b/llvm/test/CodeGen/X86/widen_fdiv.ll
index 4e5695500fbff..c0ec41237d301 100644
--- a/llvm/test/CodeGen/X86/widen_fdiv.ll
+++ b/llvm/test/CodeGen/X86/widen_fdiv.ll
@@ -65,44 +65,13 @@ define void @widen_fdiv_v2f32_v8f32(ptr %a0, ptr %b0, ptr %c0) {
; SSE-NEXT: movlps %xmm3, 24(%rdx)
; SSE-NEXT: retq
;
-; AVX1OR2-LABEL: widen_fdiv_v2f32_v8f32:
-; AVX1OR2: # %bb.0:
-; AVX1OR2-NEXT: vmovups (%rdi), %ymm0
-; AVX1OR2-NEXT: vdivps (%rsi), %ymm0, %ymm0
-; AVX1OR2-NEXT: vmovups %ymm0, (%rdx)
-; AVX1OR2-NEXT: vzeroupper
-; AVX1OR2-NEXT: retq
-;
-; AVX512F-LABEL: widen_fdiv_v2f32_v8f32:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovups (%rdi), %ymm0
-; AVX512F-NEXT: vdivps (%rsi), %ymm0, %ymm0
-; AVX512F-NEXT: vmovups %ymm0, (%rdx)
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: widen_fdiv_v2f32_v8f32:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX512VL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX512VL-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
-; AVX512VL-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero
-; AVX512VL-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
-; AVX512VL-NEXT: vmovsd {{.*#+}} xmm5 = mem[0],zero
-; AVX512VL-NEXT: vdivps %xmm5, %xmm1, %xmm1
-; AVX512VL-NEXT: vmovsd {{.*#+}} xmm5 = mem[0],zero
-; AVX512VL-NEXT: vmovsd {{.*#+}} xmm6 = mem[0],zero
-; AVX512VL-NEXT: vdivps %xmm6, %xmm3, %xmm3
-; AVX512VL-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
-; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX512VL-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
-; AVX512VL-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm3
-; AVX512VL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX512VL-NEXT: vdivps %ymm3, %ymm0, %ymm0
-; AVX512VL-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
-; AVX512VL-NEXT: vmovups %ymm0, (%rdx)
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
+; AVX-LABEL: widen_fdiv_v2f32_v8f32:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovups (%rdi), %ymm0
+; AVX-NEXT: vdivps (%rsi), %ymm0, %ymm0
+; AVX-NEXT: vmovups %ymm0, (%rdx)
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
%a2 = getelementptr inbounds i8, ptr %a0, i64 8
%b2 = getelementptr inbounds i8, ptr %b0, i64 8
%c2 = getelementptr inbounds i8, ptr %c0, i64 8
diff --git a/llvm/test/CodeGen/X86/widen_fmul.ll b/llvm/test/CodeGen/X86/widen_fmul.ll
index fc099e7c68969..7011419fbc6fc 100644
--- a/llvm/test/CodeGen/X86/widen_fmul.ll
+++ b/llvm/test/CodeGen/X86/widen_fmul.ll
@@ -65,70 +65,26 @@ define void @widen_fmul_v2f32_v8f32(ptr %a0, ptr %b0, ptr %c0) {
; SSE-NEXT: movlps %xmm2, 24(%rdx)
; SSE-NEXT: retq
;
-; AVX1OR2-LABEL: widen_fmul_v2f32_v8f32:
-; AVX1OR2: # %bb.0:
-; AVX1OR2-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX1OR2-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX1OR2-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
-; AVX1OR2-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero
-; AVX1OR2-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
-; AVX1OR2-NEXT: vmulps %xmm4, %xmm0, %xmm0
-; AVX1OR2-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
-; AVX1OR2-NEXT: vmulps %xmm4, %xmm1, %xmm1
-; AVX1OR2-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
-; AVX1OR2-NEXT: vmulps %xmm4, %xmm2, %xmm2
-; AVX1OR2-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
-; AVX1OR2-NEXT: vmulps %xmm4, %xmm3, %xmm3
-; AVX1OR2-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
-; AVX1OR2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1OR2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
-; AVX1OR2-NEXT: vmovups %ymm0, (%rdx)
-; AVX1OR2-NEXT: vzeroupper
-; AVX1OR2-NEXT: retq
-;
-; AVX512F-LABEL: widen_fmul_v2f32_v8f32:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX512F-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX512F-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
-; AVX512F-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero
-; AVX512F-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
-; AVX512F-NEXT: vmulps %xmm4, %xmm0, %xmm0
-; AVX512F-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
-; AVX512F-NEXT: vmulps %xmm4, %xmm1, %xmm1
-; AVX512F-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
-; AVX512F-NEXT: vmulps %xmm4, %xmm2, %xmm2
-; AVX512F-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
-; AVX512F-NEXT: vmulps %xmm4, %xmm3, %xmm3
-; AVX512F-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
-; AVX512F-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX512F-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
-; AVX512F-NEXT: vmovups %ymm0, (%rdx)
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: widen_fmul_v2f32_v8f32:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX512VL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX512VL-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
-; AVX512VL-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero
-; AVX512VL-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
-; AVX512VL-NEXT: vmulps %xmm4, %xmm0, %xmm0
-; AVX512VL-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
-; AVX512VL-NEXT: vmulps %xmm4, %xmm1, %xmm1
-; AVX512VL-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
-; AVX512VL-NEXT: vmulps %xmm4, %xmm2, %xmm2
-; AVX512VL-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
-; AVX512VL-NEXT: vmulps %xmm4, %xmm3, %xmm3
-; AVX512VL-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
-; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX512VL-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
-; AVX512VL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX512VL-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
-; AVX512VL-NEXT: vmovups %ymm0, (%rdx)
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
+; AVX-LABEL: widen_fmul_v2f32_v8f32:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
+; AVX-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero
+; AVX-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
+; AVX-NEXT: vmulps %xmm4, %xmm0, %xmm0
+; AVX-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
+; AVX-NEXT: vmulps %xmm4, %xmm1, %xmm1
+; AVX-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
+; AVX-NEXT: vmulps %xmm4, %xmm2, %xmm2
+; AVX-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
+; AVX-NEXT: vmulps %xmm4, %xmm3, %xmm3
+; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; AVX-NEXT: vmovups %ymm0, (%rdx)
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
%a2 = getelementptr inbounds i8, ptr %a0, i64 8
%b2 = getelementptr inbounds i8, ptr %b0, i64 8
%c2 = getelementptr inbounds i8, ptr %c0, i64 8
diff --git a/llvm/test/CodeGen/X86/widen_fsub.ll b/llvm/test/CodeGen/X86/widen_fsub.ll
index 3256d5c6f5e3f..915f122b50386 100644
--- a/llvm/test/CodeGen/X86/widen_fsub.ll
+++ b/llvm/test/CodeGen/X86/widen_fsub.ll
@@ -65,70 +65,26 @@ define void @widen_fsub_v2f32_v8f32(ptr %a0, ptr %b0, ptr %c0) {
; SSE-NEXT: movlps %xmm3, 24(%rdx)
; SSE-NEXT: retq
;
-; AVX1OR2-LABEL: widen_fsub_v2f32_v8f32:
-; AVX1OR2: # %bb.0:
-; AVX1OR2-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX1OR2-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX1OR2-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
-; AVX1OR2-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero
-; AVX1OR2-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
-; AVX1OR2-NEXT: vsubps %xmm4, %xmm0, %xmm0
-; AVX1OR2-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
-; AVX1OR2-NEXT: vsubps %xmm4, %xmm1, %xmm1
-; AVX1OR2-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
-; AVX1OR2-NEXT: vsubps %xmm4, %xmm2, %xmm2
-; AVX1OR2-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
-; AVX1OR2-NEXT: vsubps %xmm4, %xmm3, %xmm3
-; AVX1OR2-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
-; AVX1OR2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1OR2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
-; AVX1OR2-NEXT: vmovups %ymm0, (%rdx)
-; AVX1OR2-NEXT: vzeroupper
-; AVX1OR2-NEXT: retq
-;
-; AVX512F-LABEL: widen_fsub_v2f32_v8f32:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX512F-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX512F-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
-; AVX512F-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero
-; AVX512F-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
-; AVX512F-NEXT: vsubps %xmm4, %xmm0, %xmm0
-; AVX512F-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
-; AVX512F-NEXT: vsubps %xmm4, %xmm1, %xmm1
-; AVX512F-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
-; AVX512F-NEXT: vsubps %xmm4, %xmm2, %xmm2
-; AVX512F-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
-; AVX512F-NEXT: vsubps %xmm4, %xmm3, %xmm3
-; AVX512F-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
-; AVX512F-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX512F-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
-; AVX512F-NEXT: vmovups %ymm0, (%rdx)
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: widen_fsub_v2f32_v8f32:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX512VL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX512VL-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
-; AVX512VL-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero
-; AVX512VL-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
-; AVX512VL-NEXT: vsubps %xmm4, %xmm0, %xmm0
-; AVX512VL-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
-; AVX512VL-NEXT: vsubps %xmm4, %xmm1, %xmm1
-; AVX512VL-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
-; AVX512VL-NEXT: vsubps %xmm4, %xmm2, %xmm2
-; AVX512VL-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
-; AVX512VL-NEXT: vsubps %xmm4, %xmm3, %xmm3
-; AVX512VL-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
-; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX512VL-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
-; AVX512VL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX512VL-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
-; AVX512VL-NEXT: vmovups %ymm0, (%rdx)
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
+; AVX-LABEL: widen_fsub_v2f32_v8f32:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
+; AVX-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero
+; AVX-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
+; AVX-NEXT: vsubps %xmm4, %xmm0, %xmm0
+; AVX-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
+; AVX-NEXT: vsubps %xmm4, %xmm1, %xmm1
+; AVX-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
+; AVX-NEXT: vsubps %xmm4, %xmm2, %xmm2
+; AVX-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
+; AVX-NEXT: vsubps %xmm4, %xmm3, %xmm3
+; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; AVX-NEXT: vmovups %ymm0, (%rdx)
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
%a2 = getelementptr inbounds i8, ptr %a0, i64 8
%b2 = getelementptr inbounds i8, ptr %b0, i64 8
%c2 = getelementptr inbounds i8, ptr %c0, i64 8
More information about the llvm-commits
mailing list