[llvm] 6bc3c9e - [X86] combineX86ShuffleChain - always create VPERMV3 nodes if started from a VPERMV3 node

Simon Pilgrim via llvm-commits llvm-commits at lists.llvm.org
Fri Dec 6 08:25:31 PST 2024


Author: Simon Pilgrim
Date: 2024-12-06T16:25:15Z
New Revision: 6bc3c9ee6bfbace7275dedfddef1cb8f177aa491

URL: https://github.com/llvm/llvm-project/commit/6bc3c9ee6bfbace7275dedfddef1cb8f177aa491
DIFF: https://github.com/llvm/llvm-project/commit/6bc3c9ee6bfbace7275dedfddef1cb8f177aa491.diff

LOG: [X86] combineX86ShuffleChain - always create VPERMV3 nodes if started from a VPERMV3 node

If the root shuffle node was a VPERMV3 node, then we can always replace it with a new VPERMV3 node - it doesn't matter whether other variable shuffles in the chain have multiple uses.
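
As context for the one-line functional change in the diff below, here is a minimal standalone C++ sketch of the gating decision it adjusts. This is not the LLVM sources: the enum, function name, and threshold parameters are illustrative stand-ins for the real AllowVariableCrossLaneMask / AllowVariablePerLaneMask logic in combineX86ShuffleChain.

// Standalone sketch (illustrative only, not the LLVM implementation): models
// the decision made before emitting a variable shuffle, plus the new
// always-allow rule when the root node is already a VPERMV3.
#include <cstdio>

enum RootOpcode { VPERMV3, OTHER };

// Simplified stand-in for the AllowVariable*Mask computation; the Depth and
// Threshold values are made up for the example.
static bool allowVariableShuffle(RootOpcode Root, unsigned Depth,
                                 unsigned Threshold, bool HasVariableMask) {
  // Pre-existing heuristic: only allow a variable mask once the combined
  // chain is deep enough, or if the chain already contained one.
  bool Allow = (Depth >= Threshold) || HasVariableMask;

  // New rule from this commit: a root that is already a VPERMV3 can always
  // be replaced by another VPERMV3, so force the flag on.
  if (Root == VPERMV3)
    Allow = true;

  return Allow;
}

int main() {
  // A shallow chain rooted at a VPERMV3 node: rejected by the depth
  // heuristic alone, accepted under the new rule.
  bool NewRule = allowVariableShuffle(VPERMV3, /*Depth=*/1, /*Threshold=*/2,
                                      /*HasVariableMask=*/false);
  std::printf("allowed=%d\n", NewRule); // prints allowed=1
  return 0;
}

The test updates below reflect the resulting shuffle mask and operand rewrites once the combine is permitted in these extra cases.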

Added: 
    

Modified: 
    llvm/lib/Target/X86/X86ISelLowering.cpp
    llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-5.ll
    llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-6.ll
    llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll
    llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index f66a7a80d027ec..bfab89e55fb1da 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -39835,6 +39835,10 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
   bool AllowBWIVPERMV3 =
       (Depth >= (VariableCrossLaneShuffleDepth + 2) || HasVariableMask);
 
+  // If root was a VPERMV3 node, always allow a variable shuffle.
+  if (Root.getOpcode() == X86ISD::VPERMV3)
+    AllowVariableCrossLaneMask = AllowVariablePerLaneMask = true;
+
   bool MaskContainsZeros = isAnyZero(Mask);
 
   if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) {

diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-5.ll
index 8dded96acfc12c..bb609b9938a821 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-5.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-5.ll
@@ -123,12 +123,12 @@ define void @store_i64_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512-NEXT:    vinserti128 $1, (%rcx), %ymm1, %ymm1
 ; AVX512-NEXT:    vinserti128 $1, (%rsi), %ymm0, %ymm0
 ; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512-NEXT:    vpmovsxbq {{.*#+}} xmm1 = [7,9]
+; AVX512-NEXT:    vpmovsxbq {{.*#+}} xmm3 = [3,9]
+; AVX512-NEXT:    vpermi2q %zmm2, %zmm1, %zmm3
+; AVX512-NEXT:    vpmovsxbq {{.*#+}} zmm1 = [0,2,4,6,8,1,3,5]
 ; AVX512-NEXT:    vpermi2q %zmm2, %zmm0, %zmm1
-; AVX512-NEXT:    vpmovsxbq {{.*#+}} zmm3 = [0,2,4,6,8,1,3,5]
-; AVX512-NEXT:    vpermi2q %zmm2, %zmm0, %zmm3
-; AVX512-NEXT:    vmovdqa64 %zmm3, (%r9)
-; AVX512-NEXT:    vmovdqa %xmm1, 64(%r9)
+; AVX512-NEXT:    vmovdqa %xmm3, 64(%r9)
+; AVX512-NEXT:    vmovdqa64 %zmm1, (%r9)
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
 ;
@@ -140,12 +140,12 @@ define void @store_i64_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512-FCP-NEXT:    vinserti128 $1, (%rcx), %ymm1, %ymm1
 ; AVX512-FCP-NEXT:    vinserti128 $1, (%rsi), %ymm0, %ymm0
 ; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm1 = [7,9]
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm3 = [3,9]
+; AVX512-FCP-NEXT:    vpermi2q %zmm2, %zmm1, %zmm3
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm1 = [0,2,4,6,8,1,3,5]
 ; AVX512-FCP-NEXT:    vpermi2q %zmm2, %zmm0, %zmm1
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm3 = [0,2,4,6,8,1,3,5]
-; AVX512-FCP-NEXT:    vpermi2q %zmm2, %zmm0, %zmm3
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, (%r9)
-; AVX512-FCP-NEXT:    vmovdqa %xmm1, 64(%r9)
+; AVX512-FCP-NEXT:    vmovdqa %xmm3, 64(%r9)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, (%r9)
 ; AVX512-FCP-NEXT:    vzeroupper
 ; AVX512-FCP-NEXT:    retq
 ;
@@ -157,12 +157,12 @@ define void @store_i64_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-NEXT:    vinserti128 $1, (%rcx), %ymm1, %ymm1
 ; AVX512DQ-NEXT:    vinserti128 $1, (%rsi), %ymm0, %ymm0
 ; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512DQ-NEXT:    vpmovsxbq {{.*#+}} xmm1 = [7,9]
+; AVX512DQ-NEXT:    vpmovsxbq {{.*#+}} xmm3 = [3,9]
+; AVX512DQ-NEXT:    vpermi2q %zmm2, %zmm1, %zmm3
+; AVX512DQ-NEXT:    vpmovsxbq {{.*#+}} zmm1 = [0,2,4,6,8,1,3,5]
 ; AVX512DQ-NEXT:    vpermi2q %zmm2, %zmm0, %zmm1
-; AVX512DQ-NEXT:    vpmovsxbq {{.*#+}} zmm3 = [0,2,4,6,8,1,3,5]
-; AVX512DQ-NEXT:    vpermi2q %zmm2, %zmm0, %zmm3
-; AVX512DQ-NEXT:    vmovdqa64 %zmm3, (%r9)
-; AVX512DQ-NEXT:    vmovdqa %xmm1, 64(%r9)
+; AVX512DQ-NEXT:    vmovdqa %xmm3, 64(%r9)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm1, (%r9)
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
@@ -174,12 +174,12 @@ define void @store_i64_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-FCP-NEXT:    vinserti128 $1, (%rcx), %ymm1, %ymm1
 ; AVX512DQ-FCP-NEXT:    vinserti128 $1, (%rsi), %ymm0, %ymm0
 ; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm1 = [7,9]
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm3 = [3,9]
+; AVX512DQ-FCP-NEXT:    vpermi2q %zmm2, %zmm1, %zmm3
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm1 = [0,2,4,6,8,1,3,5]
 ; AVX512DQ-FCP-NEXT:    vpermi2q %zmm2, %zmm0, %zmm1
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm3 = [0,2,4,6,8,1,3,5]
-; AVX512DQ-FCP-NEXT:    vpermi2q %zmm2, %zmm0, %zmm3
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, (%r9)
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm1, 64(%r9)
+; AVX512DQ-FCP-NEXT:    vmovdqa %xmm3, 64(%r9)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, (%r9)
 ; AVX512DQ-FCP-NEXT:    vzeroupper
 ; AVX512DQ-FCP-NEXT:    retq
 ;
@@ -191,12 +191,12 @@ define void @store_i64_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-NEXT:    vinserti128 $1, (%rcx), %ymm1, %ymm1
 ; AVX512BW-NEXT:    vinserti128 $1, (%rsi), %ymm0, %ymm0
 ; AVX512BW-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpmovsxbq {{.*#+}} xmm1 = [7,9]
+; AVX512BW-NEXT:    vpmovsxbq {{.*#+}} xmm3 = [3,9]
+; AVX512BW-NEXT:    vpermi2q %zmm2, %zmm1, %zmm3
+; AVX512BW-NEXT:    vpmovsxbq {{.*#+}} zmm1 = [0,2,4,6,8,1,3,5]
 ; AVX512BW-NEXT:    vpermi2q %zmm2, %zmm0, %zmm1
-; AVX512BW-NEXT:    vpmovsxbq {{.*#+}} zmm3 = [0,2,4,6,8,1,3,5]
-; AVX512BW-NEXT:    vpermi2q %zmm2, %zmm0, %zmm3
-; AVX512BW-NEXT:    vmovdqa64 %zmm3, (%r9)
-; AVX512BW-NEXT:    vmovdqa %xmm1, 64(%r9)
+; AVX512BW-NEXT:    vmovdqa %xmm3, 64(%r9)
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, (%r9)
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
@@ -208,12 +208,12 @@ define void @store_i64_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-FCP-NEXT:    vinserti128 $1, (%rcx), %ymm1, %ymm1
 ; AVX512BW-FCP-NEXT:    vinserti128 $1, (%rsi), %ymm0, %ymm0
 ; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512BW-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm1 = [7,9]
+; AVX512BW-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm3 = [3,9]
+; AVX512BW-FCP-NEXT:    vpermi2q %zmm2, %zmm1, %zmm3
+; AVX512BW-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm1 = [0,2,4,6,8,1,3,5]
 ; AVX512BW-FCP-NEXT:    vpermi2q %zmm2, %zmm0, %zmm1
-; AVX512BW-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm3 = [0,2,4,6,8,1,3,5]
-; AVX512BW-FCP-NEXT:    vpermi2q %zmm2, %zmm0, %zmm3
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm3, (%r9)
-; AVX512BW-FCP-NEXT:    vmovdqa %xmm1, 64(%r9)
+; AVX512BW-FCP-NEXT:    vmovdqa %xmm3, 64(%r9)
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, (%r9)
 ; AVX512BW-FCP-NEXT:    vzeroupper
 ; AVX512BW-FCP-NEXT:    retq
 ;
@@ -225,12 +225,12 @@ define void @store_i64_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-BW-NEXT:    vinserti128 $1, (%rcx), %ymm1, %ymm1
 ; AVX512DQ-BW-NEXT:    vinserti128 $1, (%rsi), %ymm0, %ymm0
 ; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512DQ-BW-NEXT:    vpmovsxbq {{.*#+}} xmm1 = [7,9]
+; AVX512DQ-BW-NEXT:    vpmovsxbq {{.*#+}} xmm3 = [3,9]
+; AVX512DQ-BW-NEXT:    vpermi2q %zmm2, %zmm1, %zmm3
+; AVX512DQ-BW-NEXT:    vpmovsxbq {{.*#+}} zmm1 = [0,2,4,6,8,1,3,5]
 ; AVX512DQ-BW-NEXT:    vpermi2q %zmm2, %zmm0, %zmm1
-; AVX512DQ-BW-NEXT:    vpmovsxbq {{.*#+}} zmm3 = [0,2,4,6,8,1,3,5]
-; AVX512DQ-BW-NEXT:    vpermi2q %zmm2, %zmm0, %zmm3
-; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm3, (%r9)
-; AVX512DQ-BW-NEXT:    vmovdqa %xmm1, 64(%r9)
+; AVX512DQ-BW-NEXT:    vmovdqa %xmm3, 64(%r9)
+; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm1, (%r9)
 ; AVX512DQ-BW-NEXT:    vzeroupper
 ; AVX512DQ-BW-NEXT:    retq
 ;
@@ -242,12 +242,12 @@ define void @store_i64_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, (%rcx), %ymm1, %ymm1
 ; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, (%rsi), %ymm0, %ymm0
 ; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm1 = [7,9]
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm3 = [3,9]
+; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm2, %zmm1, %zmm3
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm1 = [0,2,4,6,8,1,3,5]
 ; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm2, %zmm0, %zmm1
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm3 = [0,2,4,6,8,1,3,5]
-; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm2, %zmm0, %zmm3
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm3, (%r9)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa %xmm1, 64(%r9)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa %xmm3, 64(%r9)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, (%r9)
 ; AVX512DQ-BW-FCP-NEXT:    vzeroupper
 ; AVX512DQ-BW-FCP-NEXT:    retq
   %in.vec0 = load <2 x i64>, ptr %in.vecptr0, align 64

diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-6.ll
index e70975addc6760..c2f1723d8031ed 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-6.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-6.ll
@@ -138,13 +138,13 @@ define void @store_i64_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512-NEXT:    vinserti128 $1, (%rcx), %ymm1, %ymm1
 ; AVX512-NEXT:    vinserti128 $1, (%rsi), %ymm0, %ymm0
 ; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512-NEXT:    vinserti32x4 $1, (%r9), %zmm2, %zmm1
-; AVX512-NEXT:    vpmovsxbq {{.*#+}} ymm2 = [5,7,9,11]
-; AVX512-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
-; AVX512-NEXT:    vpmovsxbq {{.*#+}} zmm3 = [0,2,4,6,8,10,1,3]
-; AVX512-NEXT:    vpermi2q %zmm1, %zmm0, %zmm3
-; AVX512-NEXT:    vmovdqa64 %zmm3, (%rax)
-; AVX512-NEXT:    vmovdqa %ymm2, 64(%rax)
+; AVX512-NEXT:    vinserti32x4 $1, (%r9), %zmm2, %zmm2
+; AVX512-NEXT:    vpmovsxbq {{.*#+}} ymm3 = [1,3,9,11]
+; AVX512-NEXT:    vpermi2q %zmm2, %zmm1, %zmm3
+; AVX512-NEXT:    vpmovsxbq {{.*#+}} zmm1 = [0,2,4,6,8,10,1,3]
+; AVX512-NEXT:    vpermi2q %zmm2, %zmm0, %zmm1
+; AVX512-NEXT:    vmovdqa %ymm3, 64(%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm1, (%rax)
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
 ;
@@ -157,13 +157,13 @@ define void @store_i64_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512-FCP-NEXT:    vinserti128 $1, (%rcx), %ymm1, %ymm1
 ; AVX512-FCP-NEXT:    vinserti128 $1, (%rsi), %ymm0, %ymm0
 ; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512-FCP-NEXT:    vinserti32x4 $1, (%r9), %zmm2, %zmm1
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm2 = [5,7,9,11]
-; AVX512-FCP-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm3 = [0,2,4,6,8,10,1,3]
-; AVX512-FCP-NEXT:    vpermi2q %zmm1, %zmm0, %zmm3
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, (%rax)
-; AVX512-FCP-NEXT:    vmovdqa %ymm2, 64(%rax)
+; AVX512-FCP-NEXT:    vinserti32x4 $1, (%r9), %zmm2, %zmm2
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm3 = [1,3,9,11]
+; AVX512-FCP-NEXT:    vpermi2q %zmm2, %zmm1, %zmm3
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm1 = [0,2,4,6,8,10,1,3]
+; AVX512-FCP-NEXT:    vpermi2q %zmm2, %zmm0, %zmm1
+; AVX512-FCP-NEXT:    vmovdqa %ymm3, 64(%rax)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, (%rax)
 ; AVX512-FCP-NEXT:    vzeroupper
 ; AVX512-FCP-NEXT:    retq
 ;
@@ -176,13 +176,13 @@ define void @store_i64_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-NEXT:    vinserti128 $1, (%rcx), %ymm1, %ymm1
 ; AVX512DQ-NEXT:    vinserti128 $1, (%rsi), %ymm0, %ymm0
 ; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512DQ-NEXT:    vinserti32x4 $1, (%r9), %zmm2, %zmm1
-; AVX512DQ-NEXT:    vpmovsxbq {{.*#+}} ymm2 = [5,7,9,11]
-; AVX512DQ-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
-; AVX512DQ-NEXT:    vpmovsxbq {{.*#+}} zmm3 = [0,2,4,6,8,10,1,3]
-; AVX512DQ-NEXT:    vpermi2q %zmm1, %zmm0, %zmm3
-; AVX512DQ-NEXT:    vmovdqa64 %zmm3, (%rax)
-; AVX512DQ-NEXT:    vmovdqa %ymm2, 64(%rax)
+; AVX512DQ-NEXT:    vinserti32x4 $1, (%r9), %zmm2, %zmm2
+; AVX512DQ-NEXT:    vpmovsxbq {{.*#+}} ymm3 = [1,3,9,11]
+; AVX512DQ-NEXT:    vpermi2q %zmm2, %zmm1, %zmm3
+; AVX512DQ-NEXT:    vpmovsxbq {{.*#+}} zmm1 = [0,2,4,6,8,10,1,3]
+; AVX512DQ-NEXT:    vpermi2q %zmm2, %zmm0, %zmm1
+; AVX512DQ-NEXT:    vmovdqa %ymm3, 64(%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm1, (%rax)
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
@@ -195,13 +195,13 @@ define void @store_i64_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-FCP-NEXT:    vinserti128 $1, (%rcx), %ymm1, %ymm1
 ; AVX512DQ-FCP-NEXT:    vinserti128 $1, (%rsi), %ymm0, %ymm0
 ; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512DQ-FCP-NEXT:    vinserti32x4 $1, (%r9), %zmm2, %zmm1
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm2 = [5,7,9,11]
-; AVX512DQ-FCP-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm3 = [0,2,4,6,8,10,1,3]
-; AVX512DQ-FCP-NEXT:    vpermi2q %zmm1, %zmm0, %zmm3
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, (%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm2, 64(%rax)
+; AVX512DQ-FCP-NEXT:    vinserti32x4 $1, (%r9), %zmm2, %zmm2
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm3 = [1,3,9,11]
+; AVX512DQ-FCP-NEXT:    vpermi2q %zmm2, %zmm1, %zmm3
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm1 = [0,2,4,6,8,10,1,3]
+; AVX512DQ-FCP-NEXT:    vpermi2q %zmm2, %zmm0, %zmm1
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm3, 64(%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, (%rax)
 ; AVX512DQ-FCP-NEXT:    vzeroupper
 ; AVX512DQ-FCP-NEXT:    retq
 ;
@@ -214,13 +214,13 @@ define void @store_i64_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-NEXT:    vinserti128 $1, (%rcx), %ymm1, %ymm1
 ; AVX512BW-NEXT:    vinserti128 $1, (%rsi), %ymm0, %ymm0
 ; AVX512BW-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512BW-NEXT:    vinserti32x4 $1, (%r9), %zmm2, %zmm1
-; AVX512BW-NEXT:    vpmovsxbq {{.*#+}} ymm2 = [5,7,9,11]
-; AVX512BW-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vpmovsxbq {{.*#+}} zmm3 = [0,2,4,6,8,10,1,3]
-; AVX512BW-NEXT:    vpermi2q %zmm1, %zmm0, %zmm3
-; AVX512BW-NEXT:    vmovdqa64 %zmm3, (%rax)
-; AVX512BW-NEXT:    vmovdqa %ymm2, 64(%rax)
+; AVX512BW-NEXT:    vinserti32x4 $1, (%r9), %zmm2, %zmm2
+; AVX512BW-NEXT:    vpmovsxbq {{.*#+}} ymm3 = [1,3,9,11]
+; AVX512BW-NEXT:    vpermi2q %zmm2, %zmm1, %zmm3
+; AVX512BW-NEXT:    vpmovsxbq {{.*#+}} zmm1 = [0,2,4,6,8,10,1,3]
+; AVX512BW-NEXT:    vpermi2q %zmm2, %zmm0, %zmm1
+; AVX512BW-NEXT:    vmovdqa %ymm3, 64(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, (%rax)
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
@@ -233,13 +233,13 @@ define void @store_i64_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-FCP-NEXT:    vinserti128 $1, (%rcx), %ymm1, %ymm1
 ; AVX512BW-FCP-NEXT:    vinserti128 $1, (%rsi), %ymm0, %ymm0
 ; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512BW-FCP-NEXT:    vinserti32x4 $1, (%r9), %zmm2, %zmm1
-; AVX512BW-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm2 = [5,7,9,11]
-; AVX512BW-FCP-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
-; AVX512BW-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm3 = [0,2,4,6,8,10,1,3]
-; AVX512BW-FCP-NEXT:    vpermi2q %zmm1, %zmm0, %zmm3
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm3, (%rax)
-; AVX512BW-FCP-NEXT:    vmovdqa %ymm2, 64(%rax)
+; AVX512BW-FCP-NEXT:    vinserti32x4 $1, (%r9), %zmm2, %zmm2
+; AVX512BW-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm3 = [1,3,9,11]
+; AVX512BW-FCP-NEXT:    vpermi2q %zmm2, %zmm1, %zmm3
+; AVX512BW-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm1 = [0,2,4,6,8,10,1,3]
+; AVX512BW-FCP-NEXT:    vpermi2q %zmm2, %zmm0, %zmm1
+; AVX512BW-FCP-NEXT:    vmovdqa %ymm3, 64(%rax)
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, (%rax)
 ; AVX512BW-FCP-NEXT:    vzeroupper
 ; AVX512BW-FCP-NEXT:    retq
 ;
@@ -252,13 +252,13 @@ define void @store_i64_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-BW-NEXT:    vinserti128 $1, (%rcx), %ymm1, %ymm1
 ; AVX512DQ-BW-NEXT:    vinserti128 $1, (%rsi), %ymm0, %ymm0
 ; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512DQ-BW-NEXT:    vinserti32x4 $1, (%r9), %zmm2, %zmm1
-; AVX512DQ-BW-NEXT:    vpmovsxbq {{.*#+}} ymm2 = [5,7,9,11]
-; AVX512DQ-BW-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
-; AVX512DQ-BW-NEXT:    vpmovsxbq {{.*#+}} zmm3 = [0,2,4,6,8,10,1,3]
-; AVX512DQ-BW-NEXT:    vpermi2q %zmm1, %zmm0, %zmm3
-; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm3, (%rax)
-; AVX512DQ-BW-NEXT:    vmovdqa %ymm2, 64(%rax)
+; AVX512DQ-BW-NEXT:    vinserti32x4 $1, (%r9), %zmm2, %zmm2
+; AVX512DQ-BW-NEXT:    vpmovsxbq {{.*#+}} ymm3 = [1,3,9,11]
+; AVX512DQ-BW-NEXT:    vpermi2q %zmm2, %zmm1, %zmm3
+; AVX512DQ-BW-NEXT:    vpmovsxbq {{.*#+}} zmm1 = [0,2,4,6,8,10,1,3]
+; AVX512DQ-BW-NEXT:    vpermi2q %zmm2, %zmm0, %zmm1
+; AVX512DQ-BW-NEXT:    vmovdqa %ymm3, 64(%rax)
+; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm1, (%rax)
 ; AVX512DQ-BW-NEXT:    vzeroupper
 ; AVX512DQ-BW-NEXT:    retq
 ;
@@ -271,13 +271,13 @@ define void @store_i64_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, (%rcx), %ymm1, %ymm1
 ; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, (%rsi), %ymm0, %ymm0
 ; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $1, (%r9), %zmm2, %zmm1
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm2 = [5,7,9,11]
-; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm3 = [0,2,4,6,8,10,1,3]
-; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm1, %zmm0, %zmm3
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm3, (%rax)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm2, 64(%rax)
+; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $1, (%r9), %zmm2, %zmm2
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm3 = [1,3,9,11]
+; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm2, %zmm1, %zmm3
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm1 = [0,2,4,6,8,10,1,3]
+; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm2, %zmm0, %zmm1
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm3, 64(%rax)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, (%rax)
 ; AVX512DQ-BW-FCP-NEXT:    vzeroupper
 ; AVX512DQ-BW-FCP-NEXT:    retq
   %in.vec0 = load <2 x i64>, ptr %in.vecptr0, align 64

diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll
index b87abded10819c..941b18db0931ad 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll
@@ -5585,12 +5585,12 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-FCP-LABEL: store_i8_stride5_vf64:
 ; AVX512BW-FCP:       # %bb.0:
 ; AVX512BW-FCP-NEXT:    vmovdqa64 (%r8), %zmm0
-; AVX512BW-FCP-NEXT:    vmovdqa64 32(%rdx), %ymm21
+; AVX512BW-FCP-NEXT:    vmovdqa 32(%rdx), %ymm8
 ; AVX512BW-FCP-NEXT:    vmovdqa {{.*#+}} ymm6 = [128,128,12,13,128,128,128,128,14,128,128,128,14,15,128,128,128,128,16,128,128,128,16,17,128,128,128,128,18,128,128,128]
-; AVX512BW-FCP-NEXT:    vpshufb %ymm6, %ymm21, %ymm1
-; AVX512BW-FCP-NEXT:    vmovdqa 32(%rcx), %ymm8
+; AVX512BW-FCP-NEXT:    vpshufb %ymm6, %ymm8, %ymm1
+; AVX512BW-FCP-NEXT:    vmovdqa64 32(%rcx), %ymm21
 ; AVX512BW-FCP-NEXT:    vmovdqa {{.*#+}} ymm7 = [128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128]
-; AVX512BW-FCP-NEXT:    vpshufb %ymm7, %ymm8, %ymm2
+; AVX512BW-FCP-NEXT:    vpshufb %ymm7, %ymm21, %ymm2
 ; AVX512BW-FCP-NEXT:    vpor %ymm1, %ymm2, %ymm1
 ; AVX512BW-FCP-NEXT:    vmovdqa (%rcx), %xmm11
 ; AVX512BW-FCP-NEXT:    vmovdqa 32(%rcx), %xmm2
@@ -5598,77 +5598,76 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-FCP-NEXT:    vpshufb %xmm14, %xmm2, %xmm3
 ; AVX512BW-FCP-NEXT:    vmovdqa (%rdx), %xmm12
 ; AVX512BW-FCP-NEXT:    vmovdqa 32(%rdx), %xmm4
-; AVX512BW-FCP-NEXT:    vmovdqa {{.*#+}} xmm15 = [6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128]
-; AVX512BW-FCP-NEXT:    vpshufb %xmm15, %xmm4, %xmm5
+; AVX512BW-FCP-NEXT:    vmovdqa64 {{.*#+}} xmm17 = [6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128]
+; AVX512BW-FCP-NEXT:    vpshufb %xmm17, %xmm4, %xmm5
 ; AVX512BW-FCP-NEXT:    vpor %xmm3, %xmm5, %xmm3
 ; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,0,1,1]
 ; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm3, %zmm1
 ; AVX512BW-FCP-NEXT:    vmovdqa (%rdi), %xmm13
 ; AVX512BW-FCP-NEXT:    vmovdqa 32(%rdi), %xmm3
-; AVX512BW-FCP-NEXT:    vmovdqa64 {{.*#+}} xmm17 = [8,128,u,7,128,9,128,u,128,u,10,128,12,128,u,11]
-; AVX512BW-FCP-NEXT:    vpshufb %xmm17, %xmm3, %xmm9
-; AVX512BW-FCP-NEXT:    vmovdqa64 (%rsi), %xmm16
+; AVX512BW-FCP-NEXT:    vmovdqa64 {{.*#+}} xmm19 = [8,128,u,7,128,9,128,u,128,u,10,128,12,128,u,11]
+; AVX512BW-FCP-NEXT:    vpshufb %xmm19, %xmm3, %xmm9
+; AVX512BW-FCP-NEXT:    vmovdqa64 (%rsi), %xmm18
 ; AVX512BW-FCP-NEXT:    vmovdqa 32(%rsi), %xmm5
 ; AVX512BW-FCP-NEXT:    vmovdqa64 {{.*#+}} xmm20 = [128,8,u,128,7,128,9,u,11,u,128,10,128,12,u,128]
 ; AVX512BW-FCP-NEXT:    vpshufb %xmm20, %xmm5, %xmm10
 ; AVX512BW-FCP-NEXT:    vpor %xmm9, %xmm10, %xmm9
-; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm18 = ymm9[0,0,1,1]
-; AVX512BW-FCP-NEXT:    vmovdqa64 32(%rdi), %ymm19
+; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm15 = ymm9[0,0,1,1]
+; AVX512BW-FCP-NEXT:    vmovdqa64 32(%rdi), %ymm16
 ; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm9 = [3,3,3,0,4,4,4,4]
-; AVX512BW-FCP-NEXT:    vpermd %ymm19, %ymm9, %ymm22
+; AVX512BW-FCP-NEXT:    vpermd %ymm16, %ymm9, %ymm22
 ; AVX512BW-FCP-NEXT:    vmovdqa64 32(%rsi), %ymm23
 ; AVX512BW-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm10 = [0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14]
 ; AVX512BW-FCP-NEXT:    movl $138547332, %eax # imm = 0x8421084
 ; AVX512BW-FCP-NEXT:    kmovd %eax, %k1
 ; AVX512BW-FCP-NEXT:    vpshufb %ymm10, %ymm23, %ymm22 {%k1}
-; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm22, %zmm18, %zmm18
+; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm22, %zmm15, %zmm15
 ; AVX512BW-FCP-NEXT:    movabsq $-8330787646191410408, %rax # imm = 0x8C6318C6318C6318
 ; AVX512BW-FCP-NEXT:    kmovq %rax, %k2
-; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm18, %zmm1 {%k2}
-; AVX512BW-FCP-NEXT:    vmovdqa64 32(%r8), %ymm22
-; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm18 = [1,1,2,2,2,2,2,2,27,27,27,27,0,28,28,28]
-; AVX512BW-FCP-NEXT:    vpermi2d %zmm0, %zmm22, %zmm18
+; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm15, %zmm1 {%k2}
+; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm15 = [9,9,10,10,10,10,10,10,11,11,11,11,0,12,12,12]
+; AVX512BW-FCP-NEXT:    vpermd %zmm0, %zmm15, %zmm15
 ; AVX512BW-FCP-NEXT:    movabsq $4760450083537948804, %rax # imm = 0x4210842108421084
 ; AVX512BW-FCP-NEXT:    kmovq %rax, %k3
-; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm18, %zmm1 {%k3}
-; AVX512BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm23 = zmm23[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm18 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,128,128,26,128,28,128,128,128,128,29,128,31,128,128,30]
-; AVX512BW-FCP-NEXT:    vpshufb %zmm18, %zmm23, %zmm23
-; AVX512BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm24 = zmm19[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm19 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,26,128,28,128,128,27,128,29,128,31,128,128,30,128]
-; AVX512BW-FCP-NEXT:    vpshufb %zmm19, %zmm24, %zmm24
-; AVX512BW-FCP-NEXT:    vporq %zmm23, %zmm24, %zmm23
-; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} zmm23 = zmm23[2,2,3,3,6,6,7,7]
+; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm15, %zmm1 {%k3}
+; AVX512BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm22 = zmm23[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm15 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,128,128,26,128,28,128,128,128,128,29,128,31,128,128,30]
+; AVX512BW-FCP-NEXT:    vpshufb %zmm15, %zmm22, %zmm22
+; AVX512BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm23 = zmm16[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm16 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,26,128,28,128,128,27,128,29,128,31,128,128,30,128]
+; AVX512BW-FCP-NEXT:    vpshufb %zmm16, %zmm23, %zmm23
+; AVX512BW-FCP-NEXT:    vporq %zmm22, %zmm23, %zmm22
+; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} zmm22 = zmm22[2,2,3,3,6,6,7,7]
+; AVX512BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm23 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128]
+; AVX512BW-FCP-NEXT:    vpshufb %zmm23, %zmm8, %zmm8
 ; AVX512BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm21 = zmm21[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm24 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128]
+; AVX512BW-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm24 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128,128]
 ; AVX512BW-FCP-NEXT:    vpshufb %zmm24, %zmm21, %zmm21
-; AVX512BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm25 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128,128]
-; AVX512BW-FCP-NEXT:    vpshufb %zmm25, %zmm8, %zmm8
-; AVX512BW-FCP-NEXT:    vporq %zmm21, %zmm8, %zmm8
+; AVX512BW-FCP-NEXT:    vporq %zmm8, %zmm21, %zmm8
 ; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} zmm8 = zmm8[2,2,3,3,6,6,7,7]
 ; AVX512BW-FCP-NEXT:    movabsq $1785168781326730801, %rax # imm = 0x18C6318C6318C631
 ; AVX512BW-FCP-NEXT:    kmovq %rax, %k3
-; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm23, %zmm8 {%k3}
-; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm21 = [4,6,5,5,5,5,4,6,30,30,30,30,31,31,31,31]
-; AVX512BW-FCP-NEXT:    vpermi2d %zmm0, %zmm22, %zmm21
+; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm22, %zmm8 {%k3}
+; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm21 = [12,14,13,13,13,13,12,14,14,14,14,14,15,15,15,15]
+; AVX512BW-FCP-NEXT:    vpermd %zmm0, %zmm21, %zmm21
 ; AVX512BW-FCP-NEXT:    movabsq $-8925843906633654008, %rax # imm = 0x8421084210842108
 ; AVX512BW-FCP-NEXT:    kmovq %rax, %k4
 ; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm21, %zmm8 {%k4}
 ; AVX512BW-FCP-NEXT:    vpshufb %xmm14, %xmm11, %xmm14
-; AVX512BW-FCP-NEXT:    vpshufb %xmm15, %xmm12, %xmm15
-; AVX512BW-FCP-NEXT:    vpor %xmm14, %xmm15, %xmm14
+; AVX512BW-FCP-NEXT:    vpshufb %xmm17, %xmm12, %xmm17
+; AVX512BW-FCP-NEXT:    vporq %xmm14, %xmm17, %xmm14
 ; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3],xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7]
 ; AVX512BW-FCP-NEXT:    vmovdqa {{.*#+}} xmm12 = [2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8]
 ; AVX512BW-FCP-NEXT:    vpshufb %xmm12, %xmm11, %xmm11
 ; AVX512BW-FCP-NEXT:    vinserti32x4 $2, %xmm14, %zmm11, %zmm11
 ; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} zmm14 = zmm11[0,0,1,1,4,4,5,5]
-; AVX512BW-FCP-NEXT:    vpshufb %xmm17, %xmm13, %xmm11
-; AVX512BW-FCP-NEXT:    vpshufb %xmm20, %xmm16, %xmm15
-; AVX512BW-FCP-NEXT:    vpor %xmm11, %xmm15, %xmm11
-; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm13 = xmm13[0],xmm16[0],xmm13[1],xmm16[1],xmm13[2],xmm16[2],xmm13[3],xmm16[3],xmm13[4],xmm16[4],xmm13[5],xmm16[5],xmm13[6],xmm16[6],xmm13[7],xmm16[7]
-; AVX512BW-FCP-NEXT:    vmovdqa {{.*#+}} xmm15 = [0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13]
-; AVX512BW-FCP-NEXT:    vpshufb %xmm15, %xmm13, %xmm13
+; AVX512BW-FCP-NEXT:    vpshufb %xmm19, %xmm13, %xmm11
+; AVX512BW-FCP-NEXT:    vpshufb %xmm20, %xmm18, %xmm17
+; AVX512BW-FCP-NEXT:    vporq %xmm11, %xmm17, %xmm11
+; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm13 = xmm13[0],xmm18[0],xmm13[1],xmm18[1],xmm13[2],xmm18[2],xmm13[3],xmm18[3],xmm13[4],xmm18[4],xmm13[5],xmm18[5],xmm13[6],xmm18[6],xmm13[7],xmm18[7]
+; AVX512BW-FCP-NEXT:    vmovdqa64 {{.*#+}} xmm17 = [0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13]
+; AVX512BW-FCP-NEXT:    vpshufb %xmm17, %xmm13, %xmm13
 ; AVX512BW-FCP-NEXT:    vinserti32x4 $2, %xmm11, %zmm13, %zmm11
 ; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} zmm11 = zmm11[0,0,1,1,4,4,5,5]
 ; AVX512BW-FCP-NEXT:    movabsq $-4165393823095705204, %rax # imm = 0xC6318C6318C6318C
@@ -5682,23 +5681,23 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm14, %zmm11 {%k4}
 ; AVX512BW-FCP-NEXT:    vmovdqa (%rdx), %ymm14
 ; AVX512BW-FCP-NEXT:    vpshufb %ymm6, %ymm14, %ymm6
-; AVX512BW-FCP-NEXT:    vmovdqa64 (%rcx), %ymm16
-; AVX512BW-FCP-NEXT:    vpshufb %ymm7, %ymm16, %ymm7
+; AVX512BW-FCP-NEXT:    vmovdqa64 (%rcx), %ymm18
+; AVX512BW-FCP-NEXT:    vpshufb %ymm7, %ymm18, %ymm7
 ; AVX512BW-FCP-NEXT:    vpor %ymm6, %ymm7, %ymm6
-; AVX512BW-FCP-NEXT:    vpshufb %ymm24, %ymm14, %ymm7
-; AVX512BW-FCP-NEXT:    vpshufb %ymm25, %ymm16, %ymm17
-; AVX512BW-FCP-NEXT:    vporq %ymm7, %ymm17, %ymm7
+; AVX512BW-FCP-NEXT:    vpshufb %ymm23, %ymm14, %ymm7
+; AVX512BW-FCP-NEXT:    vpshufb %ymm24, %ymm18, %ymm19
+; AVX512BW-FCP-NEXT:    vporq %ymm7, %ymm19, %ymm7
 ; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3]
 ; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm7, %zmm6, %zmm6
 ; AVX512BW-FCP-NEXT:    vmovdqa (%rsi), %ymm7
-; AVX512BW-FCP-NEXT:    vpshufb %ymm18, %ymm7, %ymm17
-; AVX512BW-FCP-NEXT:    vmovdqa64 (%rdi), %ymm18
-; AVX512BW-FCP-NEXT:    vpshufb %ymm19, %ymm18, %ymm19
-; AVX512BW-FCP-NEXT:    vporq %ymm17, %ymm19, %ymm17
-; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm17 = ymm17[2,2,3,3]
-; AVX512BW-FCP-NEXT:    vpermd %ymm18, %ymm9, %ymm9
+; AVX512BW-FCP-NEXT:    vpshufb %ymm15, %ymm7, %ymm15
+; AVX512BW-FCP-NEXT:    vmovdqa64 (%rdi), %ymm19
+; AVX512BW-FCP-NEXT:    vpshufb %ymm16, %ymm19, %ymm16
+; AVX512BW-FCP-NEXT:    vporq %ymm15, %ymm16, %ymm15
+; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[2,2,3,3]
+; AVX512BW-FCP-NEXT:    vpermd %ymm19, %ymm9, %ymm9
 ; AVX512BW-FCP-NEXT:    vpshufb %ymm10, %ymm7, %ymm9 {%k1}
-; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm17, %zmm9, %zmm9
+; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm15, %zmm9, %zmm9
 ; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm6, %zmm9 {%k2}
 ; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm6 = [3,3,3,3,0,4,4,4,12,14,13,13,13,13,12,14]
 ; AVX512BW-FCP-NEXT:    vpermd %zmm13, %zmm6, %zmm6
@@ -5707,15 +5706,15 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm6, %zmm9 {%k1}
 ; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
 ; AVX512BW-FCP-NEXT:    vpshufb %xmm12, %xmm2, %xmm2
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm16[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm16[27],zero,zero,ymm16[26],zero,ymm16[28],zero,ymm16[30],zero,zero,ymm16[29],zero,ymm16[31],zero
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm18[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm18[27],zero,zero,ymm18[26],zero,ymm18[28],zero,ymm18[30],zero,zero,ymm18[29],zero,ymm18[31],zero
 ; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm6 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm14[27],zero,zero,ymm14[26],zero,ymm14[28],zero,ymm14[30],zero,zero,ymm14[29],zero,ymm14[31],zero,zero
 ; AVX512BW-FCP-NEXT:    vpor %ymm4, %ymm6, %ymm4
 ; AVX512BW-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm6 = [2,2,3,3,8,8,9,9]
 ; AVX512BW-FCP-NEXT:    vpermt2q %zmm2, %zmm6, %zmm4
 ; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
-; AVX512BW-FCP-NEXT:    vpshufb %xmm15, %xmm2, %xmm2
+; AVX512BW-FCP-NEXT:    vpshufb %xmm17, %xmm2, %xmm2
 ; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,ymm7[26],zero,ymm7[28],zero,zero,zero,zero,ymm7[29],zero,ymm7[31],zero,zero,ymm7[30]
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm5 = ymm18[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm18[26],zero,ymm18[28],zero,zero,ymm18[27],zero,ymm18[29],zero,ymm18[31],zero,zero,ymm18[30],zero
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm5 = ymm19[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm19[26],zero,ymm19[28],zero,zero,ymm19[27],zero,ymm19[29],zero,ymm19[31],zero,zero,ymm19[30],zero
 ; AVX512BW-FCP-NEXT:    vpor %ymm3, %ymm5, %ymm3
 ; AVX512BW-FCP-NEXT:    vpermt2q %zmm2, %zmm6, %zmm3
 ; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm4, %zmm3 {%k3}
@@ -5909,12 +5908,12 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-BW-FCP-LABEL: store_i8_stride5_vf64:
 ; AVX512DQ-BW-FCP:       # %bb.0:
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%r8), %zmm0
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 32(%rdx), %ymm21
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa 32(%rdx), %ymm8
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa {{.*#+}} ymm6 = [128,128,12,13,128,128,128,128,14,128,128,128,14,15,128,128,128,128,16,128,128,128,16,17,128,128,128,128,18,128,128,128]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm6, %ymm21, %ymm1
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 32(%rcx), %ymm8
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm6, %ymm8, %ymm1
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 32(%rcx), %ymm21
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa {{.*#+}} ymm7 = [128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm7, %ymm8, %ymm2
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm7, %ymm21, %ymm2
 ; AVX512DQ-BW-FCP-NEXT:    vpor %ymm1, %ymm2, %ymm1
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rcx), %xmm11
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa 32(%rcx), %xmm2
@@ -5922,77 +5921,76 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm14, %xmm2, %xmm3
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rdx), %xmm12
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa 32(%rdx), %xmm4
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa {{.*#+}} xmm15 = [6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm15, %xmm4, %xmm5
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 {{.*#+}} xmm17 = [6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm17, %xmm4, %xmm5
 ; AVX512DQ-BW-FCP-NEXT:    vpor %xmm3, %xmm5, %xmm3
 ; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,0,1,1]
 ; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm3, %zmm1
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rdi), %xmm13
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa 32(%rdi), %xmm3
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 {{.*#+}} xmm17 = [8,128,u,7,128,9,128,u,128,u,10,128,12,128,u,11]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm17, %xmm3, %xmm9
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rsi), %xmm16
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 {{.*#+}} xmm19 = [8,128,u,7,128,9,128,u,128,u,10,128,12,128,u,11]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm19, %xmm3, %xmm9
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rsi), %xmm18
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa 32(%rsi), %xmm5
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 {{.*#+}} xmm20 = [128,8,u,128,7,128,9,u,11,u,128,10,128,12,u,128]
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm20, %xmm5, %xmm10
 ; AVX512DQ-BW-FCP-NEXT:    vpor %xmm9, %xmm10, %xmm9
-; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} ymm18 = ymm9[0,0,1,1]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 32(%rdi), %ymm19
+; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} ymm15 = ymm9[0,0,1,1]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 32(%rdi), %ymm16
 ; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm9 = [3,3,3,0,4,4,4,4]
-; AVX512DQ-BW-FCP-NEXT:    vpermd %ymm19, %ymm9, %ymm22
+; AVX512DQ-BW-FCP-NEXT:    vpermd %ymm16, %ymm9, %ymm22
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 32(%rsi), %ymm23
 ; AVX512DQ-BW-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm10 = [0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14]
 ; AVX512DQ-BW-FCP-NEXT:    movl $138547332, %eax # imm = 0x8421084
 ; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k1
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm10, %ymm23, %ymm22 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm22, %zmm18, %zmm18
+; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm22, %zmm15, %zmm15
 ; AVX512DQ-BW-FCP-NEXT:    movabsq $-8330787646191410408, %rax # imm = 0x8C6318C6318C6318
 ; AVX512DQ-BW-FCP-NEXT:    kmovq %rax, %k2
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm18, %zmm1 {%k2}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 32(%r8), %ymm22
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm18 = [1,1,2,2,2,2,2,2,27,27,27,27,0,28,28,28]
-; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm0, %zmm22, %zmm18
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm15, %zmm1 {%k2}
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm15 = [9,9,10,10,10,10,10,10,11,11,11,11,0,12,12,12]
+; AVX512DQ-BW-FCP-NEXT:    vpermd %zmm0, %zmm15, %zmm15
 ; AVX512DQ-BW-FCP-NEXT:    movabsq $4760450083537948804, %rax # imm = 0x4210842108421084
 ; AVX512DQ-BW-FCP-NEXT:    kmovq %rax, %k3
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm18, %zmm1 {%k3}
-; AVX512DQ-BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm23 = zmm23[0,1,2,3],mem[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm18 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,128,128,26,128,28,128,128,128,128,29,128,31,128,128,30]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %zmm18, %zmm23, %zmm23
-; AVX512DQ-BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm24 = zmm19[0,1,2,3],mem[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm19 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,26,128,28,128,128,27,128,29,128,31,128,128,30,128]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %zmm19, %zmm24, %zmm24
-; AVX512DQ-BW-FCP-NEXT:    vporq %zmm23, %zmm24, %zmm23
-; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} zmm23 = zmm23[2,2,3,3,6,6,7,7]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm15, %zmm1 {%k3}
+; AVX512DQ-BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm22 = zmm23[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm15 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,128,128,26,128,28,128,128,128,128,29,128,31,128,128,30]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %zmm15, %zmm22, %zmm22
+; AVX512DQ-BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm23 = zmm16[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm16 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,26,128,28,128,128,27,128,29,128,31,128,128,30,128]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %zmm16, %zmm23, %zmm23
+; AVX512DQ-BW-FCP-NEXT:    vporq %zmm22, %zmm23, %zmm22
+; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} zmm22 = zmm22[2,2,3,3,6,6,7,7]
+; AVX512DQ-BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm23 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %zmm23, %zmm8, %zmm8
 ; AVX512DQ-BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm21 = zmm21[0,1,2,3],mem[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm24 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm24 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128,128]
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb %zmm24, %zmm21, %zmm21
-; AVX512DQ-BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,2,3],mem[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm25 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128,128]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %zmm25, %zmm8, %zmm8
-; AVX512DQ-BW-FCP-NEXT:    vporq %zmm21, %zmm8, %zmm8
+; AVX512DQ-BW-FCP-NEXT:    vporq %zmm8, %zmm21, %zmm8
 ; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} zmm8 = zmm8[2,2,3,3,6,6,7,7]
 ; AVX512DQ-BW-FCP-NEXT:    movabsq $1785168781326730801, %rax # imm = 0x18C6318C6318C631
 ; AVX512DQ-BW-FCP-NEXT:    kmovq %rax, %k3
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm23, %zmm8 {%k3}
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm21 = [4,6,5,5,5,5,4,6,30,30,30,30,31,31,31,31]
-; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm0, %zmm22, %zmm21
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm22, %zmm8 {%k3}
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm21 = [12,14,13,13,13,13,12,14,14,14,14,14,15,15,15,15]
+; AVX512DQ-BW-FCP-NEXT:    vpermd %zmm0, %zmm21, %zmm21
 ; AVX512DQ-BW-FCP-NEXT:    movabsq $-8925843906633654008, %rax # imm = 0x8421084210842108
 ; AVX512DQ-BW-FCP-NEXT:    kmovq %rax, %k4
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm21, %zmm8 {%k4}
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm14, %xmm11, %xmm14
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm15, %xmm12, %xmm15
-; AVX512DQ-BW-FCP-NEXT:    vpor %xmm14, %xmm15, %xmm14
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm17, %xmm12, %xmm17
+; AVX512DQ-BW-FCP-NEXT:    vporq %xmm14, %xmm17, %xmm14
 ; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3],xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7]
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa {{.*#+}} xmm12 = [2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8]
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm12, %xmm11, %xmm11
 ; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $2, %xmm14, %zmm11, %zmm11
 ; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} zmm14 = zmm11[0,0,1,1,4,4,5,5]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm17, %xmm13, %xmm11
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm20, %xmm16, %xmm15
-; AVX512DQ-BW-FCP-NEXT:    vpor %xmm11, %xmm15, %xmm11
-; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm13 = xmm13[0],xmm16[0],xmm13[1],xmm16[1],xmm13[2],xmm16[2],xmm13[3],xmm16[3],xmm13[4],xmm16[4],xmm13[5],xmm16[5],xmm13[6],xmm16[6],xmm13[7],xmm16[7]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa {{.*#+}} xmm15 = [0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm15, %xmm13, %xmm13
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm19, %xmm13, %xmm11
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm20, %xmm18, %xmm17
+; AVX512DQ-BW-FCP-NEXT:    vporq %xmm11, %xmm17, %xmm11
+; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm13 = xmm13[0],xmm18[0],xmm13[1],xmm18[1],xmm13[2],xmm18[2],xmm13[3],xmm18[3],xmm13[4],xmm18[4],xmm13[5],xmm18[5],xmm13[6],xmm18[6],xmm13[7],xmm18[7]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 {{.*#+}} xmm17 = [0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm17, %xmm13, %xmm13
 ; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $2, %xmm11, %zmm13, %zmm11
 ; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} zmm11 = zmm11[0,0,1,1,4,4,5,5]
 ; AVX512DQ-BW-FCP-NEXT:    movabsq $-4165393823095705204, %rax # imm = 0xC6318C6318C6318C
@@ -6006,23 +6004,23 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm14, %zmm11 {%k4}
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rdx), %ymm14
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm6, %ymm14, %ymm6
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rcx), %ymm16
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm7, %ymm16, %ymm7
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rcx), %ymm18
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm7, %ymm18, %ymm7
 ; AVX512DQ-BW-FCP-NEXT:    vpor %ymm6, %ymm7, %ymm6
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm24, %ymm14, %ymm7
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm25, %ymm16, %ymm17
-; AVX512DQ-BW-FCP-NEXT:    vporq %ymm7, %ymm17, %ymm7
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm23, %ymm14, %ymm7
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm24, %ymm18, %ymm19
+; AVX512DQ-BW-FCP-NEXT:    vporq %ymm7, %ymm19, %ymm7
 ; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3]
 ; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm7, %zmm6, %zmm6
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rsi), %ymm7
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm18, %ymm7, %ymm17
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rdi), %ymm18
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm19, %ymm18, %ymm19
-; AVX512DQ-BW-FCP-NEXT:    vporq %ymm17, %ymm19, %ymm17
-; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} ymm17 = ymm17[2,2,3,3]
-; AVX512DQ-BW-FCP-NEXT:    vpermd %ymm18, %ymm9, %ymm9
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm15, %ymm7, %ymm15
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rdi), %ymm19
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm16, %ymm19, %ymm16
+; AVX512DQ-BW-FCP-NEXT:    vporq %ymm15, %ymm16, %ymm15
+; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[2,2,3,3]
+; AVX512DQ-BW-FCP-NEXT:    vpermd %ymm19, %ymm9, %ymm9
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm10, %ymm7, %ymm9 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm17, %zmm9, %zmm9
+; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm15, %zmm9, %zmm9
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm6, %zmm9 {%k2}
 ; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm6 = [3,3,3,3,0,4,4,4,12,14,13,13,13,13,12,14]
 ; AVX512DQ-BW-FCP-NEXT:    vpermd %zmm13, %zmm6, %zmm6
@@ -6031,15 +6029,15 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm6, %zmm9 {%k1}
 ; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm12, %xmm2, %xmm2
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm16[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm16[27],zero,zero,ymm16[26],zero,ymm16[28],zero,ymm16[30],zero,zero,ymm16[29],zero,ymm16[31],zero
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm18[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm18[27],zero,zero,ymm18[26],zero,ymm18[28],zero,ymm18[30],zero,zero,ymm18[29],zero,ymm18[31],zero
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm6 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm14[27],zero,zero,ymm14[26],zero,ymm14[28],zero,ymm14[30],zero,zero,ymm14[29],zero,ymm14[31],zero,zero
 ; AVX512DQ-BW-FCP-NEXT:    vpor %ymm4, %ymm6, %ymm4
 ; AVX512DQ-BW-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm6 = [2,2,3,3,8,8,9,9]
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm2, %zmm6, %zmm4
 ; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm15, %xmm2, %xmm2
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm17, %xmm2, %xmm2
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,ymm7[26],zero,ymm7[28],zero,zero,zero,zero,ymm7[29],zero,ymm7[31],zero,zero,ymm7[30]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm5 = ymm18[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm18[26],zero,ymm18[28],zero,zero,ymm18[27],zero,ymm18[29],zero,ymm18[31],zero,zero,ymm18[30],zero
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm5 = ymm19[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm19[26],zero,ymm19[28],zero,zero,ymm19[27],zero,ymm19[29],zero,ymm19[31],zero,zero,ymm19[30],zero
 ; AVX512DQ-BW-FCP-NEXT:    vpor %ymm3, %ymm5, %ymm3
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm2, %zmm6, %zmm3
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm4, %zmm3 {%k3}

diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll
index 4d9cc3c8a7dcbc..218a492fb0e427 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll
@@ -1599,15 +1599,15 @@ define void @store_i8_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-FCP-NEXT:    vinserti128 $1, (%rsi), %ymm0, %ymm0
 ; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm3
 ; AVX512BW-FCP-NEXT:    vinserti32x4 $1, (%r9), %zmm2, %zmm2
-; AVX512BW-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm4 = [0,2,0,2,8,10,9,11]
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm5
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm3, %zmm4, %zmm5
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} zmm5 = zmm5[u,u,0,8,u,u,u,u,1,9,u,u,u,u,2,10,u,u,u,u,19,27,u,u,u,u,20,28,u,u,u,u,u,u,u,u,38,46,u,u,u,u,39,47,u,u,u,u,48,56,u,u,u,u,49,57,u,u,u,u,50,58,u,u]
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm1, %zmm4, %zmm3
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} zmm3 = zmm3[0,8,u,u,u,u,1,9,u,u,u,u,2,10,u,u,u,u,19,27,u,u,u,u,20,28,u,u,u,u,21,29,37,45,u,u,u,u,38,46,u,u,u,u,39,47,u,u,u,u,48,56,u,u,u,u,49,57,u,u,u,u,50,58]
+; AVX512BW-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm4 = [4,6,4,6,8,10,9,11]
+; AVX512BW-FCP-NEXT:    vpermi2q %zmm3, %zmm3, %zmm4
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} zmm4 = zmm4[u,u,0,8,u,u,u,u,1,9,u,u,u,u,2,10,u,u,u,u,19,27,u,u,u,u,20,28,u,u,u,u,u,u,u,u,38,46,u,u,u,u,39,47,u,u,u,u,48,56,u,u,u,u,49,57,u,u,u,u,50,58,u,u]
+; AVX512BW-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm5 = [0,2,0,2,12,14,13,15]
+; AVX512BW-FCP-NEXT:    vpermi2q %zmm3, %zmm3, %zmm5
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} zmm3 = zmm5[0,8,u,u,u,u,1,9,u,u,u,u,2,10,u,u,u,u,19,27,u,u,u,u,20,28,u,u,u,u,21,29,37,45,u,u,u,u,38,46,u,u,u,u,39,47,u,u,u,u,48,56,u,u,u,u,49,57,u,u,u,u,50,58]
 ; AVX512BW-FCP-NEXT:    movl $1227105426, %ecx # imm = 0x49242492
 ; AVX512BW-FCP-NEXT:    kmovd %ecx, %k1
-; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm5, %zmm3 {%k1}
+; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm4, %zmm3 {%k1}
 ; AVX512BW-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm4 = [0,2,0,2,0,2,1,3]
 ; AVX512BW-FCP-NEXT:    vpermq %zmm2, %zmm4, %zmm4
 ; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} zmm4 = zmm4[u,u,u,u,0,8,u,u,u,u,1,9,u,u,u,u,18,26,u,u,u,u,19,27,u,u,u,u,20,28,u,u,u,u,37,45,u,u,u,u,38,46,u,u,u,u,39,47,u,u,u,u,48,56,u,u,u,u,49,57,u,u,u,u]
@@ -1685,15 +1685,15 @@ define void @store_i8_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, (%rsi), %ymm0, %ymm0
 ; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm3
 ; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $1, (%r9), %zmm2, %zmm2
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm4 = [0,2,0,2,8,10,9,11]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm5
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm3, %zmm4, %zmm5
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} zmm5 = zmm5[u,u,0,8,u,u,u,u,1,9,u,u,u,u,2,10,u,u,u,u,19,27,u,u,u,u,20,28,u,u,u,u,u,u,u,u,38,46,u,u,u,u,39,47,u,u,u,u,48,56,u,u,u,u,49,57,u,u,u,u,50,58,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm1, %zmm4, %zmm3
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} zmm3 = zmm3[0,8,u,u,u,u,1,9,u,u,u,u,2,10,u,u,u,u,19,27,u,u,u,u,20,28,u,u,u,u,21,29,37,45,u,u,u,u,38,46,u,u,u,u,39,47,u,u,u,u,48,56,u,u,u,u,49,57,u,u,u,u,50,58]
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm4 = [4,6,4,6,8,10,9,11]
+; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm3, %zmm3, %zmm4
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} zmm4 = zmm4[u,u,0,8,u,u,u,u,1,9,u,u,u,u,2,10,u,u,u,u,19,27,u,u,u,u,20,28,u,u,u,u,u,u,u,u,38,46,u,u,u,u,39,47,u,u,u,u,48,56,u,u,u,u,49,57,u,u,u,u,50,58,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm5 = [0,2,0,2,12,14,13,15]
+; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm3, %zmm3, %zmm5
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} zmm3 = zmm5[0,8,u,u,u,u,1,9,u,u,u,u,2,10,u,u,u,u,19,27,u,u,u,u,20,28,u,u,u,u,21,29,37,45,u,u,u,u,38,46,u,u,u,u,39,47,u,u,u,u,48,56,u,u,u,u,49,57,u,u,u,u,50,58]
 ; AVX512DQ-BW-FCP-NEXT:    movl $1227105426, %ecx # imm = 0x49242492
 ; AVX512DQ-BW-FCP-NEXT:    kmovd %ecx, %k1
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm5, %zmm3 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm4, %zmm3 {%k1}
 ; AVX512DQ-BW-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm4 = [0,2,0,2,0,2,1,3]
 ; AVX512DQ-BW-FCP-NEXT:    vpermq %zmm2, %zmm4, %zmm4
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} zmm4 = zmm4[u,u,u,u,0,8,u,u,u,u,1,9,u,u,u,u,18,26,u,u,u,u,19,27,u,u,u,u,20,28,u,u,u,u,37,45,u,u,u,u,38,46,u,u,u,u,39,47,u,u,u,u,48,56,u,u,u,u,49,57,u,u,u,u]


        

