[llvm] [DAG] visitEXTRACT_SUBVECTOR - don't return early on failure of EXTRACT_SUBVECTOR(INSERT_SUBVECTOR()) -> BITCAST fold (PR #133695)
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Mon Mar 31 03:39:54 PDT 2025
https://github.com/RKSimon updated https://github.com/llvm/llvm-project/pull/133695
>From 1e3c39821168acadb7f785df2f1353458a7ec2e7 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Mon, 31 Mar 2025 11:34:24 +0100
Subject: [PATCH] [DAG] visitEXTRACT_SUBVECTOR - don't return early on failure
of EXTRACT_SUBVECTOR(INSERT_SUBVECTOR()) -> BITCAST fold
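Always allow later folds to try to match as well, rather than bailing out of visitEXTRACT_SUBVECTOR with an empty SDValue when this fold does not apply.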
---
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 36 +-
.../AMDGPU/load-local-redundant-copies.ll | 45 +--
.../vector-interleaved-store-i16-stride-5.ll | 12 +-
.../vector-interleaved-store-i16-stride-7.ll | 200 +++++----
.../vector-interleaved-store-i8-stride-5.ll | 24 +-
.../vector-interleaved-store-i8-stride-7.ll | 380 +++++++++---------
6 files changed, 335 insertions(+), 362 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 4487b9d510cc7..dc5c5f38e3bd8 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -25532,26 +25532,24 @@ SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode *N) {
// Handle only simple case where vector being inserted and vector
// being extracted are of same size.
EVT SmallVT = V.getOperand(1).getValueType();
- if (!NVT.bitsEq(SmallVT))
- return SDValue();
-
- // Combine:
- // (extract_subvec (insert_subvec V1, V2, InsIdx), ExtIdx)
- // Into:
- // indices are equal or bit offsets are equal => V1
- // otherwise => (extract_subvec V1, ExtIdx)
- uint64_t InsIdx = V.getConstantOperandVal(2);
- if (InsIdx * SmallVT.getScalarSizeInBits() ==
- ExtIdx * NVT.getScalarSizeInBits()) {
- if (LegalOperations && !TLI.isOperationLegal(ISD::BITCAST, NVT))
- return SDValue();
-
- return DAG.getBitcast(NVT, V.getOperand(1));
+ if (NVT.bitsEq(SmallVT)) {
+ // Combine:
+ // (extract_subvec (insert_subvec V1, V2, InsIdx), ExtIdx)
+ // Into:
+ // indices are equal or bit offsets are equal => V1
+ // otherwise => (extract_subvec V1, ExtIdx)
+ uint64_t InsIdx = V.getConstantOperandVal(2);
+ if (InsIdx * SmallVT.getScalarSizeInBits() ==
+ ExtIdx * NVT.getScalarSizeInBits()) {
+ if (!LegalOperations || TLI.isOperationLegal(ISD::BITCAST, NVT))
+ return DAG.getBitcast(NVT, V.getOperand(1));
+ } else {
+ return DAG.getNode(
+ ISD::EXTRACT_SUBVECTOR, DL, NVT,
+ DAG.getBitcast(N->getOperand(0).getValueType(), V.getOperand(0)),
+ N->getOperand(1));
+ }
}
- return DAG.getNode(
- ISD::EXTRACT_SUBVECTOR, DL, NVT,
- DAG.getBitcast(N->getOperand(0).getValueType(), V.getOperand(0)),
- N->getOperand(1));
}
if (SDValue NarrowBOp = narrowExtractedVectorBinOp(N, DAG, LegalOperations))
diff --git a/llvm/test/CodeGen/AMDGPU/load-local-redundant-copies.ll b/llvm/test/CodeGen/AMDGPU/load-local-redundant-copies.ll
index 64d7f93760fd5..a6ce512164b89 100644
--- a/llvm/test/CodeGen/AMDGPU/load-local-redundant-copies.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-local-redundant-copies.ll
@@ -66,39 +66,38 @@ define amdgpu_vs void @test_3(i32 inreg %arg1, i32 inreg %arg2, ptr addrspace(8)
; CHECK-NEXT: s_mov_b32 s6, s4
; CHECK-NEXT: s_mov_b32 s5, s3
; CHECK-NEXT: s_mov_b32 s4, s2
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, 20, v1
-; CHECK-NEXT: v_add_i32_e32 v3, vcc, 16, v1
-; CHECK-NEXT: v_add_i32_e32 v4, vcc, 12, v1
-; CHECK-NEXT: v_add_i32_e32 v5, vcc, 8, v1
-; CHECK-NEXT: v_add_i32_e32 v8, vcc, 4, v1
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, 12, v1
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, 8, v1
+; CHECK-NEXT: v_add_i32_e32 v4, vcc, 4, v1
+; CHECK-NEXT: v_add_i32_e32 v6, vcc, 20, v1
+; CHECK-NEXT: v_add_i32_e32 v7, vcc, 16, v1
; CHECK-NEXT: v_mov_b32_e32 v9, s0
-; CHECK-NEXT: v_add_i32_e32 v10, vcc, 20, v2
-; CHECK-NEXT: v_add_i32_e32 v11, vcc, 16, v2
+; CHECK-NEXT: v_add_i32_e32 v10, vcc, 12, v2
+; CHECK-NEXT: v_add_i32_e32 v11, vcc, 8, v2
; CHECK-NEXT: s_mov_b32 m0, -1
-; CHECK-NEXT: ds_read_b32 v7, v3
-; CHECK-NEXT: ds_read_b32 v6, v4
-; CHECK-NEXT: ds_read_b32 v5, v5
-; CHECK-NEXT: ds_read_b32 v4, v8
-; CHECK-NEXT: ds_read_b32 v8, v0
+; CHECK-NEXT: ds_read_b32 v5, v3
+; CHECK-NEXT: ds_read_b32 v4, v4
+; CHECK-NEXT: ds_read_b32 v8, v6
+; CHECK-NEXT: ds_read_b32 v7, v7
+; CHECK-NEXT: ds_read_b32 v6, v0
; CHECK-NEXT: ds_read_b32 v3, v1
-; CHECK-NEXT: v_add_i32_e32 v1, vcc, 12, v2
-; CHECK-NEXT: v_add_i32_e32 v12, vcc, 8, v2
-; CHECK-NEXT: v_add_i32_e32 v13, vcc, 4, v2
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, 4, v2
+; CHECK-NEXT: v_add_i32_e32 v1, vcc, 20, v2
+; CHECK-NEXT: v_add_i32_e32 v12, vcc, 16, v2
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: tbuffer_store_format_xyzw v[3:6], v9, s[4:7], s1 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_UINT] idxen offset:264 glc slc
; CHECK-NEXT: tbuffer_store_format_xy v[7:8], v9, s[4:7], s1 format:[BUF_DATA_FORMAT_INVALID,BUF_NUM_FORMAT_UINT] idxen offset:280 glc slc
-; CHECK-NEXT: ds_read_b32 v0, v11
; CHECK-NEXT: s_waitcnt expcnt(1)
-; CHECK-NEXT: ds_read_b32 v5, v1
-; CHECK-NEXT: ds_read_b32 v4, v12
-; CHECK-NEXT: ds_read_b32 v3, v13
+; CHECK-NEXT: ds_read_b32 v4, v11
+; CHECK-NEXT: ds_read_b32 v3, v0
+; CHECK-NEXT: ds_read_b32 v1, v1
+; CHECK-NEXT: ds_read_b32 v0, v12
+; CHECK-NEXT: ds_read_b32 v5, v10
; CHECK-NEXT: ds_read_b32 v2, v2
-; CHECK-NEXT: ds_read_b32 v1, v10
-; CHECK-NEXT: s_waitcnt lgkmcnt(5)
+; CHECK-NEXT: s_waitcnt lgkmcnt(2)
; CHECK-NEXT: exp mrt0 off, off, off, off
-; CHECK-NEXT: s_waitcnt lgkmcnt(1)
-; CHECK-NEXT: tbuffer_store_format_xyzw v[2:5], v9, s[4:7], s1 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_UINT] idxen offset:240 glc slc
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: tbuffer_store_format_xyzw v[2:5], v9, s[4:7], s1 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_UINT] idxen offset:240 glc slc
; CHECK-NEXT: tbuffer_store_format_xy v[0:1], v9, s[4:7], s1 format:[BUF_DATA_FORMAT_INVALID,BUF_NUM_FORMAT_UINT] idxen offset:256 glc slc
; CHECK-NEXT: s_endpgm
%load1 = load <6 x float>, ptr addrspace(3) %arg5, align 4
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll
index 0df63422b5d84..e4fa594f3dd72 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll
@@ -449,9 +449,8 @@ define void @store_i16_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,1,3,4,5,6,7]
; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3],xmm0[4,5,6,7]
-; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm1
; AVX512-NEXT: vmovq %xmm0, 32(%r9)
-; AVX512-NEXT: vmovdqa %ymm1, (%r9)
+; AVX512-NEXT: vmovdqa %ymm2, (%r9)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
@@ -476,9 +475,8 @@ define void @store_i16_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,6,7,14,15,u,u,8,9,10,11,12,13,14,15]
; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3],xmm0[4,5,6,7]
-; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm1
; AVX512-FCP-NEXT: vmovq %xmm0, 32(%r9)
-; AVX512-FCP-NEXT: vmovdqa %ymm1, (%r9)
+; AVX512-FCP-NEXT: vmovdqa %ymm2, (%r9)
; AVX512-FCP-NEXT: vzeroupper
; AVX512-FCP-NEXT: retq
;
@@ -504,9 +502,8 @@ define void @store_i16_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,1,3,4,5,6,7]
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3],xmm0[4,5,6,7]
-; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm1
; AVX512DQ-NEXT: vmovq %xmm0, 32(%r9)
-; AVX512DQ-NEXT: vmovdqa %ymm1, (%r9)
+; AVX512DQ-NEXT: vmovdqa %ymm2, (%r9)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
@@ -531,9 +528,8 @@ define void @store_i16_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,6,7,14,15,u,u,8,9,10,11,12,13,14,15]
; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3],xmm0[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm1
; AVX512DQ-FCP-NEXT: vmovq %xmm0, 32(%r9)
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, (%r9)
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, (%r9)
; AVX512DQ-FCP-NEXT: vzeroupper
; AVX512DQ-FCP-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll
index bc08f57e5faac..e4e013446f7a5 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll
@@ -1380,29 +1380,28 @@ define void @store_i16_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512-NEXT: vinserti64x4 $1, %ymm12, %zmm11, %zmm11
; AVX512-NEXT: vpternlogq {{.*#+}} zmm11 = (zmm11 & mem) | zmm10
; AVX512-NEXT: vpternlogq {{.*#+}} zmm11 = zmm11 ^ (mem & (zmm11 ^ zmm9))
-; AVX512-NEXT: vpsrlq $48, %xmm4, %xmm4
-; AVX512-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm4[1]
+; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,1,3]
+; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u],zero,zero,zero,zero,ymm2[2,3,10,11,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[20,21,28,29,u,u,u,u]
+; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm7[1,3,1,3]
+; AVX512-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,2,3,10,11],zero,zero,zero,zero,ymm7[u,u,u,u,u,u,20,21,28,29],zero,zero,zero,zero,ymm7[u,u,u,u]
+; AVX512-NEXT: vpor %ymm2, %ymm7, %ymm2
+; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm8[1,3,3,1]
+; AVX512-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[0,1,8,9],zero,zero,ymm7[u,u,u,u,u,u,u,u,2,3,18,19],zero,zero,ymm7[u,u,u,u,u,u,u,u,28,29,20,21]
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535]
+; AVX512-NEXT: vpternlogd {{.*#+}} ymm8 = (mem & ~ymm8) | ymm7
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm8 = ymm2 ^ (mem & (ymm8 ^ ymm2))
+; AVX512-NEXT: vpsrlq $48, %xmm4, %xmm2
+; AVX512-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm3[1],xmm2[1]
; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3,4],xmm0[5,6,7]
+; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3,4],xmm0[5,6,7]
; AVX512-NEXT: vpsrld $16, %xmm6, %xmm1
; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm5[2],xmm1[2],xmm5[3],xmm1[3]
-; AVX512-NEXT: vpbroadcastd 12(%r10), %xmm3
-; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3,4],xmm1[5,6],xmm3[7]
+; AVX512-NEXT: vpbroadcastd 12(%r10), %xmm2
+; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4],xmm1[5,6],xmm2[7]
; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4],xmm1[5,6,7]
-; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm2[1,3,1,3]
-; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u],zero,zero,zero,zero,ymm1[2,3,10,11,u,u,u,u,u,u],zero,zero,zero,zero,ymm1[20,21,28,29,u,u,u,u]
-; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm7[1,3,1,3]
-; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,2,3,10,11],zero,zero,zero,zero,ymm2[u,u,u,u,u,u,20,21,28,29],zero,zero,zero,zero,ymm2[u,u,u,u]
-; AVX512-NEXT: vpor %ymm1, %ymm2, %ymm1
-; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm8[1,3,3,1]
-; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,1,8,9],zero,zero,ymm2[u,u,u,u,u,u,u,u,2,3,18,19],zero,zero,ymm2[u,u,u,u,u,u,u,u,28,29,20,21]
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535]
-; AVX512-NEXT: vpternlogd {{.*#+}} ymm3 = (mem & ~ymm3) | ymm2
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = ymm1 ^ (mem & (ymm3 ^ ymm1))
-; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm3, %zmm1
; AVX512-NEXT: vmovdqa %xmm0, 96(%rax)
-; AVX512-NEXT: vmovdqa %ymm1, 64(%rax)
+; AVX512-NEXT: vmovdqa %ymm8, 64(%rax)
; AVX512-NEXT: vmovdqa64 %zmm11, (%rax)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
@@ -1420,6 +1419,37 @@ define void @store_i16_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm6
; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm7
; AVX512-FCP-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm8
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm7[0,2,0,2]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,ymm9[0,1,8,9,u,u,u,u,u,u],zero,zero,zero,zero,ymm9[18,19,26,27,u,u,u,u,u,u],zero,zero,zero,zero
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm6[0,2,1,3]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,ymm10[u,u,u,u,u,u,6,7,14,15],zero,zero,zero,zero,ymm10[u,u,u,u,u,u,16,17,24,25],zero,zero,zero,zero
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm9
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm6[0,2,2,0]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[0,1,8,9],zero,zero,zero,zero,ymm10[u,u,u,u,u,u,2,3,18,19],zero,zero,zero,zero,ymm10[u,u,u,u,u,u,28,29,20,21]
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [1,5,0,0,5,2,6,0]
+; AVX512-FCP-NEXT: vpermd %ymm7, %ymm11, %ymm11
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[0,1,4,5,u,u,u,u,u,u],zero,zero,zero,zero,ymm11[2,3,18,19,u,u,u,u,u,u],zero,zero,zero,zero,ymm11[20,21,24,25]
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm10, %zmm10
+; AVX512-FCP-NEXT: vporq %zmm9, %zmm10, %zmm9
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm8[0,2,0,2]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm10[u,u,u,u,u,u,u,u,0,1,8,9],zero,zero,ymm10[u,u,u,u,u,u,u,u,18,19,26,27],zero,zero,ymm10[u,u,u,u]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,4,5,12,13],zero,zero,ymm10[u,u,u,u,u,u,u,u,22,23,30,31],zero,zero,ymm10[u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm11, %zmm10
+; AVX512-FCP-NEXT: vpbroadcastd (%r10), %ymm11
+; AVX512-FCP-NEXT: vpbroadcastd 4(%r10), %ymm12
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm11, %zmm11
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm11 = (zmm11 & mem) | zmm10
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm11 = zmm11 ^ (mem & (zmm11 ^ zmm9))
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[1,3,1,3]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u],zero,zero,zero,zero,ymm7[2,3,10,11,u,u,u,u,u,u],zero,zero,zero,zero,ymm7[20,21,28,29,u,u,u,u]
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[1,3,1,3]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,2,3,10,11],zero,zero,zero,zero,ymm6[u,u,u,u,u,u,20,21,28,29],zero,zero,zero,zero,ymm6[u,u,u,u]
+; AVX512-FCP-NEXT: vpor %ymm7, %ymm6, %ymm6
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm8[1,3,3,1]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[0,1,8,9],zero,zero,ymm7[u,u,u,u,u,u,u,u,2,3,18,19],zero,zero,ymm7[u,u,u,u,u,u,u,u,28,29,20,21]
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535]
+; AVX512-FCP-NEXT: vpternlogd {{.*#+}} ymm8 = (mem & ~ymm8) | ymm7
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm6 ^ (mem & (ymm8 ^ ymm6))
; AVX512-FCP-NEXT: vpsrlq $48, %xmm3, %xmm3
; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1]
; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
@@ -1430,41 +1460,9 @@ define void @store_i16_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512-FCP-NEXT: vpbroadcastd 12(%r10), %xmm2
; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4],xmm1[5,6],xmm2[7]
; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4],xmm1[5,6,7]
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm7[1,3,1,3]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u],zero,zero,zero,zero,ymm1[2,3,10,11,u,u,u,u,u,u],zero,zero,zero,zero,ymm1[20,21,28,29,u,u,u,u]
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm6[1,3,1,3]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,2,3,10,11],zero,zero,zero,zero,ymm2[u,u,u,u,u,u,20,21,28,29],zero,zero,zero,zero,ymm2[u,u,u,u]
-; AVX512-FCP-NEXT: vpor %ymm1, %ymm2, %ymm1
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm8[1,3,3,1]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,1,8,9],zero,zero,ymm2[u,u,u,u,u,u,u,u,2,3,18,19],zero,zero,ymm2[u,u,u,u,u,u,u,u,28,29,20,21]
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535]
-; AVX512-FCP-NEXT: vpternlogd {{.*#+}} ymm3 = (mem & ~ymm3) | ymm2
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm1 ^ (mem & (ymm3 ^ ymm1))
-; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm3, %zmm1
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm7[0,2,0,2]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,ymm2[0,1,8,9,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[18,19,26,27,u,u,u,u,u,u],zero,zero,zero,zero
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm6[0,2,1,3]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,ymm3[u,u,u,u,u,u,6,7,14,15],zero,zero,zero,zero,ymm3[u,u,u,u,u,u,16,17,24,25],zero,zero,zero,zero
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm6[0,2,2,0]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,1,8,9],zero,zero,zero,zero,ymm3[u,u,u,u,u,u,2,3,18,19],zero,zero,zero,zero,ymm3[u,u,u,u,u,u,28,29,20,21]
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [1,5,0,0,5,2,6,0]
-; AVX512-FCP-NEXT: vpermd %ymm7, %ymm4, %ymm4
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[0,1,4,5,u,u,u,u,u,u],zero,zero,zero,zero,ymm4[2,3,18,19,u,u,u,u,u,u],zero,zero,zero,zero,ymm4[20,21,24,25]
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3
-; AVX512-FCP-NEXT: vporq %zmm2, %zmm3, %zmm2
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm8[0,2,0,2]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm3[u,u,u,u,u,u,u,u,0,1,8,9],zero,zero,ymm3[u,u,u,u,u,u,u,u,18,19,26,27],zero,zero,ymm3[u,u,u,u]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,4,5,12,13],zero,zero,ymm3[u,u,u,u,u,u,u,u,22,23,30,31],zero,zero,ymm3[u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3
-; AVX512-FCP-NEXT: vpbroadcastd (%r10), %ymm4
-; AVX512-FCP-NEXT: vpbroadcastd 4(%r10), %ymm5
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = (zmm4 & mem) | zmm3
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm2))
; AVX512-FCP-NEXT: vmovdqa %xmm0, 96(%rax)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm4, (%rax)
-; AVX512-FCP-NEXT: vmovdqa %ymm1, 64(%rax)
+; AVX512-FCP-NEXT: vmovdqa %ymm8, 64(%rax)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm11, (%rax)
; AVX512-FCP-NEXT: vzeroupper
; AVX512-FCP-NEXT: retq
;
@@ -1505,29 +1503,28 @@ define void @store_i16_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm12, %zmm11, %zmm11
; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm11 = (zmm11 & mem) | zmm10
; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm11 = zmm11 ^ (mem & (zmm11 ^ zmm9))
-; AVX512DQ-NEXT: vpsrlq $48, %xmm4, %xmm4
-; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm4[1]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,1,3]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u],zero,zero,zero,zero,ymm2[2,3,10,11,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[20,21,28,29,u,u,u,u]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm7[1,3,1,3]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,2,3,10,11],zero,zero,zero,zero,ymm7[u,u,u,u,u,u,20,21,28,29],zero,zero,zero,zero,ymm7[u,u,u,u]
+; AVX512DQ-NEXT: vpor %ymm2, %ymm7, %ymm2
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm8[1,3,3,1]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[0,1,8,9],zero,zero,ymm7[u,u,u,u,u,u,u,u,2,3,18,19],zero,zero,ymm7[u,u,u,u,u,u,u,u,28,29,20,21]
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535]
+; AVX512DQ-NEXT: vpternlogd {{.*#+}} ymm8 = (mem & ~ymm8) | ymm7
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm8 = ymm2 ^ (mem & (ymm8 ^ ymm2))
+; AVX512DQ-NEXT: vpsrlq $48, %xmm4, %xmm2
+; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm3[1],xmm2[1]
; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX512DQ-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3,4],xmm0[5,6,7]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3,4],xmm0[5,6,7]
; AVX512DQ-NEXT: vpsrld $16, %xmm6, %xmm1
; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm5[2],xmm1[2],xmm5[3],xmm1[3]
-; AVX512DQ-NEXT: vpbroadcastd 12(%r10), %xmm3
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3,4],xmm1[5,6],xmm3[7]
+; AVX512DQ-NEXT: vpbroadcastd 12(%r10), %xmm2
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4],xmm1[5,6],xmm2[7]
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4],xmm1[5,6,7]
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm2[1,3,1,3]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u],zero,zero,zero,zero,ymm1[2,3,10,11,u,u,u,u,u,u],zero,zero,zero,zero,ymm1[20,21,28,29,u,u,u,u]
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm7[1,3,1,3]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,2,3,10,11],zero,zero,zero,zero,ymm2[u,u,u,u,u,u,20,21,28,29],zero,zero,zero,zero,ymm2[u,u,u,u]
-; AVX512DQ-NEXT: vpor %ymm1, %ymm2, %ymm1
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm8[1,3,3,1]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,1,8,9],zero,zero,ymm2[u,u,u,u,u,u,u,u,2,3,18,19],zero,zero,ymm2[u,u,u,u,u,u,u,u,28,29,20,21]
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535]
-; AVX512DQ-NEXT: vpternlogd {{.*#+}} ymm3 = (mem & ~ymm3) | ymm2
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = ymm1 ^ (mem & (ymm3 ^ ymm1))
-; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm3, %zmm1
; AVX512DQ-NEXT: vmovdqa %xmm0, 96(%rax)
-; AVX512DQ-NEXT: vmovdqa %ymm1, 64(%rax)
+; AVX512DQ-NEXT: vmovdqa %ymm8, 64(%rax)
; AVX512DQ-NEXT: vmovdqa64 %zmm11, (%rax)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
@@ -1545,6 +1542,37 @@ define void @store_i16_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm6
; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm7
; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm8
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm7[0,2,0,2]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,ymm9[0,1,8,9,u,u,u,u,u,u],zero,zero,zero,zero,ymm9[18,19,26,27,u,u,u,u,u,u],zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm6[0,2,1,3]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,ymm10[u,u,u,u,u,u,6,7,14,15],zero,zero,zero,zero,ymm10[u,u,u,u,u,u,16,17,24,25],zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm9
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm6[0,2,2,0]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[0,1,8,9],zero,zero,zero,zero,ymm10[u,u,u,u,u,u,2,3,18,19],zero,zero,zero,zero,ymm10[u,u,u,u,u,u,28,29,20,21]
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [1,5,0,0,5,2,6,0]
+; AVX512DQ-FCP-NEXT: vpermd %ymm7, %ymm11, %ymm11
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[0,1,4,5,u,u,u,u,u,u],zero,zero,zero,zero,ymm11[2,3,18,19,u,u,u,u,u,u],zero,zero,zero,zero,ymm11[20,21,24,25]
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm10, %zmm10
+; AVX512DQ-FCP-NEXT: vporq %zmm9, %zmm10, %zmm9
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm8[0,2,0,2]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm10[u,u,u,u,u,u,u,u,0,1,8,9],zero,zero,ymm10[u,u,u,u,u,u,u,u,18,19,26,27],zero,zero,ymm10[u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,4,5,12,13],zero,zero,ymm10[u,u,u,u,u,u,u,u,22,23,30,31],zero,zero,ymm10[u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm11, %zmm10
+; AVX512DQ-FCP-NEXT: vpbroadcastd (%r10), %ymm11
+; AVX512DQ-FCP-NEXT: vpbroadcastd 4(%r10), %ymm12
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm11, %zmm11
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm11 = (zmm11 & mem) | zmm10
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm11 = zmm11 ^ (mem & (zmm11 ^ zmm9))
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[1,3,1,3]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u],zero,zero,zero,zero,ymm7[2,3,10,11,u,u,u,u,u,u],zero,zero,zero,zero,ymm7[20,21,28,29,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[1,3,1,3]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,2,3,10,11],zero,zero,zero,zero,ymm6[u,u,u,u,u,u,20,21,28,29],zero,zero,zero,zero,ymm6[u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpor %ymm7, %ymm6, %ymm6
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm8[1,3,3,1]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[0,1,8,9],zero,zero,ymm7[u,u,u,u,u,u,u,u,2,3,18,19],zero,zero,ymm7[u,u,u,u,u,u,u,u,28,29,20,21]
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535]
+; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} ymm8 = (mem & ~ymm8) | ymm7
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm6 ^ (mem & (ymm8 ^ ymm6))
; AVX512DQ-FCP-NEXT: vpsrlq $48, %xmm3, %xmm3
; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1]
; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
@@ -1555,41 +1583,9 @@ define void @store_i16_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-FCP-NEXT: vpbroadcastd 12(%r10), %xmm2
; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4],xmm1[5,6],xmm2[7]
; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4],xmm1[5,6,7]
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm7[1,3,1,3]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u],zero,zero,zero,zero,ymm1[2,3,10,11,u,u,u,u,u,u],zero,zero,zero,zero,ymm1[20,21,28,29,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm6[1,3,1,3]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,2,3,10,11],zero,zero,zero,zero,ymm2[u,u,u,u,u,u,20,21,28,29],zero,zero,zero,zero,ymm2[u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpor %ymm1, %ymm2, %ymm1
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm8[1,3,3,1]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,1,8,9],zero,zero,ymm2[u,u,u,u,u,u,u,u,2,3,18,19],zero,zero,ymm2[u,u,u,u,u,u,u,u,28,29,20,21]
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535]
-; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} ymm3 = (mem & ~ymm3) | ymm2
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm1 ^ (mem & (ymm3 ^ ymm1))
-; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm3, %zmm1
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm7[0,2,0,2]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,ymm2[0,1,8,9,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[18,19,26,27,u,u,u,u,u,u],zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm6[0,2,1,3]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,ymm3[u,u,u,u,u,u,6,7,14,15],zero,zero,zero,zero,ymm3[u,u,u,u,u,u,16,17,24,25],zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm6[0,2,2,0]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,1,8,9],zero,zero,zero,zero,ymm3[u,u,u,u,u,u,2,3,18,19],zero,zero,zero,zero,ymm3[u,u,u,u,u,u,28,29,20,21]
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [1,5,0,0,5,2,6,0]
-; AVX512DQ-FCP-NEXT: vpermd %ymm7, %ymm4, %ymm4
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[0,1,4,5,u,u,u,u,u,u],zero,zero,zero,zero,ymm4[2,3,18,19,u,u,u,u,u,u],zero,zero,zero,zero,ymm4[20,21,24,25]
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3
-; AVX512DQ-FCP-NEXT: vporq %zmm2, %zmm3, %zmm2
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm8[0,2,0,2]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm3[u,u,u,u,u,u,u,u,0,1,8,9],zero,zero,ymm3[u,u,u,u,u,u,u,u,18,19,26,27],zero,zero,ymm3[u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,4,5,12,13],zero,zero,ymm3[u,u,u,u,u,u,u,u,22,23,30,31],zero,zero,ymm3[u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3
-; AVX512DQ-FCP-NEXT: vpbroadcastd (%r10), %ymm4
-; AVX512DQ-FCP-NEXT: vpbroadcastd 4(%r10), %ymm5
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = (zmm4 & mem) | zmm3
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm2))
; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, 96(%rax)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, (%rax)
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, 64(%rax)
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm8, 64(%rax)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, (%rax)
; AVX512DQ-FCP-NEXT: vzeroupper
; AVX512DQ-FCP-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll
index 75f8469c266b1..39f8a93a7b77a 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll
@@ -725,9 +725,8 @@ define void @store_i8_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
; AVX512-NEXT: vmovd %eax, %xmm1
; AVX512-NEXT: vpbroadcastw %xmm1, %xmm1
; AVX512-NEXT: vpternlogq {{.*#+}} xmm1 = xmm1 ^ (mem & (xmm1 ^ xmm0))
-; AVX512-NEXT: vinserti32x4 $2, %xmm1, %zmm3, %zmm0
; AVX512-NEXT: vmovq %xmm1, 32(%r9)
-; AVX512-NEXT: vmovdqa %ymm0, (%r9)
+; AVX512-NEXT: vmovdqa %ymm3, (%r9)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
@@ -756,9 +755,8 @@ define void @store_i8_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
; AVX512-FCP-NEXT: vmovd %eax, %xmm1
; AVX512-FCP-NEXT: vpbroadcastw %xmm1, %xmm1
; AVX512-FCP-NEXT: vpternlogq {{.*#+}} xmm1 = xmm1 ^ (mem & (xmm1 ^ xmm0))
-; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm3, %zmm0
; AVX512-FCP-NEXT: vmovq %xmm1, 32(%r9)
-; AVX512-FCP-NEXT: vmovdqa %ymm0, (%r9)
+; AVX512-FCP-NEXT: vmovdqa %ymm3, (%r9)
; AVX512-FCP-NEXT: vzeroupper
; AVX512-FCP-NEXT: retq
;
@@ -787,9 +785,8 @@ define void @store_i8_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
; AVX512DQ-NEXT: vmovd %eax, %xmm1
; AVX512DQ-NEXT: vpbroadcastw %xmm1, %xmm1
; AVX512DQ-NEXT: vpternlogq {{.*#+}} xmm1 = xmm1 ^ (mem & (xmm1 ^ xmm0))
-; AVX512DQ-NEXT: vinserti32x4 $2, %xmm1, %zmm3, %zmm0
; AVX512DQ-NEXT: vmovq %xmm1, 32(%r9)
-; AVX512DQ-NEXT: vmovdqa %ymm0, (%r9)
+; AVX512DQ-NEXT: vmovdqa %ymm3, (%r9)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
@@ -818,9 +815,8 @@ define void @store_i8_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
; AVX512DQ-FCP-NEXT: vmovd %eax, %xmm1
; AVX512DQ-FCP-NEXT: vpbroadcastw %xmm1, %xmm1
; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} xmm1 = xmm1 ^ (mem & (xmm1 ^ xmm0))
-; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm3, %zmm0
; AVX512DQ-FCP-NEXT: vmovq %xmm1, 32(%r9)
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, (%r9)
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm3, (%r9)
; AVX512DQ-FCP-NEXT: vzeroupper
; AVX512DQ-FCP-NEXT: retq
;
@@ -852,9 +848,8 @@ define void @store_i8_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
; AVX512BW-NEXT: movw $132, %ax
; AVX512BW-NEXT: kmovd %eax, %k1
; AVX512BW-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1}
-; AVX512BW-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm1
; AVX512BW-NEXT: vmovq %xmm0, 32(%r9)
-; AVX512BW-NEXT: vmovdqa %ymm1, (%r9)
+; AVX512BW-NEXT: vmovdqa %ymm2, (%r9)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
@@ -886,9 +881,8 @@ define void @store_i8_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
; AVX512BW-FCP-NEXT: movw $132, %ax
; AVX512BW-FCP-NEXT: kmovd %eax, %k1
; AVX512BW-FCP-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1}
-; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm1
; AVX512BW-FCP-NEXT: vmovq %xmm0, 32(%r9)
-; AVX512BW-FCP-NEXT: vmovdqa %ymm1, (%r9)
+; AVX512BW-FCP-NEXT: vmovdqa %ymm2, (%r9)
; AVX512BW-FCP-NEXT: vzeroupper
; AVX512BW-FCP-NEXT: retq
;
@@ -920,9 +914,8 @@ define void @store_i8_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
; AVX512DQ-BW-NEXT: movw $132, %ax
; AVX512DQ-BW-NEXT: kmovd %eax, %k1
; AVX512DQ-BW-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1}
-; AVX512DQ-BW-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm1
; AVX512DQ-BW-NEXT: vmovq %xmm0, 32(%r9)
-; AVX512DQ-BW-NEXT: vmovdqa %ymm1, (%r9)
+; AVX512DQ-BW-NEXT: vmovdqa %ymm2, (%r9)
; AVX512DQ-BW-NEXT: vzeroupper
; AVX512DQ-BW-NEXT: retq
;
@@ -954,9 +947,8 @@ define void @store_i8_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
; AVX512DQ-BW-FCP-NEXT: movw $132, %ax
; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1
; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1}
-; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm1
; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, 32(%r9)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm1, (%r9)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm2, (%r9)
; AVX512DQ-BW-FCP-NEXT: vzeroupper
; AVX512DQ-BW-FCP-NEXT: retq
%in.vec0 = load <8 x i8>, ptr %in.vecptr0, align 64
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll
index 328d55ca8d627..b82e663528398 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll
@@ -2098,10 +2098,9 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[13,u,u,u,u],zero,zero,xmm0[14,u,u,u,u],zero,zero,xmm0[15]
; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0
; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = xmm0 ^ (mem & (xmm0 ^ xmm3))
-; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm5, %zmm1
+; AVX512-NEXT: vmovdqa %ymm5, 64(%rax)
; AVX512-NEXT: vmovdqa %xmm0, 96(%rax)
; AVX512-NEXT: vmovdqa64 %zmm8, (%rax)
-; AVX512-NEXT: vmovdqa %ymm1, 64(%rax)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
@@ -2164,10 +2163,9 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[13,u,u,u,u],zero,zero,xmm0[14,u,u,u,u],zero,zero,xmm0[15]
; AVX512-FCP-NEXT: vpor %xmm0, %xmm1, %xmm0
; AVX512-FCP-NEXT: vpternlogq {{.*#+}} xmm0 = xmm0 ^ (mem & (xmm0 ^ xmm3))
-; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm5, %zmm1
+; AVX512-FCP-NEXT: vmovdqa %ymm5, 64(%rax)
; AVX512-FCP-NEXT: vmovdqa %xmm0, 96(%rax)
; AVX512-FCP-NEXT: vmovdqa64 %zmm7, (%rax)
-; AVX512-FCP-NEXT: vmovdqa %ymm1, 64(%rax)
; AVX512-FCP-NEXT: vzeroupper
; AVX512-FCP-NEXT: retq
;
@@ -2235,10 +2233,9 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[13,u,u,u,u],zero,zero,xmm0[14,u,u,u,u],zero,zero,xmm0[15]
; AVX512DQ-NEXT: vpor %xmm0, %xmm1, %xmm0
; AVX512DQ-NEXT: vpternlogq {{.*#+}} xmm0 = xmm0 ^ (mem & (xmm0 ^ xmm3))
-; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm5, %zmm1
+; AVX512DQ-NEXT: vmovdqa %ymm5, 64(%rax)
; AVX512DQ-NEXT: vmovdqa %xmm0, 96(%rax)
; AVX512DQ-NEXT: vmovdqa64 %zmm8, (%rax)
-; AVX512DQ-NEXT: vmovdqa %ymm1, 64(%rax)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
@@ -2301,10 +2298,9 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[13,u,u,u,u],zero,zero,xmm0[14,u,u,u,u],zero,zero,xmm0[15]
; AVX512DQ-FCP-NEXT: vpor %xmm0, %xmm1, %xmm0
; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} xmm0 = xmm0 ^ (mem & (xmm0 ^ xmm3))
-; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm5, %zmm1
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, 64(%rax)
; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, 96(%rax)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, (%rax)
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, 64(%rax)
; AVX512DQ-FCP-NEXT: vzeroupper
; AVX512DQ-FCP-NEXT: retq
;
@@ -2314,80 +2310,79 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT: vmovdqa (%rdx), %xmm1
-; AVX512BW-NEXT: vmovdqa (%r8), %xmm3
-; AVX512BW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm2
-; AVX512BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm4
-; AVX512BW-NEXT: vinserti128 $1, (%r9), %ymm3, %ymm0
-; AVX512BW-NEXT: vinserti32x4 $2, (%r10), %zmm0, %zmm0
-; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[10,u,u,u,u,u,13,12,u,u,u,u,u,15,14,u]
-; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm1[0,1,2,3,6,7,7,7]
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,3,2]
+; AVX512BW-NEXT: vmovdqa (%r8), %xmm2
+; AVX512BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
+; AVX512BW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
+; AVX512BW-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2
+; AVX512BW-NEXT: vinserti32x4 $2, (%r10), %zmm2, %zmm2
+; AVX512BW-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,ymm1[5],zero,zero,zero,zero,zero,zero,ymm1[6],zero,zero,zero,zero,zero,zero,zero,ymm1[23],zero,zero,zero,zero,zero,zero,ymm1[24],zero,zero,zero,zero
+; AVX512BW-NEXT: vpermq {{.*#+}} ymm4 = ymm1[2,3,0,1]
+; AVX512BW-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,ymm4[5],zero,zero,zero,zero,zero,zero,ymm4[6],zero,zero,zero,zero,zero,ymm4[23],zero,zero,zero,zero,zero,zero,ymm4[24],zero,zero,zero,zero,zero
+; AVX512BW-NEXT: vpor %ymm3, %ymm4, %ymm3
+; AVX512BW-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,ymm0[5],zero,zero,zero,zero,zero,zero,ymm0[6],zero,zero,zero,zero,zero,zero,zero,ymm0[23],zero,zero,zero,zero,zero,zero,ymm0[24],zero,zero,zero,zero,zero,zero
+; AVX512BW-NEXT: vpermq {{.*#+}} ymm5 = ymm0[2,3,0,1]
+; AVX512BW-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,ymm5[5],zero,zero,zero,zero,zero,zero,ymm5[6],zero,zero,zero,zero,zero,ymm5[23],zero,zero,zero,zero,zero,zero,ymm5[24],zero,zero,zero,zero,zero,zero,ymm5[25]
+; AVX512BW-NEXT: vpor %ymm5, %ymm4, %ymm4
+; AVX512BW-NEXT: movl $202911840, %ecx # imm = 0xC183060
+; AVX512BW-NEXT: kmovd %ecx, %k1
+; AVX512BW-NEXT: vmovdqu8 %ymm3, %ymm4 {%k1}
+; AVX512BW-NEXT: vpermq {{.*#+}} ymm3 = ymm0[0,2,0,2]
+; AVX512BW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,8],zero,zero,zero,zero,zero,ymm3[1,9],zero,zero,zero,zero,zero,ymm3[2,10],zero,zero,zero,zero,zero,ymm3[19,27],zero,zero,zero,zero,zero,ymm3[20,28],zero,zero
+; AVX512BW-NEXT: vpermq {{.*#+}} ymm5 = ymm1[0,2,0,2]
+; AVX512BW-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,ymm5[0,8],zero,zero,zero,zero,zero,ymm5[1,9],zero,zero,zero,zero,zero,ymm5[18,26],zero,zero,zero,zero,zero,ymm5[19,27],zero,zero,zero,zero,zero,ymm5[20,28]
+; AVX512BW-NEXT: vpor %ymm3, %ymm5, %ymm3
+; AVX512BW-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3
+; AVX512BW-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[4],zero,zero,zero,zero,zero,zero,ymm2[5],zero,zero,zero,zero,zero,zero,ymm2[6],zero,zero,zero,zero,zero,zero,zero,ymm2[23],zero,zero,zero,zero,zero,zero,ymm2[24],zero,zero
+; AVX512BW-NEXT: vpermq {{.*#+}} ymm5 = ymm2[2,3,0,1]
+; AVX512BW-NEXT: vpshufb {{.*#+}} ymm5 = zero,ymm5[4],zero,zero,zero,zero,zero,zero,ymm5[5],zero,zero,zero,zero,zero,zero,ymm5[6],zero,zero,zero,zero,zero,ymm5[23],zero,zero,zero,zero,zero,zero,ymm5[24],zero,zero,zero
+; AVX512BW-NEXT: vpor %ymm4, %ymm5, %ymm4
+; AVX512BW-NEXT: vpermq {{.*#+}} ymm5 = ymm2[0,2,0,2]
+; AVX512BW-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,0,8,u,u,u,u,u,1,9,u,u,u,u,u,18,26,u,u,u,u,u,19,27,u,u,u,u,u]
+; AVX512BW-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4
+; AVX512BW-NEXT: vextracti64x4 $1, %zmm2, %ymm5
+; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,2,2,2,4,2,2,2,4,3,3,3,3,2,2,2,4]
+; AVX512BW-NEXT: vpermw %zmm5, %zmm6, %zmm6
+; AVX512BW-NEXT: movabsq $4647998506761461824, %rcx # imm = 0x4081020408102040
+; AVX512BW-NEXT: kmovq %rcx, %k1
+; AVX512BW-NEXT: vmovdqu8 %zmm6, %zmm4 {%k1}
+; AVX512BW-NEXT: movabsq $8133997386832558192, %rcx # imm = 0x70E1C3870E1C3870
+; AVX512BW-NEXT: kmovq %rcx, %k1
+; AVX512BW-NEXT: vmovdqu8 %zmm4, %zmm3 {%k1}
+; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm4
+; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15]
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[10,u,u,u,u,u,13,12,u,u,u,u,u,15,14,u]
+; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm5[0,1,2,3,6,7,7,7]
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,3,2]
; AVX512BW-NEXT: movw $-32510, %cx # imm = 0x8102
; AVX512BW-NEXT: kmovd %ecx, %k1
-; AVX512BW-NEXT: vmovdqu8 %xmm5, %xmm3 {%k1}
-; AVX512BW-NEXT: vpermq {{.*#+}} ymm5 = ymm4[1,3,2,3]
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[6,14],zero,zero,zero,zero,zero,xmm5[7,15],zero,zero,zero,zero,zero
-; AVX512BW-NEXT: vpermq {{.*#+}} ymm6 = ymm2[1,3,2,3]
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,xmm6[6,14],zero,zero,zero,zero,zero,xmm6[7,15],zero,zero,zero
-; AVX512BW-NEXT: vpor %xmm5, %xmm6, %xmm5
+; AVX512BW-NEXT: vmovdqu8 %xmm6, %xmm4 {%k1}
+; AVX512BW-NEXT: vpermq {{.*#+}} ymm6 = ymm0[1,3,2,3]
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[6,14],zero,zero,zero,zero,zero,xmm6[7,15],zero,zero,zero,zero,zero
+; AVX512BW-NEXT: vpermq {{.*#+}} ymm7 = ymm1[1,3,2,3]
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,xmm7[6,14],zero,zero,zero,zero,zero,xmm7[7,15],zero,zero,zero
+; AVX512BW-NEXT: vpor %xmm6, %xmm7, %xmm6
; AVX512BW-NEXT: movw $-7741, %cx # imm = 0xE1C3
; AVX512BW-NEXT: kmovd %ecx, %k1
-; AVX512BW-NEXT: vmovdqu8 %xmm3, %xmm5 {%k1}
-; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6]
-; AVX512BW-NEXT: # ymm3 = mem[0,1,0,1]
-; AVX512BW-NEXT: vpermw %ymm1, %ymm3, %ymm3
-; AVX512BW-NEXT: vpermq {{.*#+}} ymm6 = ymm0[1,3,1,3]
-; AVX512BW-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,1,9,u,u,u,u,u,2,10,u,u,u,u,u,19,27,u,u,u,u,u,20,28,u,u,u,u,u,21]
+; AVX512BW-NEXT: vmovdqu8 %xmm4, %xmm6 {%k1}
+; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6]
+; AVX512BW-NEXT: # ymm4 = mem[0,1,0,1]
+; AVX512BW-NEXT: vpermw %ymm5, %ymm4, %ymm4
+; AVX512BW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,1,3]
+; AVX512BW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,1,9,u,u,u,u,u,2,10,u,u,u,u,u,19,27,u,u,u,u,u,20,28,u,u,u,u,u,21]
; AVX512BW-NEXT: movl $67637280, %ecx # imm = 0x4081020
; AVX512BW-NEXT: kmovd %ecx, %k1
-; AVX512BW-NEXT: vmovdqu8 %ymm3, %ymm6 {%k1}
-; AVX512BW-NEXT: vpermq {{.*#+}} ymm3 = ymm2[1,3,3,1]
-; AVX512BW-NEXT: vpshufb {{.*#+}} ymm3 = zero,ymm3[1,9],zero,zero,zero,zero,zero,ymm3[2,10],zero,zero,zero,zero,zero,ymm3[3,19],zero,zero,zero,zero,zero,ymm3[28,20],zero,zero,zero,zero,zero,ymm3[29,21],zero
-; AVX512BW-NEXT: vpermq {{.*#+}} ymm7 = ymm4[3,1,1,3]
-; AVX512BW-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[1],zero,zero,zero,zero,zero,ymm7[10,2],zero,zero,zero,zero,zero,ymm7[11,3],zero,zero,zero,zero,zero,ymm7[20,28],zero,zero,zero,zero,zero,ymm7[21,29],zero,zero,zero
-; AVX512BW-NEXT: vpor %ymm3, %ymm7, %ymm3
+; AVX512BW-NEXT: vmovdqu8 %ymm4, %ymm2 {%k1}
+; AVX512BW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,3,1]
+; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 = zero,ymm1[1,9],zero,zero,zero,zero,zero,ymm1[2,10],zero,zero,zero,zero,zero,ymm1[3,19],zero,zero,zero,zero,zero,ymm1[28,20],zero,zero,zero,zero,zero,ymm1[29,21],zero
+; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,1,1,3]
+; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1],zero,zero,zero,zero,zero,ymm0[10,2],zero,zero,zero,zero,zero,ymm0[11,3],zero,zero,zero,zero,zero,ymm0[20,28],zero,zero,zero,zero,zero,ymm0[21,29],zero,zero,zero
+; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: movl $-2029118408, %ecx # imm = 0x870E1C38
; AVX512BW-NEXT: kmovd %ecx, %k1
-; AVX512BW-NEXT: vmovdqu8 %ymm6, %ymm3 {%k1}
-; AVX512BW-NEXT: vinserti32x4 $2, %xmm5, %zmm3, %zmm3
-; AVX512BW-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,ymm2[5],zero,zero,zero,zero,zero,zero,ymm2[6],zero,zero,zero,zero,zero,zero,zero,ymm2[23],zero,zero,zero,zero,zero,zero,ymm2[24],zero,zero,zero,zero
-; AVX512BW-NEXT: vpermq {{.*#+}} ymm7 = ymm2[2,3,0,1]
-; AVX512BW-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,ymm7[5],zero,zero,zero,zero,zero,zero,ymm7[6],zero,zero,zero,zero,zero,ymm7[23],zero,zero,zero,zero,zero,zero,ymm7[24],zero,zero,zero,zero,zero
-; AVX512BW-NEXT: vpor %ymm6, %ymm7, %ymm6
-; AVX512BW-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,ymm4[5],zero,zero,zero,zero,zero,zero,ymm4[6],zero,zero,zero,zero,zero,zero,zero,ymm4[23],zero,zero,zero,zero,zero,zero,ymm4[24],zero,zero,zero,zero,zero,zero
-; AVX512BW-NEXT: vpermq {{.*#+}} ymm8 = ymm4[2,3,0,1]
-; AVX512BW-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,ymm8[5],zero,zero,zero,zero,zero,zero,ymm8[6],zero,zero,zero,zero,zero,ymm8[23],zero,zero,zero,zero,zero,zero,ymm8[24],zero,zero,zero,zero,zero,zero,ymm8[25]
-; AVX512BW-NEXT: vpor %ymm7, %ymm8, %ymm7
-; AVX512BW-NEXT: movl $202911840, %ecx # imm = 0xC183060
-; AVX512BW-NEXT: kmovd %ecx, %k1
-; AVX512BW-NEXT: vmovdqu8 %ymm6, %ymm7 {%k1}
-; AVX512BW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,0,2]
-; AVX512BW-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[0,8],zero,zero,zero,zero,zero,ymm4[1,9],zero,zero,zero,zero,zero,ymm4[2,10],zero,zero,zero,zero,zero,ymm4[19,27],zero,zero,zero,zero,zero,ymm4[20,28],zero,zero
-; AVX512BW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,0,2]
-; AVX512BW-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8],zero,zero,zero,zero,zero,ymm2[1,9],zero,zero,zero,zero,zero,ymm2[18,26],zero,zero,zero,zero,zero,ymm2[19,27],zero,zero,zero,zero,zero,ymm2[20,28]
-; AVX512BW-NEXT: vpor %ymm4, %ymm2, %ymm2
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm7, %zmm2, %zmm2
-; AVX512BW-NEXT: vpshufb {{.*#+}} ymm4 = ymm0[4],zero,zero,zero,zero,zero,zero,ymm0[5],zero,zero,zero,zero,zero,zero,ymm0[6],zero,zero,zero,zero,zero,zero,zero,ymm0[23],zero,zero,zero,zero,zero,zero,ymm0[24],zero,zero
-; AVX512BW-NEXT: vpermq {{.*#+}} ymm6 = ymm0[2,3,0,1]
-; AVX512BW-NEXT: vpshufb {{.*#+}} ymm6 = zero,ymm6[4],zero,zero,zero,zero,zero,zero,ymm6[5],zero,zero,zero,zero,zero,zero,ymm6[6],zero,zero,zero,zero,zero,ymm6[23],zero,zero,zero,zero,zero,zero,ymm6[24],zero,zero,zero
-; AVX512BW-NEXT: vpor %ymm4, %ymm6, %ymm4
-; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,0,2]
-; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,0,8,u,u,u,u,u,1,9,u,u,u,u,u,18,26,u,u,u,u,u,19,27,u,u,u,u,u]
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0
-; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm4 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,2,2,2,4,2,2,2,4,3,3,3,3,2,2,2,4]
-; AVX512BW-NEXT: vpermw %zmm1, %zmm4, %zmm1
-; AVX512BW-NEXT: movabsq $4647998506761461824, %rcx # imm = 0x4081020408102040
-; AVX512BW-NEXT: kmovq %rcx, %k1
-; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
-; AVX512BW-NEXT: movabsq $8133997386832558192, %rcx # imm = 0x70E1C3870E1C3870
-; AVX512BW-NEXT: kmovq %rcx, %k1
-; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm2 {%k1}
-; AVX512BW-NEXT: vmovdqa %xmm5, 96(%rax)
-; AVX512BW-NEXT: vmovdqa64 %zmm2, (%rax)
-; AVX512BW-NEXT: vmovdqa %ymm3, 64(%rax)
+; AVX512BW-NEXT: vmovdqu8 %ymm2, %ymm0 {%k1}
+; AVX512BW-NEXT: vmovdqa %ymm0, 64(%rax)
+; AVX512BW-NEXT: vmovdqa %xmm6, 96(%rax)
+; AVX512BW-NEXT: vmovdqa64 %zmm3, (%rax)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
@@ -2417,43 +2412,42 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512BW-FCP-NEXT: movw $-7741, %cx # imm = 0xE1C3
; AVX512BW-FCP-NEXT: kmovd %ecx, %k1
; AVX512BW-FCP-NEXT: vmovdqu8 %xmm5, %xmm4 {%k1}
-; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6]
-; AVX512BW-FCP-NEXT: # ymm5 = mem[0,1,0,1]
-; AVX512BW-FCP-NEXT: vpermw %ymm6, %ymm5, %ymm5
-; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm2[1,3,1,3]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,1,9,u,u,u,u,u,2,10,u,u,u,u,u,19,27,u,u,u,u,u,20,28,u,u,u,u,u,21]
-; AVX512BW-FCP-NEXT: movl $67637280, %ecx # imm = 0x4081020
-; AVX512BW-FCP-NEXT: kmovd %ecx, %k1
-; AVX512BW-FCP-NEXT: vmovdqu8 %ymm5, %ymm7 {%k1}
-; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm1[1,3,3,1]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm5 = zero,ymm5[1,9],zero,zero,zero,zero,zero,ymm5[2,10],zero,zero,zero,zero,zero,ymm5[3,19],zero,zero,zero,zero,zero,ymm5[28,20],zero,zero,zero,zero,zero,ymm5[29,21],zero
-; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,1,1,3]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1],zero,zero,zero,zero,zero,ymm0[10,2],zero,zero,zero,zero,zero,ymm0[11,3],zero,zero,zero,zero,zero,ymm0[20,28],zero,zero,zero,zero,zero,ymm0[21,29],zero,zero,zero
-; AVX512BW-FCP-NEXT: vpor %ymm5, %ymm0, %ymm0
-; AVX512BW-FCP-NEXT: movl $-2029118408, %ecx # imm = 0x870E1C38
-; AVX512BW-FCP-NEXT: kmovd %ecx, %k1
-; AVX512BW-FCP-NEXT: vmovdqu8 %ymm7, %ymm0 {%k1}
-; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm4, %zmm0, %zmm0
; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm5 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,2,2,2,4,2,2,2,4,3,3,3,3,2,2,2,4]
; AVX512BW-FCP-NEXT: vpermw %zmm6, %zmm5, %zmm5
-; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,4,0,0,0,4,0,1,5,0,0,1,5,2,6]
-; AVX512BW-FCP-NEXT: vpermd %zmm2, %zmm6, %zmm2
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm2 = zmm2[u,u,u,u,0,8,u,u,u,u,u,1,9,u,u,u,u,u,18,26,u,u,u,u,u,19,27,u,u,u,u,u,32,36,u,u,u,u,u,33,37,u,u,u,u,u,34,38,u,u,u,u,u,51,55,u,u,u,u,u,56,60,u,u]
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,4,0,0,0,4,0,1,5,0,0,1,5,2,6]
+; AVX512BW-FCP-NEXT: vpermd %zmm2, %zmm7, %zmm7
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm7 = zmm7[u,u,u,u,0,8,u,u,u,u,u,1,9,u,u,u,u,u,18,26,u,u,u,u,u,19,27,u,u,u,u,u,32,36,u,u,u,u,u,33,37,u,u,u,u,u,34,38,u,u,u,u,u,51,55,u,u,u,u,u,56,60,u,u]
; AVX512BW-FCP-NEXT: movabsq $4647998506761461824, %rcx # imm = 0x4081020408102040
; AVX512BW-FCP-NEXT: kmovq %rcx, %k1
-; AVX512BW-FCP-NEXT: vmovdqu8 %zmm5, %zmm2 {%k1}
+; AVX512BW-FCP-NEXT: vmovdqu8 %zmm5, %zmm7 {%k1}
; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,4,0,0,1,4,5,1,5,0,0,1,5,2,6]
; AVX512BW-FCP-NEXT: vpermd %zmm3, %zmm5, %zmm3
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm3 = zmm3[0,8],zero,zero,zero,zero,zero,zmm3[1,9],zero,zero,zero,zero,zero,zmm3[2,10],zero,zero,zero,zero,zero,zmm3[19,27],zero,zero,zero,zero,zero,zmm3[20,28],zero,zero,zero,zero,zero,zmm3[33,37],zero,zero,zero,zero,zero,zmm3[34,38],zero,zero,zero,zero,zero,zmm3[51,55],zero,zero,zero,zero,zero,zmm3[56,60],zero,zero,zero,zero,zero,zmm3[57]
-; AVX512BW-FCP-NEXT: vpermd %zmm1, %zmm5, %zmm1
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm1 = zero,zero,zmm1[0,8],zero,zero,zero,zero,zero,zmm1[1,9],zero,zero,zero,zero,zero,zmm1[18,26],zero,zero,zero,zero,zero,zmm1[19,27],zero,zero,zero,zero,zero,zmm1[20,28],zero,zero,zero,zero,zero,zmm1[33,37],zero,zero,zero,zero,zero,zmm1[34,38],zero,zero,zero,zero,zero,zmm1[51,55],zero,zero,zero,zero,zero,zmm1[56,60],zero,zero,zero,zero
-; AVX512BW-FCP-NEXT: vporq %zmm3, %zmm1, %zmm1
+; AVX512BW-FCP-NEXT: vpermd %zmm1, %zmm5, %zmm5
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm5 = zero,zero,zmm5[0,8],zero,zero,zero,zero,zero,zmm5[1,9],zero,zero,zero,zero,zero,zmm5[18,26],zero,zero,zero,zero,zero,zmm5[19,27],zero,zero,zero,zero,zero,zmm5[20,28],zero,zero,zero,zero,zero,zmm5[33,37],zero,zero,zero,zero,zero,zmm5[34,38],zero,zero,zero,zero,zero,zmm5[51,55],zero,zero,zero,zero,zero,zmm5[56,60],zero,zero,zero,zero
+; AVX512BW-FCP-NEXT: vporq %zmm3, %zmm5, %zmm3
; AVX512BW-FCP-NEXT: movabsq $8133997386832558192, %rcx # imm = 0x70E1C3870E1C3870
; AVX512BW-FCP-NEXT: kmovq %rcx, %k1
-; AVX512BW-FCP-NEXT: vmovdqu8 %zmm2, %zmm1 {%k1}
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, (%rax)
-; AVX512BW-FCP-NEXT: vmovdqa %xmm4, 96(%rax)
+; AVX512BW-FCP-NEXT: vmovdqu8 %zmm7, %zmm3 {%k1}
+; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6]
+; AVX512BW-FCP-NEXT: # ymm5 = mem[0,1,0,1]
+; AVX512BW-FCP-NEXT: vpermw %ymm6, %ymm5, %ymm5
+; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,1,3]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,1,9,u,u,u,u,u,2,10,u,u,u,u,u,19,27,u,u,u,u,u,20,28,u,u,u,u,u,21]
+; AVX512BW-FCP-NEXT: movl $67637280, %ecx # imm = 0x4081020
+; AVX512BW-FCP-NEXT: kmovd %ecx, %k1
+; AVX512BW-FCP-NEXT: vmovdqu8 %ymm5, %ymm2 {%k1}
+; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,3,1]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm1 = zero,ymm1[1,9],zero,zero,zero,zero,zero,ymm1[2,10],zero,zero,zero,zero,zero,ymm1[3,19],zero,zero,zero,zero,zero,ymm1[28,20],zero,zero,zero,zero,zero,ymm1[29,21],zero
+; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,1,1,3]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1],zero,zero,zero,zero,zero,ymm0[10,2],zero,zero,zero,zero,zero,ymm0[11,3],zero,zero,zero,zero,zero,ymm0[20,28],zero,zero,zero,zero,zero,ymm0[21,29],zero,zero,zero
+; AVX512BW-FCP-NEXT: vpor %ymm1, %ymm0, %ymm0
+; AVX512BW-FCP-NEXT: movl $-2029118408, %ecx # imm = 0x870E1C38
+; AVX512BW-FCP-NEXT: kmovd %ecx, %k1
+; AVX512BW-FCP-NEXT: vmovdqu8 %ymm2, %ymm0 {%k1}
; AVX512BW-FCP-NEXT: vmovdqa %ymm0, 64(%rax)
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, (%rax)
+; AVX512BW-FCP-NEXT: vmovdqa %xmm4, 96(%rax)
; AVX512BW-FCP-NEXT: vzeroupper
; AVX512BW-FCP-NEXT: retq
;
@@ -2463,80 +2457,79 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm1
-; AVX512DQ-BW-NEXT: vmovdqa (%r8), %xmm3
-; AVX512DQ-BW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm2
-; AVX512DQ-BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm4
-; AVX512DQ-BW-NEXT: vinserti128 $1, (%r9), %ymm3, %ymm0
-; AVX512DQ-BW-NEXT: vinserti32x4 $2, (%r10), %zmm0, %zmm0
-; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[10,u,u,u,u,u,13,12,u,u,u,u,u,15,14,u]
-; AVX512DQ-BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm1[0,1,2,3,6,7,7,7]
-; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,3,2]
+; AVX512DQ-BW-NEXT: vmovdqa (%r8), %xmm2
+; AVX512DQ-BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
+; AVX512DQ-BW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
+; AVX512DQ-BW-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2
+; AVX512DQ-BW-NEXT: vinserti32x4 $2, (%r10), %zmm2, %zmm2
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,ymm1[5],zero,zero,zero,zero,zero,zero,ymm1[6],zero,zero,zero,zero,zero,zero,zero,ymm1[23],zero,zero,zero,zero,zero,zero,ymm1[24],zero,zero,zero,zero
+; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm4 = ymm1[2,3,0,1]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,ymm4[5],zero,zero,zero,zero,zero,zero,ymm4[6],zero,zero,zero,zero,zero,ymm4[23],zero,zero,zero,zero,zero,zero,ymm4[24],zero,zero,zero,zero,zero
+; AVX512DQ-BW-NEXT: vpor %ymm3, %ymm4, %ymm3
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,ymm0[5],zero,zero,zero,zero,zero,zero,ymm0[6],zero,zero,zero,zero,zero,zero,zero,ymm0[23],zero,zero,zero,zero,zero,zero,ymm0[24],zero,zero,zero,zero,zero,zero
+; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm5 = ymm0[2,3,0,1]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,ymm5[5],zero,zero,zero,zero,zero,zero,ymm5[6],zero,zero,zero,zero,zero,ymm5[23],zero,zero,zero,zero,zero,zero,ymm5[24],zero,zero,zero,zero,zero,zero,ymm5[25]
+; AVX512DQ-BW-NEXT: vpor %ymm5, %ymm4, %ymm4
+; AVX512DQ-BW-NEXT: movl $202911840, %ecx # imm = 0xC183060
+; AVX512DQ-BW-NEXT: kmovd %ecx, %k1
+; AVX512DQ-BW-NEXT: vmovdqu8 %ymm3, %ymm4 {%k1}
+; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm3 = ymm0[0,2,0,2]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,8],zero,zero,zero,zero,zero,ymm3[1,9],zero,zero,zero,zero,zero,ymm3[2,10],zero,zero,zero,zero,zero,ymm3[19,27],zero,zero,zero,zero,zero,ymm3[20,28],zero,zero
+; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm5 = ymm1[0,2,0,2]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,ymm5[0,8],zero,zero,zero,zero,zero,ymm5[1,9],zero,zero,zero,zero,zero,ymm5[18,26],zero,zero,zero,zero,zero,ymm5[19,27],zero,zero,zero,zero,zero,ymm5[20,28]
+; AVX512DQ-BW-NEXT: vpor %ymm3, %ymm5, %ymm3
+; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[4],zero,zero,zero,zero,zero,zero,ymm2[5],zero,zero,zero,zero,zero,zero,ymm2[6],zero,zero,zero,zero,zero,zero,zero,ymm2[23],zero,zero,zero,zero,zero,zero,ymm2[24],zero,zero
+; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm5 = ymm2[2,3,0,1]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm5 = zero,ymm5[4],zero,zero,zero,zero,zero,zero,ymm5[5],zero,zero,zero,zero,zero,zero,ymm5[6],zero,zero,zero,zero,zero,ymm5[23],zero,zero,zero,zero,zero,zero,ymm5[24],zero,zero,zero
+; AVX512DQ-BW-NEXT: vpor %ymm4, %ymm5, %ymm4
+; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm5 = ymm2[0,2,0,2]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,0,8,u,u,u,u,u,1,9,u,u,u,u,u,18,26,u,u,u,u,u,19,27,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4
+; AVX512DQ-BW-NEXT: vextracti64x4 $1, %zmm2, %ymm5
+; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,2,2,2,4,2,2,2,4,3,3,3,3,2,2,2,4]
+; AVX512DQ-BW-NEXT: vpermw %zmm5, %zmm6, %zmm6
+; AVX512DQ-BW-NEXT: movabsq $4647998506761461824, %rcx # imm = 0x4081020408102040
+; AVX512DQ-BW-NEXT: kmovq %rcx, %k1
+; AVX512DQ-BW-NEXT: vmovdqu8 %zmm6, %zmm4 {%k1}
+; AVX512DQ-BW-NEXT: movabsq $8133997386832558192, %rcx # imm = 0x70E1C3870E1C3870
+; AVX512DQ-BW-NEXT: kmovq %rcx, %k1
+; AVX512DQ-BW-NEXT: vmovdqu8 %zmm4, %zmm3 {%k1}
+; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm2, %xmm4
+; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[10,u,u,u,u,u,13,12,u,u,u,u,u,15,14,u]
+; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm5[0,1,2,3,6,7,7,7]
+; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,3,2]
; AVX512DQ-BW-NEXT: movw $-32510, %cx # imm = 0x8102
; AVX512DQ-BW-NEXT: kmovd %ecx, %k1
-; AVX512DQ-BW-NEXT: vmovdqu8 %xmm5, %xmm3 {%k1}
-; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm5 = ymm4[1,3,2,3]
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[6,14],zero,zero,zero,zero,zero,xmm5[7,15],zero,zero,zero,zero,zero
-; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm6 = ymm2[1,3,2,3]
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,xmm6[6,14],zero,zero,zero,zero,zero,xmm6[7,15],zero,zero,zero
-; AVX512DQ-BW-NEXT: vpor %xmm5, %xmm6, %xmm5
+; AVX512DQ-BW-NEXT: vmovdqu8 %xmm6, %xmm4 {%k1}
+; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm6 = ymm0[1,3,2,3]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[6,14],zero,zero,zero,zero,zero,xmm6[7,15],zero,zero,zero,zero,zero
+; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm7 = ymm1[1,3,2,3]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,xmm7[6,14],zero,zero,zero,zero,zero,xmm7[7,15],zero,zero,zero
+; AVX512DQ-BW-NEXT: vpor %xmm6, %xmm7, %xmm6
; AVX512DQ-BW-NEXT: movw $-7741, %cx # imm = 0xE1C3
; AVX512DQ-BW-NEXT: kmovd %ecx, %k1
-; AVX512DQ-BW-NEXT: vmovdqu8 %xmm3, %xmm5 {%k1}
-; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6]
-; AVX512DQ-BW-NEXT: # ymm3 = mem[0,1,0,1]
-; AVX512DQ-BW-NEXT: vpermw %ymm1, %ymm3, %ymm3
-; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm6 = ymm0[1,3,1,3]
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,1,9,u,u,u,u,u,2,10,u,u,u,u,u,19,27,u,u,u,u,u,20,28,u,u,u,u,u,21]
+; AVX512DQ-BW-NEXT: vmovdqu8 %xmm4, %xmm6 {%k1}
+; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6]
+; AVX512DQ-BW-NEXT: # ymm4 = mem[0,1,0,1]
+; AVX512DQ-BW-NEXT: vpermw %ymm5, %ymm4, %ymm4
+; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,1,3]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,1,9,u,u,u,u,u,2,10,u,u,u,u,u,19,27,u,u,u,u,u,20,28,u,u,u,u,u,21]
; AVX512DQ-BW-NEXT: movl $67637280, %ecx # imm = 0x4081020
; AVX512DQ-BW-NEXT: kmovd %ecx, %k1
-; AVX512DQ-BW-NEXT: vmovdqu8 %ymm3, %ymm6 {%k1}
-; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm3 = ymm2[1,3,3,1]
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm3 = zero,ymm3[1,9],zero,zero,zero,zero,zero,ymm3[2,10],zero,zero,zero,zero,zero,ymm3[3,19],zero,zero,zero,zero,zero,ymm3[28,20],zero,zero,zero,zero,zero,ymm3[29,21],zero
-; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm7 = ymm4[3,1,1,3]
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[1],zero,zero,zero,zero,zero,ymm7[10,2],zero,zero,zero,zero,zero,ymm7[11,3],zero,zero,zero,zero,zero,ymm7[20,28],zero,zero,zero,zero,zero,ymm7[21,29],zero,zero,zero
-; AVX512DQ-BW-NEXT: vpor %ymm3, %ymm7, %ymm3
+; AVX512DQ-BW-NEXT: vmovdqu8 %ymm4, %ymm2 {%k1}
+; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,3,1]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm1 = zero,ymm1[1,9],zero,zero,zero,zero,zero,ymm1[2,10],zero,zero,zero,zero,zero,ymm1[3,19],zero,zero,zero,zero,zero,ymm1[28,20],zero,zero,zero,zero,zero,ymm1[29,21],zero
+; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,1,1,3]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1],zero,zero,zero,zero,zero,ymm0[10,2],zero,zero,zero,zero,zero,ymm0[11,3],zero,zero,zero,zero,zero,ymm0[20,28],zero,zero,zero,zero,zero,ymm0[21,29],zero,zero,zero
+; AVX512DQ-BW-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512DQ-BW-NEXT: movl $-2029118408, %ecx # imm = 0x870E1C38
; AVX512DQ-BW-NEXT: kmovd %ecx, %k1
-; AVX512DQ-BW-NEXT: vmovdqu8 %ymm6, %ymm3 {%k1}
-; AVX512DQ-BW-NEXT: vinserti32x4 $2, %xmm5, %zmm3, %zmm3
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,ymm2[5],zero,zero,zero,zero,zero,zero,ymm2[6],zero,zero,zero,zero,zero,zero,zero,ymm2[23],zero,zero,zero,zero,zero,zero,ymm2[24],zero,zero,zero,zero
-; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm7 = ymm2[2,3,0,1]
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,ymm7[5],zero,zero,zero,zero,zero,zero,ymm7[6],zero,zero,zero,zero,zero,ymm7[23],zero,zero,zero,zero,zero,zero,ymm7[24],zero,zero,zero,zero,zero
-; AVX512DQ-BW-NEXT: vpor %ymm6, %ymm7, %ymm6
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,ymm4[5],zero,zero,zero,zero,zero,zero,ymm4[6],zero,zero,zero,zero,zero,zero,zero,ymm4[23],zero,zero,zero,zero,zero,zero,ymm4[24],zero,zero,zero,zero,zero,zero
-; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm8 = ymm4[2,3,0,1]
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,ymm8[5],zero,zero,zero,zero,zero,zero,ymm8[6],zero,zero,zero,zero,zero,ymm8[23],zero,zero,zero,zero,zero,zero,ymm8[24],zero,zero,zero,zero,zero,zero,ymm8[25]
-; AVX512DQ-BW-NEXT: vpor %ymm7, %ymm8, %ymm7
-; AVX512DQ-BW-NEXT: movl $202911840, %ecx # imm = 0xC183060
-; AVX512DQ-BW-NEXT: kmovd %ecx, %k1
-; AVX512DQ-BW-NEXT: vmovdqu8 %ymm6, %ymm7 {%k1}
-; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,0,2]
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[0,8],zero,zero,zero,zero,zero,ymm4[1,9],zero,zero,zero,zero,zero,ymm4[2,10],zero,zero,zero,zero,zero,ymm4[19,27],zero,zero,zero,zero,zero,ymm4[20,28],zero,zero
-; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,0,2]
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8],zero,zero,zero,zero,zero,ymm2[1,9],zero,zero,zero,zero,zero,ymm2[18,26],zero,zero,zero,zero,zero,ymm2[19,27],zero,zero,zero,zero,zero,ymm2[20,28]
-; AVX512DQ-BW-NEXT: vpor %ymm4, %ymm2, %ymm2
-; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm7, %zmm2, %zmm2
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm4 = ymm0[4],zero,zero,zero,zero,zero,zero,ymm0[5],zero,zero,zero,zero,zero,zero,ymm0[6],zero,zero,zero,zero,zero,zero,zero,ymm0[23],zero,zero,zero,zero,zero,zero,ymm0[24],zero,zero
-; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm6 = ymm0[2,3,0,1]
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm6 = zero,ymm6[4],zero,zero,zero,zero,zero,zero,ymm6[5],zero,zero,zero,zero,zero,zero,ymm6[6],zero,zero,zero,zero,zero,ymm6[23],zero,zero,zero,zero,zero,zero,ymm6[24],zero,zero,zero
-; AVX512DQ-BW-NEXT: vpor %ymm4, %ymm6, %ymm4
-; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,0,2]
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,0,8,u,u,u,u,u,1,9,u,u,u,u,u,18,26,u,u,u,u,u,19,27,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0
-; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm4 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,2,2,2,4,2,2,2,4,3,3,3,3,2,2,2,4]
-; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm4, %zmm1
-; AVX512DQ-BW-NEXT: movabsq $4647998506761461824, %rcx # imm = 0x4081020408102040
-; AVX512DQ-BW-NEXT: kmovq %rcx, %k1
-; AVX512DQ-BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
-; AVX512DQ-BW-NEXT: movabsq $8133997386832558192, %rcx # imm = 0x70E1C3870E1C3870
-; AVX512DQ-BW-NEXT: kmovq %rcx, %k1
-; AVX512DQ-BW-NEXT: vmovdqu8 %zmm0, %zmm2 {%k1}
-; AVX512DQ-BW-NEXT: vmovdqa %xmm5, 96(%rax)
-; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, (%rax)
-; AVX512DQ-BW-NEXT: vmovdqa %ymm3, 64(%rax)
+; AVX512DQ-BW-NEXT: vmovdqu8 %ymm2, %ymm0 {%k1}
+; AVX512DQ-BW-NEXT: vmovdqa %ymm0, 64(%rax)
+; AVX512DQ-BW-NEXT: vmovdqa %xmm6, 96(%rax)
+; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, (%rax)
; AVX512DQ-BW-NEXT: vzeroupper
; AVX512DQ-BW-NEXT: retq
;
@@ -2566,43 +2559,42 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-BW-FCP-NEXT: movw $-7741, %cx # imm = 0xE1C3
; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1
; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %xmm5, %xmm4 {%k1}
-; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6]
-; AVX512DQ-BW-FCP-NEXT: # ymm5 = mem[0,1,0,1]
-; AVX512DQ-BW-FCP-NEXT: vpermw %ymm6, %ymm5, %ymm5
-; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm2[1,3,1,3]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,1,9,u,u,u,u,u,2,10,u,u,u,u,u,19,27,u,u,u,u,u,20,28,u,u,u,u,u,21]
-; AVX512DQ-BW-FCP-NEXT: movl $67637280, %ecx # imm = 0x4081020
-; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1
-; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm5, %ymm7 {%k1}
-; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm1[1,3,3,1]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm5 = zero,ymm5[1,9],zero,zero,zero,zero,zero,ymm5[2,10],zero,zero,zero,zero,zero,ymm5[3,19],zero,zero,zero,zero,zero,ymm5[28,20],zero,zero,zero,zero,zero,ymm5[29,21],zero
-; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,1,1,3]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1],zero,zero,zero,zero,zero,ymm0[10,2],zero,zero,zero,zero,zero,ymm0[11,3],zero,zero,zero,zero,zero,ymm0[20,28],zero,zero,zero,zero,zero,ymm0[21,29],zero,zero,zero
-; AVX512DQ-BW-FCP-NEXT: vpor %ymm5, %ymm0, %ymm0
-; AVX512DQ-BW-FCP-NEXT: movl $-2029118408, %ecx # imm = 0x870E1C38
-; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1
-; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm7, %ymm0 {%k1}
-; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm4, %zmm0, %zmm0
; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm5 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,2,2,2,4,2,2,2,4,3,3,3,3,2,2,2,4]
; AVX512DQ-BW-FCP-NEXT: vpermw %zmm6, %zmm5, %zmm5
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,4,0,0,0,4,0,1,5,0,0,1,5,2,6]
-; AVX512DQ-BW-FCP-NEXT: vpermd %zmm2, %zmm6, %zmm2
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm2 = zmm2[u,u,u,u,0,8,u,u,u,u,u,1,9,u,u,u,u,u,18,26,u,u,u,u,u,19,27,u,u,u,u,u,32,36,u,u,u,u,u,33,37,u,u,u,u,u,34,38,u,u,u,u,u,51,55,u,u,u,u,u,56,60,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,4,0,0,0,4,0,1,5,0,0,1,5,2,6]
+; AVX512DQ-BW-FCP-NEXT: vpermd %zmm2, %zmm7, %zmm7
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm7 = zmm7[u,u,u,u,0,8,u,u,u,u,u,1,9,u,u,u,u,u,18,26,u,u,u,u,u,19,27,u,u,u,u,u,32,36,u,u,u,u,u,33,37,u,u,u,u,u,34,38,u,u,u,u,u,51,55,u,u,u,u,u,56,60,u,u]
; AVX512DQ-BW-FCP-NEXT: movabsq $4647998506761461824, %rcx # imm = 0x4081020408102040
; AVX512DQ-BW-FCP-NEXT: kmovq %rcx, %k1
-; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm5, %zmm2 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm5, %zmm7 {%k1}
; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,4,0,0,1,4,5,1,5,0,0,1,5,2,6]
; AVX512DQ-BW-FCP-NEXT: vpermd %zmm3, %zmm5, %zmm3
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm3 = zmm3[0,8],zero,zero,zero,zero,zero,zmm3[1,9],zero,zero,zero,zero,zero,zmm3[2,10],zero,zero,zero,zero,zero,zmm3[19,27],zero,zero,zero,zero,zero,zmm3[20,28],zero,zero,zero,zero,zero,zmm3[33,37],zero,zero,zero,zero,zero,zmm3[34,38],zero,zero,zero,zero,zero,zmm3[51,55],zero,zero,zero,zero,zero,zmm3[56,60],zero,zero,zero,zero,zero,zmm3[57]
-; AVX512DQ-BW-FCP-NEXT: vpermd %zmm1, %zmm5, %zmm1
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm1 = zero,zero,zmm1[0,8],zero,zero,zero,zero,zero,zmm1[1,9],zero,zero,zero,zero,zero,zmm1[18,26],zero,zero,zero,zero,zero,zmm1[19,27],zero,zero,zero,zero,zero,zmm1[20,28],zero,zero,zero,zero,zero,zmm1[33,37],zero,zero,zero,zero,zero,zmm1[34,38],zero,zero,zero,zero,zero,zmm1[51,55],zero,zero,zero,zero,zero,zmm1[56,60],zero,zero,zero,zero
-; AVX512DQ-BW-FCP-NEXT: vporq %zmm3, %zmm1, %zmm1
+; AVX512DQ-BW-FCP-NEXT: vpermd %zmm1, %zmm5, %zmm5
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm5 = zero,zero,zmm5[0,8],zero,zero,zero,zero,zero,zmm5[1,9],zero,zero,zero,zero,zero,zmm5[18,26],zero,zero,zero,zero,zero,zmm5[19,27],zero,zero,zero,zero,zero,zmm5[20,28],zero,zero,zero,zero,zero,zmm5[33,37],zero,zero,zero,zero,zero,zmm5[34,38],zero,zero,zero,zero,zero,zmm5[51,55],zero,zero,zero,zero,zero,zmm5[56,60],zero,zero,zero,zero
+; AVX512DQ-BW-FCP-NEXT: vporq %zmm3, %zmm5, %zmm3
; AVX512DQ-BW-FCP-NEXT: movabsq $8133997386832558192, %rcx # imm = 0x70E1C3870E1C3870
; AVX512DQ-BW-FCP-NEXT: kmovq %rcx, %k1
-; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm2, %zmm1 {%k1}
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, (%rax)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm4, 96(%rax)
+; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm7, %zmm3 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6]
+; AVX512DQ-BW-FCP-NEXT: # ymm5 = mem[0,1,0,1]
+; AVX512DQ-BW-FCP-NEXT: vpermw %ymm6, %ymm5, %ymm5
+; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,1,3]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,1,9,u,u,u,u,u,2,10,u,u,u,u,u,19,27,u,u,u,u,u,20,28,u,u,u,u,u,21]
+; AVX512DQ-BW-FCP-NEXT: movl $67637280, %ecx # imm = 0x4081020
+; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1
+; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm5, %ymm2 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,3,1]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm1 = zero,ymm1[1,9],zero,zero,zero,zero,zero,ymm1[2,10],zero,zero,zero,zero,zero,ymm1[3,19],zero,zero,zero,zero,zero,ymm1[28,20],zero,zero,zero,zero,zero,ymm1[29,21],zero
+; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,1,1,3]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1],zero,zero,zero,zero,zero,ymm0[10,2],zero,zero,zero,zero,zero,ymm0[11,3],zero,zero,zero,zero,zero,ymm0[20,28],zero,zero,zero,zero,zero,ymm0[21,29],zero,zero,zero
+; AVX512DQ-BW-FCP-NEXT: vpor %ymm1, %ymm0, %ymm0
+; AVX512DQ-BW-FCP-NEXT: movl $-2029118408, %ecx # imm = 0x870E1C38
+; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1
+; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm2, %ymm0 {%k1}
; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm0, 64(%rax)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, (%rax)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm4, 96(%rax)
; AVX512DQ-BW-FCP-NEXT: vzeroupper
; AVX512DQ-BW-FCP-NEXT: retq
%in.vec0 = load <16 x i8>, ptr %in.vecptr0, align 64
More information about the llvm-commits
mailing list