[llvm] c1af6ab - [X86] getFauxShuffleMask - recognise CONCAT(SUB0, SUB1) style patterns
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Tue Mar 12 10:40:38 PDT 2024
Author: Simon Pilgrim
Date: 2024-03-12T17:40:19Z
New Revision: c1af6ab505a83bfb4fc8752591ad333190bc9389
URL: https://github.com/llvm/llvm-project/commit/c1af6ab505a83bfb4fc8752591ad333190bc9389
DIFF: https://github.com/llvm/llvm-project/commit/c1af6ab505a83bfb4fc8752591ad333190bc9389.diff
LOG: [X86] getFauxShuffleMask - recognise CONCAT(SUB0, SUB1) style patterns
Handles the INSERT_SUBVECTOR(INSERT_SUBVECTOR(UNDEF,SUB0,0),SUB1,N) pattern
Currently limited to v8i64/v8f64 cases, as only AVX512 has decent cross-lane 2-input shuffles; the plan is to relax this as I deal with some regressions.
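To illustrate the kind of input the fold targets, here is a minimal LLVM IR sketch (not taken from the patch; the function name and shuffle masks are illustrative). The first shufflevector concatenates two v4f64 halves, which typically reaches x86 lowering as the INSERT_SUBVECTOR chain described above; getFauxShuffleMask can now model that chain as CONCAT(SUB0, SUB1), letting the following cross-lane permute be combined as a single 2-input AVX512 shuffle:

  define <8 x double> @concat_permute(<4 x double> %sub0, <4 x double> %sub1) {
    ; Concatenate the two 256-bit halves into a 512-bit vector. On AVX512 this is
    ; usually built as INSERT_SUBVECTOR(INSERT_SUBVECTOR(UNDEF, SUB0, 0), SUB1, 4),
    ; which the new code exposes with the faux mask [0,1,2,3,8,9,10,11] and
    ; inputs {SUB0, SUB1}.
    %concat = shufflevector <4 x double> %sub0, <4 x double> %sub1,
                            <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
    ; A cross-lane 2-input permute of the concatenation (an interleave here).
    %perm = shufflevector <8 x double> %concat, <8 x double> poison,
                          <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
    ret <8 x double> %perm
  }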
Added:
Modified:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll
llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-8.ll
llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll
llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-8.ll
llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-4.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 72b45d462dfee2..2b5e3c0379a138 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -5858,6 +5858,23 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
Ops.push_back(SubBCSrc);
return true;
}
+ // Handle CONCAT(SUB0, SUB1).
+ // Limit this to vXi64 512-bit vector cases to make the most of AVX512
+ // cross lane shuffles.
+ if (Depth > 0 && InsertIdx == NumSubElts && NumElts == (2 * NumSubElts) &&
+ NumBitsPerElt == 64 && NumSizeInBits == 512 &&
+ Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
+ Src.getOperand(0).isUndef() &&
+ Src.getOperand(1).getValueType() == SubVT &&
+ Src.getConstantOperandVal(2) == 0) {
+ for (int i = 0; i != (int)NumSubElts; ++i)
+ Mask.push_back(i);
+ for (int i = 0; i != (int)NumSubElts; ++i)
+ Mask.push_back(i + NumElts);
+ Ops.push_back(Src.getOperand(1));
+ Ops.push_back(Sub);
+ return true;
+ }
// Handle INSERT_SUBVECTOR(SRC0, SHUFFLE(SRC1)).
SmallVector<int, 64> SubMask;
SmallVector<SDValue, 2> SubInputs;
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll
index 79cc8e49f1fdb1..9e70aef868858c 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll
@@ -821,9 +821,8 @@ define void @store_i16_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,2,4,0]
; AVX512BW-FCP-NEXT: vpermi2q %ymm3, %ymm0, %ymm1
-; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm0
-; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,4,8,12,16,20,24,1,5,9,13,17,21,25,2,6,10,14,18,22,26,3,7,11,15,19,23,27,0,0,0,0]
-; AVX512BW-FCP-NEXT: vpermw %zmm0, %zmm1, %zmm0
+; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm0 = [0,4,8,12,32,36,40,1,5,9,13,33,37,41,2,6,10,14,34,38,42,3,7,11,15,35,39,43,0,0,0,0]
+; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm2, %zmm0
; AVX512BW-FCP-NEXT: vextracti32x4 $2, %zmm0, 32(%rax)
; AVX512BW-FCP-NEXT: vextracti32x4 $3, %zmm0, %xmm1
; AVX512BW-FCP-NEXT: vmovq %xmm1, 48(%rax)
@@ -873,9 +872,8 @@ define void @store_i16_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,2,4,0]
; AVX512DQ-BW-FCP-NEXT: vpermi2q %ymm3, %ymm0, %ymm1
-; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm0
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,4,8,12,16,20,24,1,5,9,13,17,21,25,2,6,10,14,18,22,26,3,7,11,15,19,23,27,0,0,0,0]
-; AVX512DQ-BW-FCP-NEXT: vpermw %zmm0, %zmm1, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm0 = [0,4,8,12,32,36,40,1,5,9,13,33,37,41,2,6,10,14,34,38,42,3,7,11,15,35,39,43,0,0,0,0]
+; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm2, %zmm0
; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $2, %zmm0, 32(%rax)
; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $3, %zmm0, %xmm1
; AVX512DQ-BW-FCP-NEXT: vmovq %xmm1, 48(%rax)
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-8.ll
index 194b715b6594a6..32825f291e98b6 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-8.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-8.ll
@@ -762,10 +762,9 @@ define void @store_i16_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
; AVX512BW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
-; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,4,8,12,16,20,24,28,1,5,9,13,17,21,25,29,2,6,10,14,18,22,26,30,3,7,11,15,19,23,27,31]
-; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0
-; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rax)
+; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,4,8,12,32,36,40,44,1,5,9,13,33,37,41,45,2,6,10,14,34,38,42,46,3,7,11,15,35,39,43,47]
+; AVX512BW-NEXT: vpermi2w %zmm2, %zmm0, %zmm1
+; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rax)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
@@ -788,10 +787,9 @@ define void @store_i16_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
-; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,4,8,12,16,20,24,28,1,5,9,13,17,21,25,29,2,6,10,14,18,22,26,30,3,7,11,15,19,23,27,31]
-; AVX512BW-FCP-NEXT: vpermw %zmm0, %zmm1, %zmm0
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, (%rax)
+; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,4,8,12,32,36,40,44,1,5,9,13,33,37,41,45,2,6,10,14,34,38,42,46,3,7,11,15,35,39,43,47]
+; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm0, %zmm1
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, (%rax)
; AVX512BW-FCP-NEXT: vzeroupper
; AVX512BW-FCP-NEXT: retq
;
@@ -814,10 +812,9 @@ define void @store_i16_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
-; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,4,8,12,16,20,24,28,1,5,9,13,17,21,25,29,2,6,10,14,18,22,26,30,3,7,11,15,19,23,27,31]
-; AVX512DQ-BW-NEXT: vpermw %zmm0, %zmm1, %zmm0
-; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, (%rax)
+; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,4,8,12,32,36,40,44,1,5,9,13,33,37,41,45,2,6,10,14,34,38,42,46,3,7,11,15,35,39,43,47]
+; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm0, %zmm1
+; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, (%rax)
; AVX512DQ-BW-NEXT: vzeroupper
; AVX512DQ-BW-NEXT: retq
;
@@ -840,10 +837,9 @@ define void @store_i16_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,4,8,12,16,20,24,28,1,5,9,13,17,21,25,29,2,6,10,14,18,22,26,30,3,7,11,15,19,23,27,31]
-; AVX512DQ-BW-FCP-NEXT: vpermw %zmm0, %zmm1, %zmm0
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, (%rax)
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,4,8,12,32,36,40,44,1,5,9,13,33,37,41,45,2,6,10,14,34,38,42,46,3,7,11,15,35,39,43,47]
+; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm0, %zmm1
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, (%rax)
; AVX512DQ-BW-FCP-NEXT: vzeroupper
; AVX512DQ-BW-FCP-NEXT: retq
%in.vec0 = load <4 x i16>, ptr %in.vecptr0, align 64
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll
index 837d990596a5af..45a76599d3e9d7 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll
@@ -227,9 +227,8 @@ define void @store_i32_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,2,4,0]
; AVX512-FCP-NEXT: vpermi2q %ymm3, %ymm0, %ymm1
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm0
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,2,4,6,8,10,12,1,3,5,7,9,11,13,0,0]
-; AVX512-FCP-NEXT: vpermd %zmm0, %zmm1, %zmm0
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,2,4,6,16,18,20,1,3,5,7,17,19,21,0,0]
+; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm2, %zmm0
; AVX512-FCP-NEXT: vextracti32x4 $2, %zmm0, 32(%rax)
; AVX512-FCP-NEXT: vextracti32x4 $3, %zmm0, %xmm1
; AVX512-FCP-NEXT: vmovq %xmm1, 48(%rax)
@@ -279,9 +278,8 @@ define void @store_i32_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,2,4,0]
; AVX512DQ-FCP-NEXT: vpermi2q %ymm3, %ymm0, %ymm1
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm0
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,2,4,6,8,10,12,1,3,5,7,9,11,13,0,0]
-; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm1, %zmm0
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,2,4,6,16,18,20,1,3,5,7,17,19,21,0,0]
+; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm2, %zmm0
; AVX512DQ-FCP-NEXT: vextracti32x4 $2, %zmm0, 32(%rax)
; AVX512DQ-FCP-NEXT: vextracti32x4 $3, %zmm0, %xmm1
; AVX512DQ-FCP-NEXT: vmovq %xmm1, 48(%rax)
@@ -331,9 +329,8 @@ define void @store_i32_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,2,4,0]
; AVX512BW-FCP-NEXT: vpermi2q %ymm3, %ymm0, %ymm1
-; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm0
-; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,2,4,6,8,10,12,1,3,5,7,9,11,13,0,0]
-; AVX512BW-FCP-NEXT: vpermd %zmm0, %zmm1, %zmm0
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,2,4,6,16,18,20,1,3,5,7,17,19,21,0,0]
+; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm2, %zmm0
; AVX512BW-FCP-NEXT: vextracti32x4 $2, %zmm0, 32(%rax)
; AVX512BW-FCP-NEXT: vextracti32x4 $3, %zmm0, %xmm1
; AVX512BW-FCP-NEXT: vmovq %xmm1, 48(%rax)
@@ -383,9 +380,8 @@ define void @store_i32_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,2,4,0]
; AVX512DQ-BW-FCP-NEXT: vpermi2q %ymm3, %ymm0, %ymm1
-; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm0
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,2,4,6,8,10,12,1,3,5,7,9,11,13,0,0]
-; AVX512DQ-BW-FCP-NEXT: vpermd %zmm0, %zmm1, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,2,4,6,16,18,20,1,3,5,7,17,19,21,0,0]
+; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm2, %zmm0
; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $2, %zmm0, 32(%rax)
; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $3, %zmm0, %xmm1
; AVX512DQ-BW-FCP-NEXT: vmovq %xmm1, 48(%rax)
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-8.ll
index 955927eb769126..265f6daeb20038 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-8.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-8.ll
@@ -160,24 +160,23 @@ define void @store_i32_stride8_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r11
-; AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX512-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX512-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX512-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
-; AVX512-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
-; AVX512-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero
-; AVX512-NEXT: vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0]
-; AVX512-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero
-; AVX512-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
-; AVX512-NEXT: vmovlhps {{.*#+}} xmm3 = xmm4[0],xmm3[0]
-; AVX512-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
-; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX512-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
-; AVX512-NEXT: vmovaps {{.*#+}} zmm1 = [0,2,4,6,8,10,12,14,1,3,5,7,9,11,13,15]
-; AVX512-NEXT: vpermps %zmm0, %zmm1, %zmm0
-; AVX512-NEXT: vmovaps %zmm0, (%rax)
+; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX512-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
+; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX512-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
+; AVX512-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
+; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; AVX512-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
+; AVX512-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero
+; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
+; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,2,4,6,16,18,20,22,1,3,5,7,17,19,21,23]
+; AVX512-NEXT: vpermi2d %zmm2, %zmm0, %zmm1
+; AVX512-NEXT: vmovdqa64 %zmm1, (%rax)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
@@ -186,24 +185,23 @@ define void @store_i32_stride8_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
-; AVX512-FCP-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX512-FCP-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX512-FCP-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512-FCP-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX512-FCP-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
-; AVX512-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512-FCP-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
-; AVX512-FCP-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero
-; AVX512-FCP-NEXT: vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0]
-; AVX512-FCP-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero
-; AVX512-FCP-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
-; AVX512-FCP-NEXT: vmovlhps {{.*#+}} xmm3 = xmm4[0],xmm3[0]
-; AVX512-FCP-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
-; AVX512-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX512-FCP-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
-; AVX512-FCP-NEXT: vmovaps {{.*#+}} zmm1 = [0,2,4,6,8,10,12,14,1,3,5,7,9,11,13,15]
-; AVX512-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm0
-; AVX512-FCP-NEXT: vmovaps %zmm0, (%rax)
+; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
+; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
+; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
+; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
+; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero
+; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,2,4,6,16,18,20,22,1,3,5,7,17,19,21,23]
+; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm1
+; AVX512-FCP-NEXT: vmovdqa64 %zmm1, (%rax)
; AVX512-FCP-NEXT: vzeroupper
; AVX512-FCP-NEXT: retq
;
@@ -212,24 +210,23 @@ define void @store_i32_stride8_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r11
-; AVX512DQ-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX512DQ-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX512DQ-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512DQ-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX512DQ-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
-; AVX512DQ-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512DQ-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
-; AVX512DQ-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero
-; AVX512DQ-NEXT: vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0]
-; AVX512DQ-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero
-; AVX512DQ-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
-; AVX512DQ-NEXT: vmovlhps {{.*#+}} xmm3 = xmm4[0],xmm3[0]
-; AVX512DQ-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
-; AVX512DQ-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX512DQ-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
-; AVX512DQ-NEXT: vmovaps {{.*#+}} zmm1 = [0,2,4,6,8,10,12,14,1,3,5,7,9,11,13,15]
-; AVX512DQ-NEXT: vpermps %zmm0, %zmm1, %zmm0
-; AVX512DQ-NEXT: vmovaps %zmm0, (%rax)
+; AVX512DQ-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX512DQ-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512DQ-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX512DQ-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
+; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX512DQ-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
+; AVX512DQ-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
+; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; AVX512DQ-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
+; AVX512DQ-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero
+; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
+; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,2,4,6,16,18,20,22,1,3,5,7,17,19,21,23]
+; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm0, %zmm1
+; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%rax)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
@@ -238,24 +235,23 @@ define void @store_i32_stride8_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
-; AVX512DQ-FCP-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX512DQ-FCP-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX512DQ-FCP-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512DQ-FCP-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX512DQ-FCP-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
-; AVX512DQ-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512DQ-FCP-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
-; AVX512DQ-FCP-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero
-; AVX512DQ-FCP-NEXT: vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0]
-; AVX512DQ-FCP-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero
-; AVX512DQ-FCP-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
-; AVX512DQ-FCP-NEXT: vmovlhps {{.*#+}} xmm3 = xmm4[0],xmm3[0]
-; AVX512DQ-FCP-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
-; AVX512DQ-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
-; AVX512DQ-FCP-NEXT: vmovaps {{.*#+}} zmm1 = [0,2,4,6,8,10,12,14,1,3,5,7,9,11,13,15]
-; AVX512DQ-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm0
-; AVX512DQ-FCP-NEXT: vmovaps %zmm0, (%rax)
+; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
+; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
+; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
+; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
+; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero
+; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,2,4,6,16,18,20,22,1,3,5,7,17,19,21,23]
+; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm1
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, (%rax)
; AVX512DQ-FCP-NEXT: vzeroupper
; AVX512DQ-FCP-NEXT: retq
;
@@ -264,24 +260,23 @@ define void @store_i32_stride8_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r11
-; AVX512BW-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX512BW-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX512BW-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512BW-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX512BW-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
-; AVX512BW-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512BW-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
-; AVX512BW-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero
-; AVX512BW-NEXT: vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0]
-; AVX512BW-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero
-; AVX512BW-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
-; AVX512BW-NEXT: vmovlhps {{.*#+}} xmm3 = xmm4[0],xmm3[0]
-; AVX512BW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
-; AVX512BW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX512BW-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
-; AVX512BW-NEXT: vmovaps {{.*#+}} zmm1 = [0,2,4,6,8,10,12,14,1,3,5,7,9,11,13,15]
-; AVX512BW-NEXT: vpermps %zmm0, %zmm1, %zmm0
-; AVX512BW-NEXT: vmovaps %zmm0, (%rax)
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
+; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
+; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero
+; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
+; AVX512BW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,2,4,6,16,18,20,22,1,3,5,7,17,19,21,23]
+; AVX512BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm1
+; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rax)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
@@ -290,24 +285,23 @@ define void @store_i32_stride8_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
-; AVX512BW-FCP-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX512BW-FCP-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX512BW-FCP-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512BW-FCP-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX512BW-FCP-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
-; AVX512BW-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512BW-FCP-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
-; AVX512BW-FCP-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero
-; AVX512BW-FCP-NEXT: vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0]
-; AVX512BW-FCP-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero
-; AVX512BW-FCP-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
-; AVX512BW-FCP-NEXT: vmovlhps {{.*#+}} xmm3 = xmm4[0],xmm3[0]
-; AVX512BW-FCP-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
-; AVX512BW-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX512BW-FCP-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
-; AVX512BW-FCP-NEXT: vmovaps {{.*#+}} zmm1 = [0,2,4,6,8,10,12,14,1,3,5,7,9,11,13,15]
-; AVX512BW-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm0
-; AVX512BW-FCP-NEXT: vmovaps %zmm0, (%rax)
+; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
+; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
+; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
+; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
+; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero
+; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
+; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,2,4,6,16,18,20,22,1,3,5,7,17,19,21,23]
+; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm1
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, (%rax)
; AVX512BW-FCP-NEXT: vzeroupper
; AVX512BW-FCP-NEXT: retq
;
@@ -316,24 +310,23 @@ define void @store_i32_stride8_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r11
-; AVX512DQ-BW-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX512DQ-BW-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX512DQ-BW-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512DQ-BW-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX512DQ-BW-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
-; AVX512DQ-BW-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512DQ-BW-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
-; AVX512DQ-BW-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero
-; AVX512DQ-BW-NEXT: vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0]
-; AVX512DQ-BW-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero
-; AVX512DQ-BW-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
-; AVX512DQ-BW-NEXT: vmovlhps {{.*#+}} xmm3 = xmm4[0],xmm3[0]
-; AVX512DQ-BW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
-; AVX512DQ-BW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX512DQ-BW-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
-; AVX512DQ-BW-NEXT: vmovaps {{.*#+}} zmm1 = [0,2,4,6,8,10,12,14,1,3,5,7,9,11,13,15]
-; AVX512DQ-BW-NEXT: vpermps %zmm0, %zmm1, %zmm0
-; AVX512DQ-BW-NEXT: vmovaps %zmm0, (%rax)
+; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
+; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
+; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
+; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
+; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero
+; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
+; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,2,4,6,16,18,20,22,1,3,5,7,17,19,21,23]
+; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm1
+; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, (%rax)
; AVX512DQ-BW-NEXT: vzeroupper
; AVX512DQ-BW-NEXT: retq
;
@@ -342,24 +335,23 @@ define void @store_i32_stride8_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
-; AVX512DQ-BW-FCP-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX512DQ-BW-FCP-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX512DQ-BW-FCP-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512DQ-BW-FCP-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX512DQ-BW-FCP-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
-; AVX512DQ-BW-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512DQ-BW-FCP-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
-; AVX512DQ-BW-FCP-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero
-; AVX512DQ-BW-FCP-NEXT: vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0]
-; AVX512DQ-BW-FCP-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero
-; AVX512DQ-BW-FCP-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
-; AVX512DQ-BW-FCP-NEXT: vmovlhps {{.*#+}} xmm3 = xmm4[0],xmm3[0]
-; AVX512DQ-BW-FCP-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
-; AVX512DQ-BW-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX512DQ-BW-FCP-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
-; AVX512DQ-BW-FCP-NEXT: vmovaps {{.*#+}} zmm1 = [0,2,4,6,8,10,12,14,1,3,5,7,9,11,13,15]
-; AVX512DQ-BW-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm0
-; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, (%rax)
+; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
+; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
+; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
+; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
+; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero
+; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
+; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,2,4,6,16,18,20,22,1,3,5,7,17,19,21,23]
+; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm1
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, (%rax)
; AVX512DQ-BW-FCP-NEXT: vzeroupper
; AVX512DQ-BW-FCP-NEXT: retq
%in.vec0 = load <2 x i32>, ptr %in.vecptr0, align 64
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-4.ll
index 38623c6ce0cb05..ded7c002c8735b 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-4.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-4.ll
@@ -94,105 +94,97 @@ define void @store_i64_stride4_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
;
; AVX512-LABEL: store_i64_stride4_vf2:
; AVX512: # %bb.0:
-; AVX512-NEXT: vmovaps (%rdi), %xmm0
-; AVX512-NEXT: vmovaps (%rdx), %xmm1
-; AVX512-NEXT: vinsertf128 $1, (%rcx), %ymm1, %ymm1
-; AVX512-NEXT: vinsertf128 $1, (%rsi), %ymm0, %ymm0
-; AVX512-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512-NEXT: vmovaps {{.*#+}} zmm1 = [0,2,4,6,1,3,5,7]
-; AVX512-NEXT: vpermpd %zmm0, %zmm1, %zmm0
-; AVX512-NEXT: vmovaps %zmm0, (%r8)
+; AVX512-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512-NEXT: vmovdqa (%rdx), %xmm1
+; AVX512-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
+; AVX512-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
+; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,2,8,10,1,3,9,11]
+; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
+; AVX512-NEXT: vmovdqa64 %zmm2, (%r8)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; AVX512-FCP-LABEL: store_i64_stride4_vf2:
; AVX512-FCP: # %bb.0:
-; AVX512-FCP-NEXT: vmovaps (%rdi), %xmm0
-; AVX512-FCP-NEXT: vmovaps (%rdx), %xmm1
-; AVX512-FCP-NEXT: vinsertf128 $1, (%rcx), %ymm1, %ymm1
-; AVX512-FCP-NEXT: vinsertf128 $1, (%rsi), %ymm0, %ymm0
-; AVX512-FCP-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512-FCP-NEXT: vmovaps {{.*#+}} zmm1 = [0,2,4,6,1,3,5,7]
-; AVX512-FCP-NEXT: vpermpd %zmm0, %zmm1, %zmm0
-; AVX512-FCP-NEXT: vmovaps %zmm0, (%r8)
+; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm1
+; AVX512-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
+; AVX512-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
+; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,2,8,10,1,3,9,11]
+; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
+; AVX512-FCP-NEXT: vmovdqa64 %zmm2, (%r8)
; AVX512-FCP-NEXT: vzeroupper
; AVX512-FCP-NEXT: retq
;
; AVX512DQ-LABEL: store_i64_stride4_vf2:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vmovaps (%rdi), %xmm0
-; AVX512DQ-NEXT: vmovaps (%rdx), %xmm1
-; AVX512DQ-NEXT: vinsertf128 $1, (%rcx), %ymm1, %ymm1
-; AVX512DQ-NEXT: vinsertf128 $1, (%rsi), %ymm0, %ymm0
-; AVX512DQ-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512DQ-NEXT: vmovaps {{.*#+}} zmm1 = [0,2,4,6,1,3,5,7]
-; AVX512DQ-NEXT: vpermpd %zmm0, %zmm1, %zmm0
-; AVX512DQ-NEXT: vmovaps %zmm0, (%r8)
+; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm1
+; AVX512DQ-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
+; AVX512DQ-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
+; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,2,8,10,1,3,9,11]
+; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
+; AVX512DQ-NEXT: vmovdqa64 %zmm2, (%r8)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512DQ-FCP-LABEL: store_i64_stride4_vf2:
; AVX512DQ-FCP: # %bb.0:
-; AVX512DQ-FCP-NEXT: vmovaps (%rdi), %xmm0
-; AVX512DQ-FCP-NEXT: vmovaps (%rdx), %xmm1
-; AVX512DQ-FCP-NEXT: vinsertf128 $1, (%rcx), %ymm1, %ymm1
-; AVX512DQ-FCP-NEXT: vinsertf128 $1, (%rsi), %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512DQ-FCP-NEXT: vmovaps {{.*#+}} zmm1 = [0,2,4,6,1,3,5,7]
-; AVX512DQ-FCP-NEXT: vpermpd %zmm0, %zmm1, %zmm0
-; AVX512DQ-FCP-NEXT: vmovaps %zmm0, (%r8)
+; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm1
+; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
+; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,2,8,10,1,3,9,11]
+; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, (%r8)
; AVX512DQ-FCP-NEXT: vzeroupper
; AVX512DQ-FCP-NEXT: retq
;
; AVX512BW-LABEL: store_i64_stride4_vf2:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovaps (%rdi), %xmm0
-; AVX512BW-NEXT: vmovaps (%rdx), %xmm1
-; AVX512BW-NEXT: vinsertf128 $1, (%rcx), %ymm1, %ymm1
-; AVX512BW-NEXT: vinsertf128 $1, (%rsi), %ymm0, %ymm0
-; AVX512BW-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vmovaps {{.*#+}} zmm1 = [0,2,4,6,1,3,5,7]
-; AVX512BW-NEXT: vpermpd %zmm0, %zmm1, %zmm0
-; AVX512BW-NEXT: vmovaps %zmm0, (%r8)
+; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512BW-NEXT: vmovdqa (%rdx), %xmm1
+; AVX512BW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
+; AVX512BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
+; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,2,8,10,1,3,9,11]
+; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
+; AVX512BW-NEXT: vmovdqa64 %zmm2, (%r8)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BW-FCP-LABEL: store_i64_stride4_vf2:
; AVX512BW-FCP: # %bb.0:
-; AVX512BW-FCP-NEXT: vmovaps (%rdi), %xmm0
-; AVX512BW-FCP-NEXT: vmovaps (%rdx), %xmm1
-; AVX512BW-FCP-NEXT: vinsertf128 $1, (%rcx), %ymm1, %ymm1
-; AVX512BW-FCP-NEXT: vinsertf128 $1, (%rsi), %ymm0, %ymm0
-; AVX512BW-FCP-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512BW-FCP-NEXT: vmovaps {{.*#+}} zmm1 = [0,2,4,6,1,3,5,7]
-; AVX512BW-FCP-NEXT: vpermpd %zmm0, %zmm1, %zmm0
-; AVX512BW-FCP-NEXT: vmovaps %zmm0, (%r8)
+; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm1
+; AVX512BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
+; AVX512BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
+; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,2,8,10,1,3,9,11]
+; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, (%r8)
; AVX512BW-FCP-NEXT: vzeroupper
; AVX512BW-FCP-NEXT: retq
;
; AVX512DQ-BW-LABEL: store_i64_stride4_vf2:
; AVX512DQ-BW: # %bb.0:
-; AVX512DQ-BW-NEXT: vmovaps (%rdi), %xmm0
-; AVX512DQ-BW-NEXT: vmovaps (%rdx), %xmm1
-; AVX512DQ-BW-NEXT: vinsertf128 $1, (%rcx), %ymm1, %ymm1
-; AVX512DQ-BW-NEXT: vinsertf128 $1, (%rsi), %ymm0, %ymm0
-; AVX512DQ-BW-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512DQ-BW-NEXT: vmovaps {{.*#+}} zmm1 = [0,2,4,6,1,3,5,7]
-; AVX512DQ-BW-NEXT: vpermpd %zmm0, %zmm1, %zmm0
-; AVX512DQ-BW-NEXT: vmovaps %zmm0, (%r8)
+; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm1
+; AVX512DQ-BW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
+; AVX512DQ-BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
+; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,2,8,10,1,3,9,11]
+; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
+; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, (%r8)
; AVX512DQ-BW-NEXT: vzeroupper
; AVX512DQ-BW-NEXT: retq
;
; AVX512DQ-BW-FCP-LABEL: store_i64_stride4_vf2:
; AVX512DQ-BW-FCP: # %bb.0:
-; AVX512DQ-BW-FCP-NEXT: vmovaps (%rdi), %xmm0
-; AVX512DQ-BW-FCP-NEXT: vmovaps (%rdx), %xmm1
-; AVX512DQ-BW-FCP-NEXT: vinsertf128 $1, (%rcx), %ymm1, %ymm1
-; AVX512DQ-BW-FCP-NEXT: vinsertf128 $1, (%rsi), %ymm0, %ymm0
-; AVX512DQ-BW-FCP-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512DQ-BW-FCP-NEXT: vmovaps {{.*#+}} zmm1 = [0,2,4,6,1,3,5,7]
-; AVX512DQ-BW-FCP-NEXT: vpermpd %zmm0, %zmm1, %zmm0
-; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, (%r8)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm1
+; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
+; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,2,8,10,1,3,9,11]
+; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, (%r8)
; AVX512DQ-BW-FCP-NEXT: vzeroupper
; AVX512DQ-BW-FCP-NEXT: retq
%in.vec0 = load <2 x i64>, ptr %in.vecptr0, align 64