[llvm] [X86] splitVector - use collectConcatOps to find pre-split subvectors (PR #142774)
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Wed Jun 4 06:27:19 PDT 2025
https://github.com/RKSimon created https://github.com/llvm/llvm-project/pull/142774
Don't just match ISD::CONCAT_VECTORS nodes - using collectConcatOps to find pre-split subvectors matches more closely with isFreeToSplitVector.
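For reference, a minimal sketch of the new splitVector flow (not the exact in-tree code - the name splitVectorSketch and the plain EXTRACT_SUBVECTOR fallback are simplifications here; the real function also asserts an even operand count and uses the extractSubVector helper). collectConcatOps is assumed to return the pre-split subvector operands whether Op is a literal ISD::CONCAT_VECTORS or an equivalent pattern such as a chain of subvector inserts, which is what brings splitVector in line with isFreeToSplitVector:

static std::pair<SDValue, SDValue> splitVectorSketch(SDValue Op,
                                                     SelectionDAG &DAG,
                                                     const SDLoc &dl) {
  EVT VT = Op.getValueType();
  EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());

  // New behaviour: collect pre-split subvectors instead of only matching a
  // literal ISD::CONCAT_VECTORS node.
  SmallVector<SDValue, 4> SubOps;
  if (collectConcatOps(Op.getNode(), SubOps, DAG)) {
    // Re-concatenate the lower and upper halves of the collected subvectors.
    unsigned HalfOps = SubOps.size() / 2;
    SmallVector<SDValue, 2> LoOps(SubOps.begin(), SubOps.begin() + HalfOps);
    SmallVector<SDValue, 2> HiOps(SubOps.begin() + HalfOps, SubOps.end());
    return std::make_pair(DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, LoOps),
                          DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, HiOps));
  }

  // Fallback (simplified): extract the lower and upper halves directly.
  SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, HalfVT, Op,
                           DAG.getVectorIdxConstant(0, dl));
  SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, HalfVT, Op,
                           DAG.getVectorIdxConstant(VT.getVectorNumElements() / 2, dl));
  return std::make_pair(Lo, Hi);
}

The test changes below are register-allocation and store-ordering reshuffles that follow from splitting through the pre-split subvectors; no extra instructions are introduced.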
>From 786fa9ffe54b93b41d3f10a5487936c275645422 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Wed, 4 Jun 2025 14:26:05 +0100
Subject: [PATCH] [X86] splitVector - use collectConcatOps to find pre-split
subvectors
Don't just match ISD::CONCAT_VECTORS nodes - using collectConcatOps to find pre-split subvectors matches more closely with isFreeToSplitVector.
---
llvm/lib/Target/X86/X86ISelLowering.cpp | 12 +-
.../vector-interleaved-load-i16-stride-2.ll | 84 ++--
.../vector-interleaved-store-i16-stride-2.ll | 432 +++++++++---------
.../vector-interleaved-store-i32-stride-3.ll | 4 +-
.../vector-interleaved-store-i32-stride-4.ll | 4 +-
.../vector-interleaved-store-i32-stride-5.ll | 4 +-
.../vector-interleaved-store-i32-stride-7.ll | 10 +-
.../vector-interleaved-store-i32-stride-8.ll | 8 +-
.../vector-interleaved-store-i8-stride-2.ll | 144 +++---
.../vector-interleaved-store-i8-stride-3.ll | 189 ++++----
.../zero_extend_vector_inreg_of_broadcast.ll | 184 ++++----
11 files changed, 551 insertions(+), 524 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 24df848f87b9b..edf68964db833 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -4349,13 +4349,13 @@ static std::pair<SDValue, SDValue> splitVector(SDValue Op, SelectionDAG &DAG,
assert((NumElems % 2) == 0 && (SizeInBits % 2) == 0 &&
"Can't split odd sized vector");
- if (Op.getOpcode() == ISD::CONCAT_VECTORS) {
- assert((Op.getNumOperands() % 2) == 0 &&
- "Can't split odd sized vector concat");
- unsigned HalfOps = Op.getNumOperands() / 2;
+ SmallVector<SDValue, 4> SubOps;
+ if (collectConcatOps(Op.getNode(), SubOps, DAG)) {
+ assert((SubOps.size() % 2) == 0 && "Can't split odd sized vector concat");
+ unsigned HalfOps = SubOps.size() / 2;
EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
- SmallVector<SDValue, 2> LoOps(Op->op_begin(), Op->op_begin() + HalfOps);
- SmallVector<SDValue, 2> HiOps(Op->op_begin() + HalfOps, Op->op_end());
+ SmallVector<SDValue, 2> LoOps(SubOps.begin(), SubOps.begin() + HalfOps);
+ SmallVector<SDValue, 2> HiOps(SubOps.begin() + HalfOps, SubOps.end());
SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, LoOps);
SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, HiOps);
return std::make_pair(Lo, Hi);
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll
index b3d8d05f69947..dbb4b9f64f4b7 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll
@@ -1105,19 +1105,18 @@ define void @load_i16_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no
; AVX512-VL-NEXT: vmovdqa64 64(%rdi), %zmm1
; AVX512-VL-NEXT: vmovdqa64 128(%rdi), %zmm2
; AVX512-VL-NEXT: vmovdqa64 192(%rdi), %zmm3
-; AVX512-VL-NEXT: vpmovdw %zmm1, %ymm4
-; AVX512-VL-NEXT: vpsrld $16, %zmm1, %zmm1
-; AVX512-VL-NEXT: vpsrld $16, %zmm0, %zmm5
-; AVX512-VL-NEXT: vpsrld $16, %zmm3, %zmm6
-; AVX512-VL-NEXT: vpsrld $16, %zmm2, %zmm7
+; AVX512-VL-NEXT: vpsrld $16, %zmm0, %zmm4
+; AVX512-VL-NEXT: vpsrld $16, %zmm1, %zmm5
+; AVX512-VL-NEXT: vpsrld $16, %zmm2, %zmm6
+; AVX512-VL-NEXT: vpsrld $16, %zmm3, %zmm7
+; AVX512-VL-NEXT: vpmovdw %zmm1, 32(%rsi)
; AVX512-VL-NEXT: vpmovdw %zmm0, (%rsi)
-; AVX512-VL-NEXT: vmovdqa %ymm4, 32(%rsi)
-; AVX512-VL-NEXT: vpmovdw %zmm2, 64(%rsi)
; AVX512-VL-NEXT: vpmovdw %zmm3, 96(%rsi)
-; AVX512-VL-NEXT: vpmovdw %zmm7, 64(%rdx)
-; AVX512-VL-NEXT: vpmovdw %zmm6, 96(%rdx)
-; AVX512-VL-NEXT: vpmovdw %zmm5, (%rdx)
-; AVX512-VL-NEXT: vpmovdw %zmm1, 32(%rdx)
+; AVX512-VL-NEXT: vpmovdw %zmm2, 64(%rsi)
+; AVX512-VL-NEXT: vpmovdw %zmm7, 96(%rdx)
+; AVX512-VL-NEXT: vpmovdw %zmm6, 64(%rdx)
+; AVX512-VL-NEXT: vpmovdw %zmm5, 32(%rdx)
+; AVX512-VL-NEXT: vpmovdw %zmm4, (%rdx)
; AVX512-VL-NEXT: vzeroupper
; AVX512-VL-NEXT: retq
;
@@ -1127,19 +1126,18 @@ define void @load_i16_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no
; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1
; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2
; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3
-; AVX512-FCP-NEXT: vpmovdw %zmm1, %ymm4
-; AVX512-FCP-NEXT: vpsrld $16, %zmm1, %zmm1
-; AVX512-FCP-NEXT: vpsrld $16, %zmm0, %zmm5
-; AVX512-FCP-NEXT: vpsrld $16, %zmm3, %zmm6
-; AVX512-FCP-NEXT: vpsrld $16, %zmm2, %zmm7
+; AVX512-FCP-NEXT: vpsrld $16, %zmm0, %zmm4
+; AVX512-FCP-NEXT: vpsrld $16, %zmm1, %zmm5
+; AVX512-FCP-NEXT: vpsrld $16, %zmm2, %zmm6
+; AVX512-FCP-NEXT: vpsrld $16, %zmm3, %zmm7
+; AVX512-FCP-NEXT: vpmovdw %zmm1, 32(%rsi)
; AVX512-FCP-NEXT: vpmovdw %zmm0, (%rsi)
-; AVX512-FCP-NEXT: vmovdqa %ymm4, 32(%rsi)
-; AVX512-FCP-NEXT: vpmovdw %zmm2, 64(%rsi)
; AVX512-FCP-NEXT: vpmovdw %zmm3, 96(%rsi)
-; AVX512-FCP-NEXT: vpmovdw %zmm7, 64(%rdx)
-; AVX512-FCP-NEXT: vpmovdw %zmm6, 96(%rdx)
-; AVX512-FCP-NEXT: vpmovdw %zmm5, (%rdx)
-; AVX512-FCP-NEXT: vpmovdw %zmm1, 32(%rdx)
+; AVX512-FCP-NEXT: vpmovdw %zmm2, 64(%rsi)
+; AVX512-FCP-NEXT: vpmovdw %zmm7, 96(%rdx)
+; AVX512-FCP-NEXT: vpmovdw %zmm6, 64(%rdx)
+; AVX512-FCP-NEXT: vpmovdw %zmm5, 32(%rdx)
+; AVX512-FCP-NEXT: vpmovdw %zmm4, (%rdx)
; AVX512-FCP-NEXT: vzeroupper
; AVX512-FCP-NEXT: retq
;
@@ -1149,19 +1147,18 @@ define void @load_i16_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no
; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm1
; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm2
; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm3
-; AVX512DQ-NEXT: vpmovdw %zmm1, %ymm4
-; AVX512DQ-NEXT: vpsrld $16, %zmm1, %zmm1
-; AVX512DQ-NEXT: vpsrld $16, %zmm0, %zmm5
-; AVX512DQ-NEXT: vpsrld $16, %zmm3, %zmm6
-; AVX512DQ-NEXT: vpsrld $16, %zmm2, %zmm7
+; AVX512DQ-NEXT: vpsrld $16, %zmm0, %zmm4
+; AVX512DQ-NEXT: vpsrld $16, %zmm1, %zmm5
+; AVX512DQ-NEXT: vpsrld $16, %zmm2, %zmm6
+; AVX512DQ-NEXT: vpsrld $16, %zmm3, %zmm7
+; AVX512DQ-NEXT: vpmovdw %zmm1, 32(%rsi)
; AVX512DQ-NEXT: vpmovdw %zmm0, (%rsi)
-; AVX512DQ-NEXT: vmovdqa %ymm4, 32(%rsi)
-; AVX512DQ-NEXT: vpmovdw %zmm2, 64(%rsi)
; AVX512DQ-NEXT: vpmovdw %zmm3, 96(%rsi)
-; AVX512DQ-NEXT: vpmovdw %zmm7, 64(%rdx)
-; AVX512DQ-NEXT: vpmovdw %zmm6, 96(%rdx)
-; AVX512DQ-NEXT: vpmovdw %zmm5, (%rdx)
-; AVX512DQ-NEXT: vpmovdw %zmm1, 32(%rdx)
+; AVX512DQ-NEXT: vpmovdw %zmm2, 64(%rsi)
+; AVX512DQ-NEXT: vpmovdw %zmm7, 96(%rdx)
+; AVX512DQ-NEXT: vpmovdw %zmm6, 64(%rdx)
+; AVX512DQ-NEXT: vpmovdw %zmm5, 32(%rdx)
+; AVX512DQ-NEXT: vpmovdw %zmm4, (%rdx)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
@@ -1171,19 +1168,18 @@ define void @load_i16_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no
; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1
; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2
; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3
-; AVX512DQ-FCP-NEXT: vpmovdw %zmm1, %ymm4
-; AVX512DQ-FCP-NEXT: vpsrld $16, %zmm1, %zmm1
-; AVX512DQ-FCP-NEXT: vpsrld $16, %zmm0, %zmm5
-; AVX512DQ-FCP-NEXT: vpsrld $16, %zmm3, %zmm6
-; AVX512DQ-FCP-NEXT: vpsrld $16, %zmm2, %zmm7
+; AVX512DQ-FCP-NEXT: vpsrld $16, %zmm0, %zmm4
+; AVX512DQ-FCP-NEXT: vpsrld $16, %zmm1, %zmm5
+; AVX512DQ-FCP-NEXT: vpsrld $16, %zmm2, %zmm6
+; AVX512DQ-FCP-NEXT: vpsrld $16, %zmm3, %zmm7
+; AVX512DQ-FCP-NEXT: vpmovdw %zmm1, 32(%rsi)
; AVX512DQ-FCP-NEXT: vpmovdw %zmm0, (%rsi)
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, 32(%rsi)
-; AVX512DQ-FCP-NEXT: vpmovdw %zmm2, 64(%rsi)
; AVX512DQ-FCP-NEXT: vpmovdw %zmm3, 96(%rsi)
-; AVX512DQ-FCP-NEXT: vpmovdw %zmm7, 64(%rdx)
-; AVX512DQ-FCP-NEXT: vpmovdw %zmm6, 96(%rdx)
-; AVX512DQ-FCP-NEXT: vpmovdw %zmm5, (%rdx)
-; AVX512DQ-FCP-NEXT: vpmovdw %zmm1, 32(%rdx)
+; AVX512DQ-FCP-NEXT: vpmovdw %zmm2, 64(%rsi)
+; AVX512DQ-FCP-NEXT: vpmovdw %zmm7, 96(%rdx)
+; AVX512DQ-FCP-NEXT: vpmovdw %zmm6, 64(%rdx)
+; AVX512DQ-FCP-NEXT: vpmovdw %zmm5, 32(%rdx)
+; AVX512DQ-FCP-NEXT: vpmovdw %zmm4, (%rdx)
; AVX512DQ-FCP-NEXT: vzeroupper
; AVX512DQ-FCP-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-2.ll
index a034363895c0e..4a99ebecab5c8 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-2.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-2.ll
@@ -445,14 +445,14 @@ define void @store_i16_stride2_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v
; AVX512-NEXT: vmovdqa 16(%rsi), %xmm1
; AVX512-NEXT: vmovdqa (%rdi), %xmm2
; AVX512-NEXT: vmovdqa 16(%rdi), %xmm3
-; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
+; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
-; AVX512-NEXT: vmovdqa %xmm1, 32(%rdx)
-; AVX512-NEXT: vmovdqa %xmm2, 48(%rdx)
+; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; AVX512-NEXT: vmovdqa %xmm0, (%rdx)
-; AVX512-NEXT: vmovdqa %xmm4, 16(%rdx)
+; AVX512-NEXT: vmovdqa %xmm3, 16(%rdx)
+; AVX512-NEXT: vmovdqa %xmm1, 32(%rdx)
+; AVX512-NEXT: vmovdqa %xmm4, 48(%rdx)
; AVX512-NEXT: retq
;
; AVX512-FCP-LABEL: store_i16_stride2_vf16:
@@ -461,14 +461,14 @@ define void @store_i16_stride2_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v
; AVX512-FCP-NEXT: vmovdqa 16(%rsi), %xmm1
; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm2
; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm3
-; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
+; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
-; AVX512-FCP-NEXT: vmovdqa %xmm1, 32(%rdx)
-; AVX512-FCP-NEXT: vmovdqa %xmm2, 48(%rdx)
+; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; AVX512-FCP-NEXT: vmovdqa %xmm0, (%rdx)
-; AVX512-FCP-NEXT: vmovdqa %xmm4, 16(%rdx)
+; AVX512-FCP-NEXT: vmovdqa %xmm3, 16(%rdx)
+; AVX512-FCP-NEXT: vmovdqa %xmm1, 32(%rdx)
+; AVX512-FCP-NEXT: vmovdqa %xmm4, 48(%rdx)
; AVX512-FCP-NEXT: retq
;
; AVX512DQ-LABEL: store_i16_stride2_vf16:
@@ -477,14 +477,14 @@ define void @store_i16_stride2_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v
; AVX512DQ-NEXT: vmovdqa 16(%rsi), %xmm1
; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm2
; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm3
-; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
+; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
-; AVX512DQ-NEXT: vmovdqa %xmm1, 32(%rdx)
-; AVX512DQ-NEXT: vmovdqa %xmm2, 48(%rdx)
+; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; AVX512DQ-NEXT: vmovdqa %xmm0, (%rdx)
-; AVX512DQ-NEXT: vmovdqa %xmm4, 16(%rdx)
+; AVX512DQ-NEXT: vmovdqa %xmm3, 16(%rdx)
+; AVX512DQ-NEXT: vmovdqa %xmm1, 32(%rdx)
+; AVX512DQ-NEXT: vmovdqa %xmm4, 48(%rdx)
; AVX512DQ-NEXT: retq
;
; AVX512DQ-FCP-LABEL: store_i16_stride2_vf16:
@@ -493,14 +493,14 @@ define void @store_i16_stride2_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v
; AVX512DQ-FCP-NEXT: vmovdqa 16(%rsi), %xmm1
; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm2
; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm3
-; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
+; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, 32(%rdx)
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, 48(%rdx)
+; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, (%rdx)
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm4, 16(%rdx)
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, 16(%rdx)
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, 32(%rdx)
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm4, 48(%rdx)
; AVX512DQ-FCP-NEXT: retq
;
; AVX512BW-LABEL: store_i16_stride2_vf16:
@@ -684,22 +684,22 @@ define void @store_i16_stride2_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v
; AVX512-NEXT: vmovdqa 16(%rdi), %xmm5
; AVX512-NEXT: vmovdqa 32(%rdi), %xmm6
; AVX512-NEXT: vmovdqa 48(%rdi), %xmm7
-; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
-; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
+; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3]
-; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
+; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3]
+; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7]
+; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3]
-; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7]
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3]
-; AVX512-NEXT: vmovdqa %xmm3, 96(%rdx)
-; AVX512-NEXT: vmovdqa %xmm6, 112(%rdx)
; AVX512-NEXT: vmovdqa %xmm2, 64(%rdx)
-; AVX512-NEXT: vmovdqa %xmm5, 80(%rdx)
-; AVX512-NEXT: vmovdqa %xmm1, 32(%rdx)
-; AVX512-NEXT: vmovdqa %xmm4, 48(%rdx)
+; AVX512-NEXT: vmovdqa %xmm7, 80(%rdx)
+; AVX512-NEXT: vmovdqa %xmm3, 112(%rdx)
+; AVX512-NEXT: vmovdqa %xmm4, 96(%rdx)
; AVX512-NEXT: vmovdqa %xmm0, (%rdx)
-; AVX512-NEXT: vmovdqa %xmm8, 16(%rdx)
+; AVX512-NEXT: vmovdqa %xmm5, 16(%rdx)
+; AVX512-NEXT: vmovdqa %xmm1, 32(%rdx)
+; AVX512-NEXT: vmovdqa %xmm8, 48(%rdx)
; AVX512-NEXT: retq
;
; AVX512-FCP-LABEL: store_i16_stride2_vf32:
@@ -712,22 +712,22 @@ define void @store_i16_stride2_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v
; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm5
; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm6
; AVX512-FCP-NEXT: vmovdqa 48(%rdi), %xmm7
-; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
-; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
+; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3]
-; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
+; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3]
+; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7]
+; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3]
-; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7]
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3]
-; AVX512-FCP-NEXT: vmovdqa %xmm3, 96(%rdx)
-; AVX512-FCP-NEXT: vmovdqa %xmm6, 112(%rdx)
; AVX512-FCP-NEXT: vmovdqa %xmm2, 64(%rdx)
-; AVX512-FCP-NEXT: vmovdqa %xmm5, 80(%rdx)
-; AVX512-FCP-NEXT: vmovdqa %xmm1, 32(%rdx)
-; AVX512-FCP-NEXT: vmovdqa %xmm4, 48(%rdx)
+; AVX512-FCP-NEXT: vmovdqa %xmm7, 80(%rdx)
+; AVX512-FCP-NEXT: vmovdqa %xmm3, 112(%rdx)
+; AVX512-FCP-NEXT: vmovdqa %xmm4, 96(%rdx)
; AVX512-FCP-NEXT: vmovdqa %xmm0, (%rdx)
-; AVX512-FCP-NEXT: vmovdqa %xmm8, 16(%rdx)
+; AVX512-FCP-NEXT: vmovdqa %xmm5, 16(%rdx)
+; AVX512-FCP-NEXT: vmovdqa %xmm1, 32(%rdx)
+; AVX512-FCP-NEXT: vmovdqa %xmm8, 48(%rdx)
; AVX512-FCP-NEXT: retq
;
; AVX512DQ-LABEL: store_i16_stride2_vf32:
@@ -740,22 +740,22 @@ define void @store_i16_stride2_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v
; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm5
; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm6
; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm7
-; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
-; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
+; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3]
-; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
+; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3]
+; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7]
+; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3]
-; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7]
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3]
-; AVX512DQ-NEXT: vmovdqa %xmm3, 96(%rdx)
-; AVX512DQ-NEXT: vmovdqa %xmm6, 112(%rdx)
; AVX512DQ-NEXT: vmovdqa %xmm2, 64(%rdx)
-; AVX512DQ-NEXT: vmovdqa %xmm5, 80(%rdx)
-; AVX512DQ-NEXT: vmovdqa %xmm1, 32(%rdx)
-; AVX512DQ-NEXT: vmovdqa %xmm4, 48(%rdx)
+; AVX512DQ-NEXT: vmovdqa %xmm7, 80(%rdx)
+; AVX512DQ-NEXT: vmovdqa %xmm3, 112(%rdx)
+; AVX512DQ-NEXT: vmovdqa %xmm4, 96(%rdx)
; AVX512DQ-NEXT: vmovdqa %xmm0, (%rdx)
-; AVX512DQ-NEXT: vmovdqa %xmm8, 16(%rdx)
+; AVX512DQ-NEXT: vmovdqa %xmm5, 16(%rdx)
+; AVX512DQ-NEXT: vmovdqa %xmm1, 32(%rdx)
+; AVX512DQ-NEXT: vmovdqa %xmm8, 48(%rdx)
; AVX512DQ-NEXT: retq
;
; AVX512DQ-FCP-LABEL: store_i16_stride2_vf32:
@@ -768,22 +768,22 @@ define void @store_i16_stride2_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v
; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm5
; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm6
; AVX512DQ-FCP-NEXT: vmovdqa 48(%rdi), %xmm7
-; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
-; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
+; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3]
-; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
+; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3]
+; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7]
+; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3]
-; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7]
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3]
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, 96(%rdx)
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm6, 112(%rdx)
; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, 64(%rdx)
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm5, 80(%rdx)
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, 32(%rdx)
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm4, 48(%rdx)
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm7, 80(%rdx)
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, 112(%rdx)
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm4, 96(%rdx)
; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, (%rdx)
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm8, 16(%rdx)
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm5, 16(%rdx)
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, 32(%rdx)
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm8, 48(%rdx)
; AVX512DQ-FCP-NEXT: retq
;
; AVX512BW-LABEL: store_i16_stride2_vf32:
@@ -1075,214 +1075,214 @@ define void @store_i16_stride2_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v
;
; AVX512-LABEL: store_i16_stride2_vf64:
; AVX512: # %bb.0:
-; AVX512-NEXT: vmovdqa 64(%rsi), %xmm1
-; AVX512-NEXT: vmovdqa 64(%rdi), %xmm2
+; AVX512-NEXT: vmovdqa 80(%rsi), %xmm1
+; AVX512-NEXT: vmovdqa 80(%rdi), %xmm2
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; AVX512-NEXT: vmovdqa64 %xmm0, %xmm16
; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; AVX512-NEXT: vmovdqa 80(%rsi), %xmm3
-; AVX512-NEXT: vmovdqa 80(%rdi), %xmm4
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
-; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
-; AVX512-NEXT: vmovdqa 96(%rsi), %xmm5
-; AVX512-NEXT: vmovdqa 96(%rdi), %xmm6
+; AVX512-NEXT: vmovdqa 64(%rsi), %xmm3
+; AVX512-NEXT: vmovdqa 64(%rdi), %xmm4
+; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; AVX512-NEXT: vmovdqa 112(%rsi), %xmm5
+; AVX512-NEXT: vmovdqa 112(%rdi), %xmm6
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
-; AVX512-NEXT: vmovdqa 112(%rsi), %xmm6
-; AVX512-NEXT: vmovdqa 112(%rdi), %xmm7
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
-; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
+; AVX512-NEXT: vmovdqa 96(%rsi), %xmm6
+; AVX512-NEXT: vmovdqa 96(%rdi), %xmm7
+; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
; AVX512-NEXT: vmovdqa (%rsi), %xmm7
; AVX512-NEXT: vmovdqa 16(%rsi), %xmm9
; AVX512-NEXT: vmovdqa 32(%rsi), %xmm10
; AVX512-NEXT: vmovdqa 48(%rsi), %xmm11
-; AVX512-NEXT: vmovdqa (%rdi), %xmm12
+; AVX512-NEXT: vmovdqa 16(%rdi), %xmm12
; AVX512-NEXT: vmovdqa 32(%rdi), %xmm13
; AVX512-NEXT: vmovdqa 48(%rdi), %xmm14
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3]
-; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm13[4],xmm10[4],xmm13[5],xmm10[5],xmm13[6],xmm10[6],xmm13[7],xmm10[7]
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm14[0],xmm11[0],xmm14[1],xmm11[1],xmm14[2],xmm11[2],xmm14[3],xmm11[3]
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm14[0],xmm11[0],xmm14[1],xmm11[1],xmm14[2],xmm11[2],xmm14[3],xmm11[3]
; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm14[4],xmm11[4],xmm14[5],xmm11[5],xmm14[6],xmm11[6],xmm14[7],xmm11[7]
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm12[0],xmm7[0],xmm12[1],xmm7[1],xmm12[2],xmm7[2],xmm12[3],xmm7[3]
-; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm12[4],xmm7[4],xmm12[5],xmm7[5],xmm12[6],xmm7[6],xmm12[7],xmm7[7]
-; AVX512-NEXT: vmovdqa 16(%rdi), %xmm12
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3]
-; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm12[4],xmm9[4],xmm12[5],xmm9[5],xmm12[6],xmm9[6],xmm12[7],xmm9[7]
-; AVX512-NEXT: vmovdqa %xmm9, 48(%rdx)
-; AVX512-NEXT: vmovdqa %xmm0, 32(%rdx)
-; AVX512-NEXT: vmovdqa %xmm7, 16(%rdx)
-; AVX512-NEXT: vmovdqa %xmm14, (%rdx)
+; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm13[4],xmm10[4],xmm13[5],xmm10[5],xmm13[6],xmm10[6],xmm13[7],xmm10[7]
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3]
+; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm12[4],xmm9[4],xmm12[5],xmm9[5],xmm12[6],xmm9[6],xmm12[7],xmm9[7]
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3]
+; AVX512-NEXT: vmovdqa (%rdi), %xmm12
+; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm12[4],xmm7[4],xmm12[5],xmm7[5],xmm12[6],xmm7[6],xmm12[7],xmm7[7]
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm12[0],xmm7[0],xmm12[1],xmm7[1],xmm12[2],xmm7[2],xmm12[3],xmm7[3]
+; AVX512-NEXT: vmovdqa %xmm7, (%rdx)
+; AVX512-NEXT: vmovdqa %xmm0, 16(%rdx)
+; AVX512-NEXT: vmovdqa %xmm9, 32(%rdx)
+; AVX512-NEXT: vmovdqa %xmm13, 48(%rdx)
+; AVX512-NEXT: vmovdqa %xmm10, 64(%rdx)
+; AVX512-NEXT: vmovdqa %xmm14, 80(%rdx)
; AVX512-NEXT: vmovdqa %xmm11, 112(%rdx)
-; AVX512-NEXT: vmovdqa %xmm13, 96(%rdx)
-; AVX512-NEXT: vmovdqa %xmm10, 80(%rdx)
-; AVX512-NEXT: vmovdqa %xmm15, 64(%rdx)
-; AVX512-NEXT: vmovdqa %xmm6, 240(%rdx)
-; AVX512-NEXT: vmovdqa %xmm8, 224(%rdx)
-; AVX512-NEXT: vmovdqa %xmm5, 208(%rdx)
-; AVX512-NEXT: vmovdqa %xmm4, 192(%rdx)
-; AVX512-NEXT: vmovdqa %xmm3, 176(%rdx)
-; AVX512-NEXT: vmovdqa %xmm2, 160(%rdx)
-; AVX512-NEXT: vmovdqa %xmm1, 144(%rdx)
-; AVX512-NEXT: vmovdqa64 %xmm16, 128(%rdx)
+; AVX512-NEXT: vmovdqa %xmm15, 96(%rdx)
+; AVX512-NEXT: vmovdqa %xmm6, 192(%rdx)
+; AVX512-NEXT: vmovdqa %xmm8, 208(%rdx)
+; AVX512-NEXT: vmovdqa %xmm5, 240(%rdx)
+; AVX512-NEXT: vmovdqa %xmm4, 224(%rdx)
+; AVX512-NEXT: vmovdqa %xmm3, 128(%rdx)
+; AVX512-NEXT: vmovdqa %xmm2, 144(%rdx)
+; AVX512-NEXT: vmovdqa %xmm1, 176(%rdx)
+; AVX512-NEXT: vmovdqa64 %xmm16, 160(%rdx)
; AVX512-NEXT: retq
;
; AVX512-FCP-LABEL: store_i16_stride2_vf64:
; AVX512-FCP: # %bb.0:
-; AVX512-FCP-NEXT: vmovdqa 64(%rsi), %xmm1
-; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm2
+; AVX512-FCP-NEXT: vmovdqa 80(%rsi), %xmm1
+; AVX512-FCP-NEXT: vmovdqa 80(%rdi), %xmm2
; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; AVX512-FCP-NEXT: vmovdqa64 %xmm0, %xmm16
; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; AVX512-FCP-NEXT: vmovdqa 80(%rsi), %xmm3
-; AVX512-FCP-NEXT: vmovdqa 80(%rdi), %xmm4
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
-; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
-; AVX512-FCP-NEXT: vmovdqa 96(%rsi), %xmm5
-; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %xmm6
+; AVX512-FCP-NEXT: vmovdqa 64(%rsi), %xmm3
+; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm4
+; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; AVX512-FCP-NEXT: vmovdqa 112(%rsi), %xmm5
+; AVX512-FCP-NEXT: vmovdqa 112(%rdi), %xmm6
; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
-; AVX512-FCP-NEXT: vmovdqa 112(%rsi), %xmm6
-; AVX512-FCP-NEXT: vmovdqa 112(%rdi), %xmm7
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
-; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
+; AVX512-FCP-NEXT: vmovdqa 96(%rsi), %xmm6
+; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %xmm7
+; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm7
; AVX512-FCP-NEXT: vmovdqa 16(%rsi), %xmm9
; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %xmm10
; AVX512-FCP-NEXT: vmovdqa 48(%rsi), %xmm11
-; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm12
+; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm12
; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm13
; AVX512-FCP-NEXT: vmovdqa 48(%rdi), %xmm14
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3]
-; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm13[4],xmm10[4],xmm13[5],xmm10[5],xmm13[6],xmm10[6],xmm13[7],xmm10[7]
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm14[0],xmm11[0],xmm14[1],xmm11[1],xmm14[2],xmm11[2],xmm14[3],xmm11[3]
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm14[0],xmm11[0],xmm14[1],xmm11[1],xmm14[2],xmm11[2],xmm14[3],xmm11[3]
; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm14[4],xmm11[4],xmm14[5],xmm11[5],xmm14[6],xmm11[6],xmm14[7],xmm11[7]
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm12[0],xmm7[0],xmm12[1],xmm7[1],xmm12[2],xmm7[2],xmm12[3],xmm7[3]
-; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm12[4],xmm7[4],xmm12[5],xmm7[5],xmm12[6],xmm7[6],xmm12[7],xmm7[7]
-; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm12
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3]
-; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm12[4],xmm9[4],xmm12[5],xmm9[5],xmm12[6],xmm9[6],xmm12[7],xmm9[7]
-; AVX512-FCP-NEXT: vmovdqa %xmm9, 48(%rdx)
-; AVX512-FCP-NEXT: vmovdqa %xmm0, 32(%rdx)
-; AVX512-FCP-NEXT: vmovdqa %xmm7, 16(%rdx)
-; AVX512-FCP-NEXT: vmovdqa %xmm14, (%rdx)
+; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm13[4],xmm10[4],xmm13[5],xmm10[5],xmm13[6],xmm10[6],xmm13[7],xmm10[7]
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3]
+; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm12[4],xmm9[4],xmm12[5],xmm9[5],xmm12[6],xmm9[6],xmm12[7],xmm9[7]
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3]
+; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm12
+; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm12[4],xmm7[4],xmm12[5],xmm7[5],xmm12[6],xmm7[6],xmm12[7],xmm7[7]
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm12[0],xmm7[0],xmm12[1],xmm7[1],xmm12[2],xmm7[2],xmm12[3],xmm7[3]
+; AVX512-FCP-NEXT: vmovdqa %xmm7, (%rdx)
+; AVX512-FCP-NEXT: vmovdqa %xmm0, 16(%rdx)
+; AVX512-FCP-NEXT: vmovdqa %xmm9, 32(%rdx)
+; AVX512-FCP-NEXT: vmovdqa %xmm13, 48(%rdx)
+; AVX512-FCP-NEXT: vmovdqa %xmm10, 64(%rdx)
+; AVX512-FCP-NEXT: vmovdqa %xmm14, 80(%rdx)
; AVX512-FCP-NEXT: vmovdqa %xmm11, 112(%rdx)
-; AVX512-FCP-NEXT: vmovdqa %xmm13, 96(%rdx)
-; AVX512-FCP-NEXT: vmovdqa %xmm10, 80(%rdx)
-; AVX512-FCP-NEXT: vmovdqa %xmm15, 64(%rdx)
-; AVX512-FCP-NEXT: vmovdqa %xmm6, 240(%rdx)
-; AVX512-FCP-NEXT: vmovdqa %xmm8, 224(%rdx)
-; AVX512-FCP-NEXT: vmovdqa %xmm5, 208(%rdx)
-; AVX512-FCP-NEXT: vmovdqa %xmm4, 192(%rdx)
-; AVX512-FCP-NEXT: vmovdqa %xmm3, 176(%rdx)
-; AVX512-FCP-NEXT: vmovdqa %xmm2, 160(%rdx)
-; AVX512-FCP-NEXT: vmovdqa %xmm1, 144(%rdx)
-; AVX512-FCP-NEXT: vmovdqa64 %xmm16, 128(%rdx)
+; AVX512-FCP-NEXT: vmovdqa %xmm15, 96(%rdx)
+; AVX512-FCP-NEXT: vmovdqa %xmm6, 192(%rdx)
+; AVX512-FCP-NEXT: vmovdqa %xmm8, 208(%rdx)
+; AVX512-FCP-NEXT: vmovdqa %xmm5, 240(%rdx)
+; AVX512-FCP-NEXT: vmovdqa %xmm4, 224(%rdx)
+; AVX512-FCP-NEXT: vmovdqa %xmm3, 128(%rdx)
+; AVX512-FCP-NEXT: vmovdqa %xmm2, 144(%rdx)
+; AVX512-FCP-NEXT: vmovdqa %xmm1, 176(%rdx)
+; AVX512-FCP-NEXT: vmovdqa64 %xmm16, 160(%rdx)
; AVX512-FCP-NEXT: retq
;
; AVX512DQ-LABEL: store_i16_stride2_vf64:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vmovdqa 64(%rsi), %xmm1
-; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm2
+; AVX512DQ-NEXT: vmovdqa 80(%rsi), %xmm1
+; AVX512DQ-NEXT: vmovdqa 80(%rdi), %xmm2
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; AVX512DQ-NEXT: vmovdqa64 %xmm0, %xmm16
; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; AVX512DQ-NEXT: vmovdqa 80(%rsi), %xmm3
-; AVX512DQ-NEXT: vmovdqa 80(%rdi), %xmm4
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
-; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
-; AVX512DQ-NEXT: vmovdqa 96(%rsi), %xmm5
-; AVX512DQ-NEXT: vmovdqa 96(%rdi), %xmm6
+; AVX512DQ-NEXT: vmovdqa 64(%rsi), %xmm3
+; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm4
+; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; AVX512DQ-NEXT: vmovdqa 112(%rsi), %xmm5
+; AVX512DQ-NEXT: vmovdqa 112(%rdi), %xmm6
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
-; AVX512DQ-NEXT: vmovdqa 112(%rsi), %xmm6
-; AVX512DQ-NEXT: vmovdqa 112(%rdi), %xmm7
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
-; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
+; AVX512DQ-NEXT: vmovdqa 96(%rsi), %xmm6
+; AVX512DQ-NEXT: vmovdqa 96(%rdi), %xmm7
+; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm7
; AVX512DQ-NEXT: vmovdqa 16(%rsi), %xmm9
; AVX512DQ-NEXT: vmovdqa 32(%rsi), %xmm10
; AVX512DQ-NEXT: vmovdqa 48(%rsi), %xmm11
-; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm12
+; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm12
; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm13
; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm14
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3]
-; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm13[4],xmm10[4],xmm13[5],xmm10[5],xmm13[6],xmm10[6],xmm13[7],xmm10[7]
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm14[0],xmm11[0],xmm14[1],xmm11[1],xmm14[2],xmm11[2],xmm14[3],xmm11[3]
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm14[0],xmm11[0],xmm14[1],xmm11[1],xmm14[2],xmm11[2],xmm14[3],xmm11[3]
; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm14[4],xmm11[4],xmm14[5],xmm11[5],xmm14[6],xmm11[6],xmm14[7],xmm11[7]
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm12[0],xmm7[0],xmm12[1],xmm7[1],xmm12[2],xmm7[2],xmm12[3],xmm7[3]
-; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm12[4],xmm7[4],xmm12[5],xmm7[5],xmm12[6],xmm7[6],xmm12[7],xmm7[7]
-; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm12
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3]
-; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm12[4],xmm9[4],xmm12[5],xmm9[5],xmm12[6],xmm9[6],xmm12[7],xmm9[7]
-; AVX512DQ-NEXT: vmovdqa %xmm9, 48(%rdx)
-; AVX512DQ-NEXT: vmovdqa %xmm0, 32(%rdx)
-; AVX512DQ-NEXT: vmovdqa %xmm7, 16(%rdx)
-; AVX512DQ-NEXT: vmovdqa %xmm14, (%rdx)
+; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm13[4],xmm10[4],xmm13[5],xmm10[5],xmm13[6],xmm10[6],xmm13[7],xmm10[7]
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3]
+; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm12[4],xmm9[4],xmm12[5],xmm9[5],xmm12[6],xmm9[6],xmm12[7],xmm9[7]
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3]
+; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm12
+; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm12[4],xmm7[4],xmm12[5],xmm7[5],xmm12[6],xmm7[6],xmm12[7],xmm7[7]
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm12[0],xmm7[0],xmm12[1],xmm7[1],xmm12[2],xmm7[2],xmm12[3],xmm7[3]
+; AVX512DQ-NEXT: vmovdqa %xmm7, (%rdx)
+; AVX512DQ-NEXT: vmovdqa %xmm0, 16(%rdx)
+; AVX512DQ-NEXT: vmovdqa %xmm9, 32(%rdx)
+; AVX512DQ-NEXT: vmovdqa %xmm13, 48(%rdx)
+; AVX512DQ-NEXT: vmovdqa %xmm10, 64(%rdx)
+; AVX512DQ-NEXT: vmovdqa %xmm14, 80(%rdx)
; AVX512DQ-NEXT: vmovdqa %xmm11, 112(%rdx)
-; AVX512DQ-NEXT: vmovdqa %xmm13, 96(%rdx)
-; AVX512DQ-NEXT: vmovdqa %xmm10, 80(%rdx)
-; AVX512DQ-NEXT: vmovdqa %xmm15, 64(%rdx)
-; AVX512DQ-NEXT: vmovdqa %xmm6, 240(%rdx)
-; AVX512DQ-NEXT: vmovdqa %xmm8, 224(%rdx)
-; AVX512DQ-NEXT: vmovdqa %xmm5, 208(%rdx)
-; AVX512DQ-NEXT: vmovdqa %xmm4, 192(%rdx)
-; AVX512DQ-NEXT: vmovdqa %xmm3, 176(%rdx)
-; AVX512DQ-NEXT: vmovdqa %xmm2, 160(%rdx)
-; AVX512DQ-NEXT: vmovdqa %xmm1, 144(%rdx)
-; AVX512DQ-NEXT: vmovdqa64 %xmm16, 128(%rdx)
+; AVX512DQ-NEXT: vmovdqa %xmm15, 96(%rdx)
+; AVX512DQ-NEXT: vmovdqa %xmm6, 192(%rdx)
+; AVX512DQ-NEXT: vmovdqa %xmm8, 208(%rdx)
+; AVX512DQ-NEXT: vmovdqa %xmm5, 240(%rdx)
+; AVX512DQ-NEXT: vmovdqa %xmm4, 224(%rdx)
+; AVX512DQ-NEXT: vmovdqa %xmm3, 128(%rdx)
+; AVX512DQ-NEXT: vmovdqa %xmm2, 144(%rdx)
+; AVX512DQ-NEXT: vmovdqa %xmm1, 176(%rdx)
+; AVX512DQ-NEXT: vmovdqa64 %xmm16, 160(%rdx)
; AVX512DQ-NEXT: retq
;
; AVX512DQ-FCP-LABEL: store_i16_stride2_vf64:
; AVX512DQ-FCP: # %bb.0:
-; AVX512DQ-FCP-NEXT: vmovdqa 64(%rsi), %xmm1
-; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm2
+; AVX512DQ-FCP-NEXT: vmovdqa 80(%rsi), %xmm1
+; AVX512DQ-FCP-NEXT: vmovdqa 80(%rdi), %xmm2
; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm0, %xmm16
; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; AVX512DQ-FCP-NEXT: vmovdqa 80(%rsi), %xmm3
-; AVX512DQ-FCP-NEXT: vmovdqa 80(%rdi), %xmm4
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
-; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
-; AVX512DQ-FCP-NEXT: vmovdqa 96(%rsi), %xmm5
-; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %xmm6
+; AVX512DQ-FCP-NEXT: vmovdqa 64(%rsi), %xmm3
+; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm4
+; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; AVX512DQ-FCP-NEXT: vmovdqa 112(%rsi), %xmm5
+; AVX512DQ-FCP-NEXT: vmovdqa 112(%rdi), %xmm6
; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
-; AVX512DQ-FCP-NEXT: vmovdqa 112(%rsi), %xmm6
-; AVX512DQ-FCP-NEXT: vmovdqa 112(%rdi), %xmm7
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
-; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
+; AVX512DQ-FCP-NEXT: vmovdqa 96(%rsi), %xmm6
+; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %xmm7
+; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm7
; AVX512DQ-FCP-NEXT: vmovdqa 16(%rsi), %xmm9
; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %xmm10
; AVX512DQ-FCP-NEXT: vmovdqa 48(%rsi), %xmm11
-; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm12
+; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm12
; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm13
; AVX512DQ-FCP-NEXT: vmovdqa 48(%rdi), %xmm14
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3]
-; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm13[4],xmm10[4],xmm13[5],xmm10[5],xmm13[6],xmm10[6],xmm13[7],xmm10[7]
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm14[0],xmm11[0],xmm14[1],xmm11[1],xmm14[2],xmm11[2],xmm14[3],xmm11[3]
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm14[0],xmm11[0],xmm14[1],xmm11[1],xmm14[2],xmm11[2],xmm14[3],xmm11[3]
; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm14[4],xmm11[4],xmm14[5],xmm11[5],xmm14[6],xmm11[6],xmm14[7],xmm11[7]
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm12[0],xmm7[0],xmm12[1],xmm7[1],xmm12[2],xmm7[2],xmm12[3],xmm7[3]
-; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm12[4],xmm7[4],xmm12[5],xmm7[5],xmm12[6],xmm7[6],xmm12[7],xmm7[7]
-; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm12
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3]
-; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm12[4],xmm9[4],xmm12[5],xmm9[5],xmm12[6],xmm9[6],xmm12[7],xmm9[7]
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm9, 48(%rdx)
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, 32(%rdx)
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm7, 16(%rdx)
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm14, (%rdx)
+; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm13[4],xmm10[4],xmm13[5],xmm10[5],xmm13[6],xmm10[6],xmm13[7],xmm10[7]
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3]
+; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm12[4],xmm9[4],xmm12[5],xmm9[5],xmm12[6],xmm9[6],xmm12[7],xmm9[7]
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3]
+; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm12
+; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm12[4],xmm7[4],xmm12[5],xmm7[5],xmm12[6],xmm7[6],xmm12[7],xmm7[7]
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm12[0],xmm7[0],xmm12[1],xmm7[1],xmm12[2],xmm7[2],xmm12[3],xmm7[3]
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm7, (%rdx)
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, 16(%rdx)
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm9, 32(%rdx)
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm13, 48(%rdx)
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm10, 64(%rdx)
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm14, 80(%rdx)
; AVX512DQ-FCP-NEXT: vmovdqa %xmm11, 112(%rdx)
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm13, 96(%rdx)
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm10, 80(%rdx)
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm15, 64(%rdx)
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm6, 240(%rdx)
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm8, 224(%rdx)
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm5, 208(%rdx)
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm4, 192(%rdx)
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, 176(%rdx)
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, 160(%rdx)
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, 144(%rdx)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm16, 128(%rdx)
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm15, 96(%rdx)
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm6, 192(%rdx)
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm8, 208(%rdx)
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm5, 240(%rdx)
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm4, 224(%rdx)
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, 128(%rdx)
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, 144(%rdx)
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, 176(%rdx)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm16, 160(%rdx)
; AVX512DQ-FCP-NEXT: retq
;
; AVX512BW-LABEL: store_i16_stride2_vf64:
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-3.ll
index 39230b67d380f..7303f6124afcb 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-3.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-3.ll
@@ -246,8 +246,8 @@ define void @store_i32_stride3_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm0[1],xmm1[1]
; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm1[1,1],xmm3[0,2]
; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0],xmm2[1],xmm3[2,3]
-; AVX-NEXT: vmovlhps {{.*#+}} xmm4 = xmm0[0],xmm1[0]
-; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,2],xmm0[2,1]
+; AVX-NEXT: vmovlhps {{.*#+}} xmm4 = xmm1[0],xmm0[0]
+; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm4[2,0],xmm0[2,1]
; AVX-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1],xmm2[0],xmm4[3]
; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3]
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-4.ll
index c15eff9141fff..052dc16e7cb1f 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-4.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-4.ll
@@ -254,9 +254,9 @@ define void @store_i32_stride4_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX-NEXT: vinsertps {{.*#+}} xmm4 = xmm0[1],xmm1[1],zero,zero
; AVX-NEXT: vunpcklps {{.*#+}} xmm5 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; AVX-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3]
-; AVX-NEXT: vmovlhps {{.*#+}} xmm5 = xmm2[0],xmm3[0]
+; AVX-NEXT: vmovlhps {{.*#+}} xmm5 = xmm3[0],xmm2[0]
; AVX-NEXT: vunpcklps {{.*#+}} xmm6 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX-NEXT: vshufps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0,2]
+; AVX-NEXT: vshufps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,0]
; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4
; AVX-NEXT: vunpckhps {{.*#+}} xmm5 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX-NEXT: vshufps {{.*#+}} xmm6 = xmm1[3,0],xmm0[3,0]
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll
index 0fba7de803488..407b7313f05fe 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll
@@ -339,9 +339,9 @@ define void @store_i32_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX-NEXT: vmovaps (%rdx), %xmm2
; AVX-NEXT: vmovaps (%rcx), %xmm3
; AVX-NEXT: vmovaps (%r8), %xmm4
-; AVX-NEXT: vmovlhps {{.*#+}} xmm5 = xmm2[0],xmm3[0]
+; AVX-NEXT: vmovlhps {{.*#+}} xmm5 = xmm3[0],xmm2[0]
; AVX-NEXT: vunpcklps {{.*#+}} xmm6 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX-NEXT: vshufps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0,2]
+; AVX-NEXT: vshufps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,0]
; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm6
; AVX-NEXT: vinsertps {{.*#+}} xmm7 = zero,xmm0[1],xmm1[1],zero
; AVX-NEXT: vinsertps {{.*#+}} xmm7 = zero,xmm7[1,2],xmm2[1]
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll
index bead2c94cf121..c34b5d2ed8c7c 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll
@@ -469,10 +469,10 @@ define void @store_i32_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX-NEXT: vmovaps (%r8), %xmm2
; AVX-NEXT: vmovaps (%r9), %xmm6
; AVX-NEXT: vmovaps (%r10), %xmm1
-; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm2, %ymm7
+; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm7
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm8
-; AVX-NEXT: vshufps {{.*#+}} ymm9 = ymm8[1,0],ymm7[1,0],ymm8[5,4],ymm7[5,4]
-; AVX-NEXT: vshufps {{.*#+}} ymm7 = ymm9[2,0],ymm7[2,1],ymm9[6,4],ymm7[6,5]
+; AVX-NEXT: vshufps {{.*#+}} ymm7 = ymm8[1,0],ymm7[1,0],ymm8[5,4],ymm7[5,4]
+; AVX-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,0],ymm2[2,1],ymm7[6,4],ymm2[6,5]
; AVX-NEXT: vinsertps {{.*#+}} xmm9 = zero,zero,xmm3[2],xmm4[2]
; AVX-NEXT: vinsertps {{.*#+}} xmm10 = zero,xmm5[1],xmm0[1],zero
; AVX-NEXT: vinsertps {{.*#+}} xmm10 = xmm4[1],xmm10[1,2],zero
@@ -485,9 +485,9 @@ define void @store_i32_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX-NEXT: vunpckhps {{.*#+}} xmm10 = xmm5[2],xmm0[2],xmm5[3],xmm0[3]
; AVX-NEXT: vinsertf128 $1, %xmm9, %ymm10, %ymm9
; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3,4],ymm9[5,6,7]
-; AVX-NEXT: vmovlhps {{.*#+}} xmm5 = xmm5[0],xmm0[0]
+; AVX-NEXT: vmovlhps {{.*#+}} xmm5 = xmm0[0],xmm5[0]
; AVX-NEXT: vunpcklps {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
-; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1],xmm5[0,2]
+; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1],xmm5[2,0]
; AVX-NEXT: vbroadcastss 4(%rdi), %xmm4
; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
; AVX-NEXT: vunpcklps {{.*#+}} xmm4 = xmm2[0],xmm6[0],xmm2[1],xmm6[1]
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-8.ll
index cf246e4ede089..cac06cfa74cfc 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-8.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-8.ll
@@ -460,12 +460,12 @@ define void @store_i32_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0,1],xmm12[2,3]
; AVX-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
; AVX-NEXT: vunpckhps {{.*#+}} xmm6 = xmm6[2],xmm7[2],xmm6[3],xmm7[3]
-; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm4[3,0],xmm5[3,0]
-; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,2],xmm6[2,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm5[3,0],xmm4[3,0]
+; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm4[2,0],xmm6[2,3]
; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4
; AVX-NEXT: vunpckhps {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[3,0]
-; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[2,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[3,0],xmm0[3,0]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[2,3]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7]
; AVX-NEXT: vmovaps %ymm0, 96(%rax)
; AVX-NEXT: vmovaps %ymm10, 64(%rax)
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-2.ll
index 53a6d306ef84d..30be6c88514bf 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-2.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-2.ll
@@ -544,14 +544,14 @@ define void @store_i8_stride2_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.ve
; AVX512-NEXT: vmovdqa 16(%rsi), %xmm1
; AVX512-NEXT: vmovdqa (%rdi), %xmm2
; AVX512-NEXT: vmovdqa 16(%rdi), %xmm3
-; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
-; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
+; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
-; AVX512-NEXT: vmovdqa %xmm1, 32(%rdx)
-; AVX512-NEXT: vmovdqa %xmm2, 48(%rdx)
+; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
+; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
; AVX512-NEXT: vmovdqa %xmm0, (%rdx)
-; AVX512-NEXT: vmovdqa %xmm4, 16(%rdx)
+; AVX512-NEXT: vmovdqa %xmm3, 16(%rdx)
+; AVX512-NEXT: vmovdqa %xmm1, 32(%rdx)
+; AVX512-NEXT: vmovdqa %xmm4, 48(%rdx)
; AVX512-NEXT: retq
;
; AVX512-FCP-LABEL: store_i8_stride2_vf32:
@@ -560,14 +560,14 @@ define void @store_i8_stride2_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.ve
; AVX512-FCP-NEXT: vmovdqa 16(%rsi), %xmm1
; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm2
; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm3
-; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
-; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
+; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
-; AVX512-FCP-NEXT: vmovdqa %xmm1, 32(%rdx)
-; AVX512-FCP-NEXT: vmovdqa %xmm2, 48(%rdx)
+; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
+; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
; AVX512-FCP-NEXT: vmovdqa %xmm0, (%rdx)
-; AVX512-FCP-NEXT: vmovdqa %xmm4, 16(%rdx)
+; AVX512-FCP-NEXT: vmovdqa %xmm3, 16(%rdx)
+; AVX512-FCP-NEXT: vmovdqa %xmm1, 32(%rdx)
+; AVX512-FCP-NEXT: vmovdqa %xmm4, 48(%rdx)
; AVX512-FCP-NEXT: retq
;
; AVX512DQ-LABEL: store_i8_stride2_vf32:
@@ -576,14 +576,14 @@ define void @store_i8_stride2_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.ve
; AVX512DQ-NEXT: vmovdqa 16(%rsi), %xmm1
; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm2
; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm3
-; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
-; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
+; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
-; AVX512DQ-NEXT: vmovdqa %xmm1, 32(%rdx)
-; AVX512DQ-NEXT: vmovdqa %xmm2, 48(%rdx)
+; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
+; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
; AVX512DQ-NEXT: vmovdqa %xmm0, (%rdx)
-; AVX512DQ-NEXT: vmovdqa %xmm4, 16(%rdx)
+; AVX512DQ-NEXT: vmovdqa %xmm3, 16(%rdx)
+; AVX512DQ-NEXT: vmovdqa %xmm1, 32(%rdx)
+; AVX512DQ-NEXT: vmovdqa %xmm4, 48(%rdx)
; AVX512DQ-NEXT: retq
;
; AVX512DQ-FCP-LABEL: store_i8_stride2_vf32:
@@ -592,14 +592,14 @@ define void @store_i8_stride2_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.ve
; AVX512DQ-FCP-NEXT: vmovdqa 16(%rsi), %xmm1
; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm2
; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm3
-; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
-; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
+; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, 32(%rdx)
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, 48(%rdx)
+; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
+; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, (%rdx)
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm4, 16(%rdx)
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, 16(%rdx)
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, 32(%rdx)
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm4, 48(%rdx)
; AVX512DQ-FCP-NEXT: retq
;
; AVX512BW-LABEL: store_i8_stride2_vf32:
@@ -787,22 +787,22 @@ define void @store_i8_stride2_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.ve
; AVX512-NEXT: vmovdqa 16(%rdi), %xmm5
; AVX512-NEXT: vmovdqa 32(%rdi), %xmm6
; AVX512-NEXT: vmovdqa 48(%rdi), %xmm7
-; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15]
-; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
-; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm5[8],xmm1[8],xmm5[9],xmm1[9],xmm5[10],xmm1[10],xmm5[11],xmm1[11],xmm5[12],xmm1[12],xmm5[13],xmm1[13],xmm5[14],xmm1[14],xmm5[15],xmm1[15]
+; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm5[8],xmm1[8],xmm5[9],xmm1[9],xmm5[10],xmm1[10],xmm5[11],xmm1[11],xmm5[12],xmm1[12],xmm5[13],xmm1[13],xmm5[14],xmm1[14],xmm5[15],xmm1[15]
; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
-; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm6[8],xmm2[8],xmm6[9],xmm2[9],xmm6[10],xmm2[10],xmm6[11],xmm2[11],xmm6[12],xmm2[12],xmm6[13],xmm2[13],xmm6[14],xmm2[14],xmm6[15],xmm2[15]
+; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15]
+; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
+; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7]
+; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm7[8],xmm3[8],xmm7[9],xmm3[9],xmm7[10],xmm3[10],xmm7[11],xmm3[11],xmm7[12],xmm3[12],xmm7[13],xmm3[13],xmm7[14],xmm3[14],xmm7[15],xmm3[15]
+; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm6[8],xmm2[8],xmm6[9],xmm2[9],xmm6[10],xmm2[10],xmm6[11],xmm2[11],xmm6[12],xmm2[12],xmm6[13],xmm2[13],xmm6[14],xmm2[14],xmm6[15],xmm2[15]
; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
-; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm7[8],xmm3[8],xmm7[9],xmm3[9],xmm7[10],xmm3[10],xmm7[11],xmm3[11],xmm7[12],xmm3[12],xmm7[13],xmm3[13],xmm7[14],xmm3[14],xmm7[15],xmm3[15]
-; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7]
-; AVX512-NEXT: vmovdqa %xmm3, 96(%rdx)
-; AVX512-NEXT: vmovdqa %xmm6, 112(%rdx)
; AVX512-NEXT: vmovdqa %xmm2, 64(%rdx)
-; AVX512-NEXT: vmovdqa %xmm5, 80(%rdx)
-; AVX512-NEXT: vmovdqa %xmm1, 32(%rdx)
-; AVX512-NEXT: vmovdqa %xmm4, 48(%rdx)
+; AVX512-NEXT: vmovdqa %xmm7, 80(%rdx)
+; AVX512-NEXT: vmovdqa %xmm3, 112(%rdx)
+; AVX512-NEXT: vmovdqa %xmm4, 96(%rdx)
; AVX512-NEXT: vmovdqa %xmm0, (%rdx)
-; AVX512-NEXT: vmovdqa %xmm8, 16(%rdx)
+; AVX512-NEXT: vmovdqa %xmm5, 16(%rdx)
+; AVX512-NEXT: vmovdqa %xmm1, 32(%rdx)
+; AVX512-NEXT: vmovdqa %xmm8, 48(%rdx)
; AVX512-NEXT: retq
;
; AVX512-FCP-LABEL: store_i8_stride2_vf64:
@@ -815,22 +815,22 @@ define void @store_i8_stride2_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.ve
; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm5
; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm6
; AVX512-FCP-NEXT: vmovdqa 48(%rdi), %xmm7
-; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15]
-; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
-; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm5[8],xmm1[8],xmm5[9],xmm1[9],xmm5[10],xmm1[10],xmm5[11],xmm1[11],xmm5[12],xmm1[12],xmm5[13],xmm1[13],xmm5[14],xmm1[14],xmm5[15],xmm1[15]
+; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm5[8],xmm1[8],xmm5[9],xmm1[9],xmm5[10],xmm1[10],xmm5[11],xmm1[11],xmm5[12],xmm1[12],xmm5[13],xmm1[13],xmm5[14],xmm1[14],xmm5[15],xmm1[15]
; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
-; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm6[8],xmm2[8],xmm6[9],xmm2[9],xmm6[10],xmm2[10],xmm6[11],xmm2[11],xmm6[12],xmm2[12],xmm6[13],xmm2[13],xmm6[14],xmm2[14],xmm6[15],xmm2[15]
+; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15]
+; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
+; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7]
+; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm7[8],xmm3[8],xmm7[9],xmm3[9],xmm7[10],xmm3[10],xmm7[11],xmm3[11],xmm7[12],xmm3[12],xmm7[13],xmm3[13],xmm7[14],xmm3[14],xmm7[15],xmm3[15]
+; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm6[8],xmm2[8],xmm6[9],xmm2[9],xmm6[10],xmm2[10],xmm6[11],xmm2[11],xmm6[12],xmm2[12],xmm6[13],xmm2[13],xmm6[14],xmm2[14],xmm6[15],xmm2[15]
; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
-; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm7[8],xmm3[8],xmm7[9],xmm3[9],xmm7[10],xmm3[10],xmm7[11],xmm3[11],xmm7[12],xmm3[12],xmm7[13],xmm3[13],xmm7[14],xmm3[14],xmm7[15],xmm3[15]
-; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7]
-; AVX512-FCP-NEXT: vmovdqa %xmm3, 96(%rdx)
-; AVX512-FCP-NEXT: vmovdqa %xmm6, 112(%rdx)
; AVX512-FCP-NEXT: vmovdqa %xmm2, 64(%rdx)
-; AVX512-FCP-NEXT: vmovdqa %xmm5, 80(%rdx)
-; AVX512-FCP-NEXT: vmovdqa %xmm1, 32(%rdx)
-; AVX512-FCP-NEXT: vmovdqa %xmm4, 48(%rdx)
+; AVX512-FCP-NEXT: vmovdqa %xmm7, 80(%rdx)
+; AVX512-FCP-NEXT: vmovdqa %xmm3, 112(%rdx)
+; AVX512-FCP-NEXT: vmovdqa %xmm4, 96(%rdx)
; AVX512-FCP-NEXT: vmovdqa %xmm0, (%rdx)
-; AVX512-FCP-NEXT: vmovdqa %xmm8, 16(%rdx)
+; AVX512-FCP-NEXT: vmovdqa %xmm5, 16(%rdx)
+; AVX512-FCP-NEXT: vmovdqa %xmm1, 32(%rdx)
+; AVX512-FCP-NEXT: vmovdqa %xmm8, 48(%rdx)
; AVX512-FCP-NEXT: retq
;
; AVX512DQ-LABEL: store_i8_stride2_vf64:
@@ -843,22 +843,22 @@ define void @store_i8_stride2_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.ve
; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm5
; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm6
; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm7
-; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15]
-; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
-; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm5[8],xmm1[8],xmm5[9],xmm1[9],xmm5[10],xmm1[10],xmm5[11],xmm1[11],xmm5[12],xmm1[12],xmm5[13],xmm1[13],xmm5[14],xmm1[14],xmm5[15],xmm1[15]
+; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm5[8],xmm1[8],xmm5[9],xmm1[9],xmm5[10],xmm1[10],xmm5[11],xmm1[11],xmm5[12],xmm1[12],xmm5[13],xmm1[13],xmm5[14],xmm1[14],xmm5[15],xmm1[15]
; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
-; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm6[8],xmm2[8],xmm6[9],xmm2[9],xmm6[10],xmm2[10],xmm6[11],xmm2[11],xmm6[12],xmm2[12],xmm6[13],xmm2[13],xmm6[14],xmm2[14],xmm6[15],xmm2[15]
+; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15]
+; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
+; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7]
+; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm7[8],xmm3[8],xmm7[9],xmm3[9],xmm7[10],xmm3[10],xmm7[11],xmm3[11],xmm7[12],xmm3[12],xmm7[13],xmm3[13],xmm7[14],xmm3[14],xmm7[15],xmm3[15]
+; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm6[8],xmm2[8],xmm6[9],xmm2[9],xmm6[10],xmm2[10],xmm6[11],xmm2[11],xmm6[12],xmm2[12],xmm6[13],xmm2[13],xmm6[14],xmm2[14],xmm6[15],xmm2[15]
; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
-; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm7[8],xmm3[8],xmm7[9],xmm3[9],xmm7[10],xmm3[10],xmm7[11],xmm3[11],xmm7[12],xmm3[12],xmm7[13],xmm3[13],xmm7[14],xmm3[14],xmm7[15],xmm3[15]
-; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7]
-; AVX512DQ-NEXT: vmovdqa %xmm3, 96(%rdx)
-; AVX512DQ-NEXT: vmovdqa %xmm6, 112(%rdx)
; AVX512DQ-NEXT: vmovdqa %xmm2, 64(%rdx)
-; AVX512DQ-NEXT: vmovdqa %xmm5, 80(%rdx)
-; AVX512DQ-NEXT: vmovdqa %xmm1, 32(%rdx)
-; AVX512DQ-NEXT: vmovdqa %xmm4, 48(%rdx)
+; AVX512DQ-NEXT: vmovdqa %xmm7, 80(%rdx)
+; AVX512DQ-NEXT: vmovdqa %xmm3, 112(%rdx)
+; AVX512DQ-NEXT: vmovdqa %xmm4, 96(%rdx)
; AVX512DQ-NEXT: vmovdqa %xmm0, (%rdx)
-; AVX512DQ-NEXT: vmovdqa %xmm8, 16(%rdx)
+; AVX512DQ-NEXT: vmovdqa %xmm5, 16(%rdx)
+; AVX512DQ-NEXT: vmovdqa %xmm1, 32(%rdx)
+; AVX512DQ-NEXT: vmovdqa %xmm8, 48(%rdx)
; AVX512DQ-NEXT: retq
;
; AVX512DQ-FCP-LABEL: store_i8_stride2_vf64:
@@ -871,22 +871,22 @@ define void @store_i8_stride2_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.ve
; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm5
; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm6
; AVX512DQ-FCP-NEXT: vmovdqa 48(%rdi), %xmm7
-; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15]
-; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
-; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm5[8],xmm1[8],xmm5[9],xmm1[9],xmm5[10],xmm1[10],xmm5[11],xmm1[11],xmm5[12],xmm1[12],xmm5[13],xmm1[13],xmm5[14],xmm1[14],xmm5[15],xmm1[15]
+; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm5[8],xmm1[8],xmm5[9],xmm1[9],xmm5[10],xmm1[10],xmm5[11],xmm1[11],xmm5[12],xmm1[12],xmm5[13],xmm1[13],xmm5[14],xmm1[14],xmm5[15],xmm1[15]
; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
-; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm6[8],xmm2[8],xmm6[9],xmm2[9],xmm6[10],xmm2[10],xmm6[11],xmm2[11],xmm6[12],xmm2[12],xmm6[13],xmm2[13],xmm6[14],xmm2[14],xmm6[15],xmm2[15]
+; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15]
+; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
+; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7]
+; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm7[8],xmm3[8],xmm7[9],xmm3[9],xmm7[10],xmm3[10],xmm7[11],xmm3[11],xmm7[12],xmm3[12],xmm7[13],xmm3[13],xmm7[14],xmm3[14],xmm7[15],xmm3[15]
+; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm6[8],xmm2[8],xmm6[9],xmm2[9],xmm6[10],xmm2[10],xmm6[11],xmm2[11],xmm6[12],xmm2[12],xmm6[13],xmm2[13],xmm6[14],xmm2[14],xmm6[15],xmm2[15]
; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
-; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm7[8],xmm3[8],xmm7[9],xmm3[9],xmm7[10],xmm3[10],xmm7[11],xmm3[11],xmm7[12],xmm3[12],xmm7[13],xmm3[13],xmm7[14],xmm3[14],xmm7[15],xmm3[15]
-; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7]
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, 96(%rdx)
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm6, 112(%rdx)
; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, 64(%rdx)
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm5, 80(%rdx)
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, 32(%rdx)
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm4, 48(%rdx)
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm7, 80(%rdx)
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, 112(%rdx)
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm4, 96(%rdx)
; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, (%rdx)
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm8, 16(%rdx)
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm5, 16(%rdx)
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, 32(%rdx)
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm8, 48(%rdx)
; AVX512DQ-FCP-NEXT: retq
;
; AVX512BW-LABEL: store_i8_stride2_vf64:
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll
index 8802e8a779332..782a81be47603 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll
@@ -1639,119 +1639,122 @@ define void @store_i8_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
;
; AVX-LABEL: store_i8_stride3_vf64:
; AVX: # %bb.0:
-; AVX-NEXT: subq $24, %rsp
-; AVX-NEXT: vmovdqa (%rdi), %xmm7
+; AVX-NEXT: subq $40, %rsp
+; AVX-NEXT: vmovdqa (%rdi), %xmm8
; AVX-NEXT: vmovdqa 16(%rdi), %xmm9
; AVX-NEXT: vmovdqa 32(%rdi), %xmm6
-; AVX-NEXT: vmovdqa 48(%rdi), %xmm2
-; AVX-NEXT: vmovdqa {{.*#+}} xmm8 = [11,12,13,14,15,0,1,2,3,4,5,128,128,128,128,128]
-; AVX-NEXT: vpshufb %xmm8, %xmm7, %xmm0
-; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vmovdqa (%rdx), %xmm3
+; AVX-NEXT: vmovdqa 48(%rdi), %xmm3
+; AVX-NEXT: vmovdqa {{.*#+}} xmm7 = [11,12,13,14,15,0,1,2,3,4,5,128,128,128,128,128]
+; AVX-NEXT: vpshufb %xmm7, %xmm8, %xmm0
+; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
+; AVX-NEXT: vmovdqa (%rdx), %xmm5
; AVX-NEXT: vmovdqa 16(%rdx), %xmm1
-; AVX-NEXT: vpshufb %xmm8, %xmm9, %xmm0
+; AVX-NEXT: vpshufb %xmm7, %xmm9, %xmm0
; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vpshufb %xmm8, %xmm6, %xmm0
-; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vpshufb %xmm8, %xmm2, %xmm8
+; AVX-NEXT: vpshufb %xmm7, %xmm6, %xmm4
+; AVX-NEXT: vpshufb %xmm7, %xmm3, %xmm7
; AVX-NEXT: vmovdqa {{.*#+}} xmm10 = [u,u,u,u,u,128,128,128,128,128,128,6,7,8,9,10]
-; AVX-NEXT: vpshufb %xmm10, %xmm2, %xmm2
+; AVX-NEXT: vpshufb %xmm10, %xmm3, %xmm3
; AVX-NEXT: vmovdqa {{.*#+}} xmm11 = [u,u,u,u,u,5,6,7,8,9,10,128,128,128,128,128]
-; AVX-NEXT: vmovdqa 16(%rsi), %xmm12
-; AVX-NEXT: vmovdqa 32(%rsi), %xmm13
-; AVX-NEXT: vmovdqa 48(%rsi), %xmm14
-; AVX-NEXT: vpshufb %xmm11, %xmm14, %xmm15
-; AVX-NEXT: vpor %xmm2, %xmm15, %xmm0
-; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
+; AVX-NEXT: vmovdqa 16(%rsi), %xmm13
+; AVX-NEXT: vmovdqa 32(%rsi), %xmm14
+; AVX-NEXT: vmovdqa 48(%rsi), %xmm15
+; AVX-NEXT: vpshufb %xmm11, %xmm15, %xmm12
+; AVX-NEXT: vpor %xmm3, %xmm12, %xmm0
+; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vpshufb %xmm10, %xmm6, %xmm6
-; AVX-NEXT: vpshufb %xmm11, %xmm13, %xmm15
-; AVX-NEXT: vpor %xmm6, %xmm15, %xmm0
+; AVX-NEXT: vpshufb %xmm11, %xmm14, %xmm12
+; AVX-NEXT: vpor %xmm6, %xmm12, %xmm0
; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vpshufb %xmm10, %xmm9, %xmm9
-; AVX-NEXT: vpshufb %xmm11, %xmm12, %xmm15
-; AVX-NEXT: vpor %xmm9, %xmm15, %xmm0
+; AVX-NEXT: vpshufb %xmm11, %xmm13, %xmm12
+; AVX-NEXT: vpor %xmm9, %xmm12, %xmm0
; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vmovdqa (%rsi), %xmm15
-; AVX-NEXT: vpshufb %xmm10, %xmm7, %xmm7
-; AVX-NEXT: vpshufb %xmm11, %xmm15, %xmm10
-; AVX-NEXT: vpor %xmm7, %xmm10, %xmm0
+; AVX-NEXT: vmovdqa (%rsi), %xmm2
+; AVX-NEXT: vpshufb %xmm10, %xmm8, %xmm8
+; AVX-NEXT: vpshufb %xmm11, %xmm2, %xmm10
+; AVX-NEXT: vpor %xmm8, %xmm10, %xmm0
; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm3[8],xmm15[8],xmm3[9],xmm15[9],xmm3[10],xmm15[10],xmm3[11],xmm15[11],xmm3[12],xmm15[12],xmm3[13],xmm15[13],xmm3[14],xmm15[14],xmm3[15],xmm15[15]
-; AVX-NEXT: vmovdqa {{.*#+}} xmm10 = [u,u,u,u,u,4,6,8,10,12,14,7,9,11,13,15]
-; AVX-NEXT: vpshufb %xmm10, %xmm7, %xmm6
-; AVX-NEXT: vmovdqa %xmm1, %xmm0
-; AVX-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm1[8],xmm12[8],xmm1[9],xmm12[9],xmm1[10],xmm12[10],xmm1[11],xmm12[11],xmm1[12],xmm12[12],xmm1[13],xmm12[13],xmm1[14],xmm12[14],xmm1[15],xmm12[15]
-; AVX-NEXT: vpshufb %xmm10, %xmm7, %xmm5
-; AVX-NEXT: vmovdqa 32(%rdx), %xmm11
-; AVX-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm11[8],xmm13[8],xmm11[9],xmm13[9],xmm11[10],xmm13[10],xmm11[11],xmm13[11],xmm11[12],xmm13[12],xmm11[13],xmm13[13],xmm11[14],xmm13[14],xmm11[15],xmm13[15]
-; AVX-NEXT: vpshufb %xmm10, %xmm7, %xmm4
-; AVX-NEXT: vmovdqa 48(%rdx), %xmm7
-; AVX-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm7[8],xmm14[8],xmm7[9],xmm14[9],xmm7[10],xmm14[10],xmm7[11],xmm14[11],xmm7[12],xmm14[12],xmm7[13],xmm14[13],xmm7[14],xmm14[14],xmm7[15],xmm14[15]
-; AVX-NEXT: vpshufb %xmm10, %xmm9, %xmm9
-; AVX-NEXT: vpslldq {{.*#+}} xmm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm7[0,1,2,3,4]
-; AVX-NEXT: vpor %xmm10, %xmm8, %xmm10
-; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm10[5,6,7,8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4]
-; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vpslldq {{.*#+}} xmm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm11[0,1,2,3,4]
-; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX-NEXT: vpor %xmm2, %xmm14, %xmm14
-; AVX-NEXT: vpalignr {{.*#+}} xmm13 = xmm14[5,6,7,8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4]
-; AVX-NEXT: vpslldq {{.*#+}} xmm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4]
-; AVX-NEXT: vmovdqa %xmm0, %xmm10
+; AVX-NEXT: vmovdqa %xmm5, %xmm0
+; AVX-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm5[8],xmm2[8],xmm5[9],xmm2[9],xmm5[10],xmm2[10],xmm5[11],xmm2[11],xmm5[12],xmm2[12],xmm5[13],xmm2[13],xmm5[14],xmm2[14],xmm5[15],xmm2[15]
+; AVX-NEXT: vmovdqa {{.*#+}} xmm11 = [u,u,u,u,u,4,6,8,10,12,14,7,9,11,13,15]
+; AVX-NEXT: vpshufb %xmm11, %xmm8, %xmm8
+; AVX-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm1[8],xmm13[8],xmm1[9],xmm13[9],xmm1[10],xmm13[10],xmm1[11],xmm13[11],xmm1[12],xmm13[12],xmm1[13],xmm13[13],xmm1[14],xmm13[14],xmm1[15],xmm13[15]
+; AVX-NEXT: vpshufb %xmm11, %xmm10, %xmm6
+; AVX-NEXT: vmovdqa 32(%rdx), %xmm12
+; AVX-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm12[8],xmm14[8],xmm12[9],xmm14[9],xmm12[10],xmm14[10],xmm12[11],xmm14[11],xmm12[12],xmm14[12],xmm12[13],xmm14[13],xmm12[14],xmm14[14],xmm12[15],xmm14[15]
+; AVX-NEXT: vpshufb %xmm11, %xmm10, %xmm5
+; AVX-NEXT: vmovdqa 48(%rdx), %xmm10
+; AVX-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm10[8],xmm15[8],xmm10[9],xmm15[9],xmm10[10],xmm15[10],xmm10[11],xmm15[11],xmm10[12],xmm15[12],xmm10[13],xmm15[13],xmm10[14],xmm15[14],xmm10[15],xmm15[15]
+; AVX-NEXT: vpshufb %xmm11, %xmm9, %xmm9
+; AVX-NEXT: vpslldq {{.*#+}} xmm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm10[0,1,2,3,4]
+; AVX-NEXT: vpor %xmm7, %xmm11, %xmm11
+; AVX-NEXT: vpalignr {{.*#+}} xmm3 = xmm11[5,6,7,8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4]
+; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vpslldq {{.*#+}} xmm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm12[0,1,2,3,4]
+; AVX-NEXT: vpor %xmm4, %xmm11, %xmm11
+; AVX-NEXT: vpalignr {{.*#+}} xmm3 = xmm11[5,6,7,8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4]
+; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vpslldq {{.*#+}} xmm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4]
+; AVX-NEXT: vmovdqa %xmm1, %xmm11
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT: vpor %xmm1, %xmm14, %xmm14
-; AVX-NEXT: vpalignr {{.*#+}} xmm12 = xmm14[5,6,7,8,9,10,11,12,13,14,15],xmm12[0,1,2,3,4]
-; AVX-NEXT: vpslldq {{.*#+}} xmm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4]
-; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX-NEXT: vpalignr {{.*#+}} xmm3 = xmm14[5,6,7,8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4]
+; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vpslldq {{.*#+}} xmm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4]
+; AVX-NEXT: vmovdqa %xmm0, %xmm3
+; AVX-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
; AVX-NEXT: vpor %xmm0, %xmm14, %xmm14
-; AVX-NEXT: vpalignr {{.*#+}} xmm14 = xmm14[5,6,7,8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4]
-; AVX-NEXT: vpalignr {{.*#+}} xmm8 = xmm9[5,6,7,8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4]
-; AVX-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vpalignr {{.*#+}} xmm15 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
-; AVX-NEXT: vpalignr {{.*#+}} xmm2 = xmm5[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
-; AVX-NEXT: vpalignr {{.*#+}} xmm4 = xmm6[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
-; AVX-NEXT: vmovdqa {{.*#+}} xmm5 = [5,128,11,6,128,12,7,128,13,8,128,14,9,128,15,10]
+; AVX-NEXT: vpalignr {{.*#+}} xmm14 = xmm14[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
+; AVX-NEXT: vpalignr {{.*#+}} xmm2 = xmm9[5,6,7,8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4]
+; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vpalignr {{.*#+}} xmm13 = xmm5[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
+; AVX-NEXT: vpalignr {{.*#+}} xmm15 = xmm6[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
+; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm8[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
+; AVX-NEXT: vmovdqa {{.*#+}} xmm8 = [5,128,11,6,128,12,7,128,13,8,128,14,9,128,15,10]
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX-NEXT: vpshufb %xmm5, %xmm0, %xmm6
+; AVX-NEXT: vpshufb %xmm8, %xmm0, %xmm6
; AVX-NEXT: vmovdqa {{.*#+}} xmm9 = [128,5,128,128,6,128,128,7,128,128,8,128,128,9,128,128]
-; AVX-NEXT: vpshufb %xmm9, %xmm3, %xmm1
-; AVX-NEXT: vpor %xmm1, %xmm6, %xmm1
+; AVX-NEXT: vpshufb %xmm9, %xmm3, %xmm2
+; AVX-NEXT: vpor %xmm2, %xmm6, %xmm2
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX-NEXT: vpshufb %xmm5, %xmm0, %xmm6
-; AVX-NEXT: vpshufb %xmm9, %xmm10, %xmm3
-; AVX-NEXT: vpor %xmm3, %xmm6, %xmm3
+; AVX-NEXT: vpshufb %xmm8, %xmm0, %xmm6
+; AVX-NEXT: vpshufb %xmm9, %xmm11, %xmm4
+; AVX-NEXT: vpor %xmm4, %xmm6, %xmm4
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX-NEXT: vpshufb %xmm5, %xmm0, %xmm6
-; AVX-NEXT: vpshufb %xmm9, %xmm11, %xmm11
-; AVX-NEXT: vpor %xmm6, %xmm11, %xmm6
-; AVX-NEXT: vmovdqa {{.*#+}} xmm11 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
-; AVX-NEXT: vpshufb %xmm11, %xmm14, %xmm14
-; AVX-NEXT: vpshufb %xmm11, %xmm4, %xmm4
-; AVX-NEXT: vpshufb %xmm11, %xmm12, %xmm12
-; AVX-NEXT: vpshufb %xmm11, %xmm2, %xmm2
-; AVX-NEXT: vpshufb %xmm11, %xmm13, %xmm13
-; AVX-NEXT: vpshufb %xmm11, %xmm15, %xmm0
-; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; AVX-NEXT: vpshufb %xmm11, %xmm8, %xmm10
-; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; AVX-NEXT: vpshufb %xmm11, %xmm8, %xmm8
-; AVX-NEXT: vmovdqa (%rsp), %xmm11 # 16-byte Reload
-; AVX-NEXT: vpshufb %xmm5, %xmm11, %xmm5
+; AVX-NEXT: vpshufb %xmm8, %xmm0, %xmm6
+; AVX-NEXT: vpshufb %xmm9, %xmm12, %xmm12
+; AVX-NEXT: vpor %xmm6, %xmm12, %xmm6
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX-NEXT: vpshufb %xmm8, %xmm0, %xmm8
+; AVX-NEXT: vpshufb %xmm9, %xmm10, %xmm9
+; AVX-NEXT: vpor %xmm9, %xmm8, %xmm8
+; AVX-NEXT: vmovdqa {{.*#+}} xmm9 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX-NEXT: vpshufb %xmm9, %xmm14, %xmm0
+; AVX-NEXT: vpshufb %xmm9, %xmm1, %xmm5
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX-NEXT: vpshufb %xmm9, %xmm1, %xmm10
+; AVX-NEXT: vpshufb %xmm9, %xmm15, %xmm3
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX-NEXT: vpshufb %xmm9, %xmm1, %xmm11
+; AVX-NEXT: vpshufb %xmm9, %xmm13, %xmm1
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
+; AVX-NEXT: vpshufb %xmm9, %xmm7, %xmm12
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
; AVX-NEXT: vpshufb %xmm9, %xmm7, %xmm7
-; AVX-NEXT: vpor %xmm7, %xmm5, %xmm5
-; AVX-NEXT: vmovdqa %xmm3, 64(%rcx)
-; AVX-NEXT: vmovdqa %xmm2, 80(%rcx)
-; AVX-NEXT: vmovdqa %xmm14, (%rcx)
-; AVX-NEXT: vmovdqa %xmm1, 16(%rcx)
-; AVX-NEXT: vmovdqa %xmm4, 32(%rcx)
-; AVX-NEXT: vmovdqa %xmm12, 48(%rcx)
-; AVX-NEXT: vmovdqa %xmm5, 160(%rcx)
-; AVX-NEXT: vmovdqa %xmm8, 176(%rcx)
-; AVX-NEXT: vmovdqa %xmm13, 96(%rcx)
+; AVX-NEXT: vmovdqa %xmm3, 80(%rcx)
+; AVX-NEXT: vmovdqa %xmm4, 64(%rcx)
+; AVX-NEXT: vmovdqa %xmm0, (%rcx)
+; AVX-NEXT: vmovdqa %xmm2, 16(%rcx)
+; AVX-NEXT: vmovdqa %xmm5, 32(%rcx)
+; AVX-NEXT: vmovdqa %xmm10, 48(%rcx)
+; AVX-NEXT: vmovdqa %xmm7, 176(%rcx)
+; AVX-NEXT: vmovdqa %xmm8, 160(%rcx)
+; AVX-NEXT: vmovdqa %xmm11, 96(%rcx)
; AVX-NEXT: vmovdqa %xmm6, 112(%rcx)
-; AVX-NEXT: vmovdqa %xmm0, 128(%rcx)
-; AVX-NEXT: vmovdqa %xmm10, 144(%rcx)
-; AVX-NEXT: addq $24, %rsp
+; AVX-NEXT: vmovdqa %xmm1, 128(%rcx)
+; AVX-NEXT: vmovdqa %xmm12, 144(%rcx)
+; AVX-NEXT: addq $40, %rsp
; AVX-NEXT: retq
;
; AVX2-LABEL: store_i8_stride3_vf64:
diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll
index bc83cc1cab42d..7ad9fb0c27170 100644
--- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll
+++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll
@@ -3595,41 +3595,73 @@ define void @vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12(ptr %i
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
-; AVX512F-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm0
-; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
-; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX512F-NEXT: vmovdqa (%rdi), %xmm1
-; AVX512F-NEXT: vpaddb (%rsi), %xmm1, %xmm1
-; AVX512F-NEXT: vpbroadcastw %xmm1, %ymm1
-; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15]
-; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
-; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1
-; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
-; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx)
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
+; AVX512F-SLOW-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12:
+; AVX512F-SLOW: # %bb.0:
+; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512F-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0
+; AVX512F-SLOW-NEXT: vmovdqa 48(%rdi), %xmm1
+; AVX512F-SLOW-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
+; AVX512F-SLOW-NEXT: vpbroadcastw %xmm0, %ymm0
+; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
+; AVX512F-SLOW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX512F-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX512F-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1
+; AVX512F-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
+; AVX512F-SLOW-NEXT: vmovdqa %ymm1, (%rcx)
+; AVX512F-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx)
+; AVX512F-SLOW-NEXT: vzeroupper
+; AVX512F-SLOW-NEXT: retq
;
-; AVX512DQ-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm0
-; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
-; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm1
-; AVX512DQ-NEXT: vpaddb (%rsi), %xmm1, %xmm1
-; AVX512DQ-NEXT: vpbroadcastw %xmm1, %ymm1
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15]
-; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0
-; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1
-; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
-; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rcx)
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
+; AVX512F-FAST-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12:
+; AVX512F-FAST: # %bb.0:
+; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512F-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0
+; AVX512F-FAST-NEXT: vmovdqa 48(%rdi), %xmm1
+; AVX512F-FAST-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
+; AVX512F-FAST-NEXT: vpbroadcastw %xmm0, %ymm2
+; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7],ymm2[8],ymm1[9],ymm2[10],ymm1[11],ymm2[12],ymm1[13],ymm2[14],ymm1[15]
+; AVX512F-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1],zero,zero,xmm0[0,1],zero,zero,xmm0[0,1],zero,zero,xmm0[0,1],zero,zero
+; AVX512F-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1
+; AVX512F-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
+; AVX512F-FAST-NEXT: vmovdqa %ymm0, 32(%rcx)
+; AVX512F-FAST-NEXT: vmovdqa %ymm1, (%rcx)
+; AVX512F-FAST-NEXT: vzeroupper
+; AVX512F-FAST-NEXT: retq
+;
+; AVX512DQ-SLOW-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12:
+; AVX512DQ-SLOW: # %bb.0:
+; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512DQ-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0
+; AVX512DQ-SLOW-NEXT: vmovdqa 48(%rdi), %xmm1
+; AVX512DQ-SLOW-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
+; AVX512DQ-SLOW-NEXT: vpbroadcastw %xmm0, %ymm0
+; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
+; AVX512DQ-SLOW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX512DQ-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX512DQ-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1
+; AVX512DQ-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
+; AVX512DQ-SLOW-NEXT: vmovdqa %ymm1, (%rcx)
+; AVX512DQ-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx)
+; AVX512DQ-SLOW-NEXT: vzeroupper
+; AVX512DQ-SLOW-NEXT: retq
+;
+; AVX512DQ-FAST-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12:
+; AVX512DQ-FAST: # %bb.0:
+; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0
+; AVX512DQ-FAST-NEXT: vmovdqa 48(%rdi), %xmm1
+; AVX512DQ-FAST-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
+; AVX512DQ-FAST-NEXT: vpbroadcastw %xmm0, %ymm2
+; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7],ymm2[8],ymm1[9],ymm2[10],ymm1[11],ymm2[12],ymm1[13],ymm2[14],ymm1[15]
+; AVX512DQ-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1],zero,zero,xmm0[0,1],zero,zero,xmm0[0,1],zero,zero,xmm0[0,1],zero,zero
+; AVX512DQ-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1
+; AVX512DQ-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
+; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, 32(%rcx)
+; AVX512DQ-FAST-NEXT: vmovdqa %ymm1, (%rcx)
+; AVX512DQ-FAST-NEXT: vzeroupper
+; AVX512DQ-FAST-NEXT: retq
;
; AVX512BW-SLOW-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12:
; AVX512BW-SLOW: # %bb.0:
@@ -3757,10 +3789,10 @@ define void @vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8(ptr %in.
;
; AVX512F-SLOW-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8:
; AVX512F-SLOW: # %bb.0:
-; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512F-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX512F-SLOW-NEXT: vmovdqa 48(%rdi), %xmm1
; AVX512F-SLOW-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
-; AVX512F-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX512F-SLOW-NEXT: vpbroadcastw %xmm0, %ymm0
; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7]
; AVX512F-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
@@ -3796,10 +3828,10 @@ define void @vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8(ptr %in.
;
; AVX512DQ-SLOW-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8:
; AVX512DQ-SLOW: # %bb.0:
-; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512DQ-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX512DQ-SLOW-NEXT: vmovdqa 48(%rdi), %xmm1
; AVX512DQ-SLOW-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
-; AVX512DQ-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX512DQ-SLOW-NEXT: vpbroadcastw %xmm0, %ymm0
; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7]
; AVX512DQ-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
@@ -3987,11 +4019,10 @@ define void @vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6(ptr %in.
;
; AVX512F-SLOW-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6:
; AVX512F-SLOW: # %bb.0:
-; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm1
-; AVX512F-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX512F-SLOW-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
-; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1
+; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512F-SLOW-NEXT: vmovdqa 48(%rdi), %xmm1
+; AVX512F-SLOW-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
+; AVX512F-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX512F-SLOW-NEXT: vpbroadcastw %xmm0, %ymm0
; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15]
; AVX512F-SLOW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
@@ -4006,29 +4037,27 @@ define void @vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6(ptr %in.
;
; AVX512F-FAST-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6:
; AVX512F-FAST: # %bb.0:
-; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm0
-; AVX512F-FAST-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
-; AVX512F-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm1
-; AVX512F-FAST-NEXT: vpaddb (%rsi), %xmm1, %xmm1
-; AVX512F-FAST-NEXT: vpbroadcastw %xmm1, %ymm2
-; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3],ymm2[4],ymm0[5,6,7],ymm2[8],ymm0[9,10,11],ymm2[12],ymm0[13,14,15]
-; AVX512F-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1],zero,zero,zero,zero,zero,zero,xmm1[0,1],zero,zero,zero,zero,zero,zero
-; AVX512F-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0
-; AVX512F-FAST-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1
-; AVX512F-FAST-NEXT: vmovdqa %ymm1, 32(%rcx)
-; AVX512F-FAST-NEXT: vmovdqa %ymm0, (%rcx)
+; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512F-FAST-NEXT: vmovdqa 48(%rdi), %xmm1
+; AVX512F-FAST-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
+; AVX512F-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0
+; AVX512F-FAST-NEXT: vpbroadcastw %xmm0, %ymm2
+; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15]
+; AVX512F-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1],zero,zero,zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero,zero,zero
+; AVX512F-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1
+; AVX512F-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
+; AVX512F-FAST-NEXT: vmovdqa %ymm0, 32(%rcx)
+; AVX512F-FAST-NEXT: vmovdqa %ymm1, (%rcx)
; AVX512F-FAST-NEXT: vzeroupper
; AVX512F-FAST-NEXT: retq
;
; AVX512DQ-SLOW-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6:
; AVX512DQ-SLOW: # %bb.0:
-; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdi), %ymm1
-; AVX512DQ-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX512DQ-SLOW-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
-; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1
+; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512DQ-SLOW-NEXT: vmovdqa 48(%rdi), %xmm1
+; AVX512DQ-SLOW-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
+; AVX512DQ-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX512DQ-SLOW-NEXT: vpbroadcastw %xmm0, %ymm0
; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15]
; AVX512DQ-SLOW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
@@ -4043,19 +4072,18 @@ define void @vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6(ptr %in.
;
; AVX512DQ-FAST-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6:
; AVX512DQ-FAST: # %bb.0:
-; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm0
-; AVX512DQ-FAST-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
-; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %xmm1
-; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %xmm1, %xmm1
-; AVX512DQ-FAST-NEXT: vpbroadcastw %xmm1, %ymm2
-; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3],ymm2[4],ymm0[5,6,7],ymm2[8],ymm0[9,10,11],ymm2[12],ymm0[13,14,15]
-; AVX512DQ-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1],zero,zero,zero,zero,zero,zero,xmm1[0,1],zero,zero,zero,zero,zero,zero
-; AVX512DQ-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0
-; AVX512DQ-FAST-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1
-; AVX512DQ-FAST-NEXT: vmovdqa %ymm1, 32(%rcx)
-; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, (%rcx)
+; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512DQ-FAST-NEXT: vmovdqa 48(%rdi), %xmm1
+; AVX512DQ-FAST-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
+; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0
+; AVX512DQ-FAST-NEXT: vpbroadcastw %xmm0, %ymm2
+; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15]
+; AVX512DQ-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1],zero,zero,zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero,zero,zero
+; AVX512DQ-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1
+; AVX512DQ-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
+; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, 32(%rcx)
+; AVX512DQ-FAST-NEXT: vmovdqa %ymm1, (%rcx)
; AVX512DQ-FAST-NEXT: vzeroupper
; AVX512DQ-FAST-NEXT: retq
;
@@ -4188,10 +4216,10 @@ define void @vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4(ptr %in.
;
; AVX512F-SLOW-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4:
; AVX512F-SLOW: # %bb.0:
-; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512F-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX512F-SLOW-NEXT: vmovdqa 48(%rdi), %xmm1
; AVX512F-SLOW-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
-; AVX512F-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX512F-SLOW-NEXT: vpbroadcastw %xmm0, %ymm0
; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5],xmm0[6],xmm1[7]
; AVX512F-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
@@ -4227,10 +4255,10 @@ define void @vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4(ptr %in.
;
; AVX512DQ-SLOW-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4:
; AVX512DQ-SLOW: # %bb.0:
-; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512DQ-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX512DQ-SLOW-NEXT: vmovdqa 48(%rdi), %xmm1
; AVX512DQ-SLOW-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
-; AVX512DQ-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX512DQ-SLOW-NEXT: vpbroadcastw %xmm0, %ymm0
; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5],xmm0[6],xmm1[7]
; AVX512DQ-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2