[llvm] 8aacbfc - [X86] combineEXTRACT_SUBVECTOR - treat oneuse extractions from loads as free
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Wed Aug 14 10:24:19 PDT 2024
Author: Simon Pilgrim
Date: 2024-08-14T18:23:07+01:00
New Revision: 8aacbfcb095bd37b6444a9fa074301d733555374
URL: https://github.com/llvm/llvm-project/commit/8aacbfcb095bd37b6444a9fa074301d733555374
DIFF: https://github.com/llvm/llvm-project/commit/8aacbfcb095bd37b6444a9fa074301d733555374.diff
LOG: [X86] combineEXTRACT_SUBVECTOR - treat oneuse extractions from loads as free
Allows further reductions in instruction vector widths
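For illustration only (not part of the committed patch): the tests below all start from a wide load whose only use is the extraction, so once the extraction is treated as free the surrounding shuffle can be narrowed and the extract typically folds into a smaller load at an adjusted offset, shrinking ymm sequences to xmm ones. A minimal IR sketch of the kind of input that benefits, modelled on the partial_permute tests in the diff (function name is hypothetical):

; A 512-bit load used only by a 128-bit shuffle result; with the extraction
; treated as free, codegen can read the needed 128-bit slices directly instead
; of building a ymm value and extracting its upper half.
define <2 x i64> @extract_pair_from_wide_load(ptr %vp) {
  %vec = load <8 x i64>, ptr %vp
  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 6, i32 2>
  ret <2 x i64> %shuf
}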
Added:
Modified:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
llvm/test/CodeGen/X86/oddshuffles.ll
llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-5.ll
llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-7.ll
llvm/test/CodeGen/X86/x86-interleaved-access.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 7249789a8c0748..137519af15d760 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -57140,6 +57140,11 @@ static SDValue combineEXTRACT_SUBVECTOR(SDNode *N, SelectionDAG &DAG,
}
auto IsExtractFree = [](SDValue V) {
+ if (V.hasOneUse()) {
+ V = peekThroughOneUseBitcasts(V);
+ if (V.getOpcode() == ISD::LOAD)
+ return true;
+ }
V = peekThroughBitcasts(V);
if (ISD::isBuildVectorOfConstantSDNodes(V.getNode()))
return true;
diff --git a/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll b/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
index 73edceb3c3ede3..add0592661db67 100644
--- a/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
+++ b/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
@@ -2749,12 +2749,9 @@ define <2 x i64> @test_masked_z_8xi64_to_2xi64_perm_mem_mask0(ptr %vp, <2 x i64>
define <2 x i64> @test_masked_8xi64_to_2xi64_perm_mem_mask1(ptr %vp, <2 x i64> %vec2, <2 x i64> %mask) {
; CHECK-LABEL: test_masked_8xi64_to_2xi64_perm_mem_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2
-; CHECK-NEXT: vpunpcklqdq (%rdi), %ymm2, %ymm2 # ymm2 = ymm2[0],mem[0],ymm2[2],mem[2]
-; CHECK-NEXT: vextracti128 $1, %ymm2, %xmm2
+; CHECK-NEXT: vmovdqa 48(%rdi), %xmm2
; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1
-; CHECK-NEXT: vmovdqa64 %xmm2, %xmm0 {%k1}
-; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: vpunpcklqdq 16(%rdi), %xmm2, %xmm0 {%k1} # xmm0 {%k1} = xmm2[0],mem[0]
; CHECK-NEXT: retq
%vec = load <8 x i64>, ptr %vp
%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 6, i32 2>
@@ -2766,12 +2763,9 @@ define <2 x i64> @test_masked_8xi64_to_2xi64_perm_mem_mask1(ptr %vp, <2 x i64> %
define <2 x i64> @test_masked_z_8xi64_to_2xi64_perm_mem_mask1(ptr %vp, <2 x i64> %mask) {
; CHECK-LABEL: test_masked_z_8xi64_to_2xi64_perm_mem_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1
-; CHECK-NEXT: vpunpcklqdq (%rdi), %ymm1, %ymm1 # ymm1 = ymm1[0],mem[0],ymm1[2],mem[2]
-; CHECK-NEXT: vextracti128 $1, %ymm1, %xmm1
+; CHECK-NEXT: vmovdqa 48(%rdi), %xmm1
; CHECK-NEXT: vptestnmq %xmm0, %xmm0, %k1
-; CHECK-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} {z}
-; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: vpunpcklqdq 16(%rdi), %xmm1, %xmm0 {%k1} {z} # xmm0 {%k1} {z} = xmm1[0],mem[0]
; CHECK-NEXT: retq
%vec = load <8 x i64>, ptr %vp
%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 6, i32 2>
diff --git a/llvm/test/CodeGen/X86/oddshuffles.ll b/llvm/test/CodeGen/X86/oddshuffles.ll
index b40b2c82843ccd..90c1d42a929c81 100644
--- a/llvm/test/CodeGen/X86/oddshuffles.ll
+++ b/llvm/test/CodeGen/X86/oddshuffles.ll
@@ -2401,109 +2401,106 @@ define void @D107009(ptr %input, ptr %output) {
; AVX1-LABEL: D107009:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovups 96(%rdi), %ymm0
-; AVX1-NEXT: vmovups (%rdi), %ymm1
-; AVX1-NEXT: vmovups 128(%rdi), %ymm2
-; AVX1-NEXT: vmovups 224(%rdi), %ymm3
-; AVX1-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],mem[0],ymm3[2],mem[2]
-; AVX1-NEXT: vunpcklps {{.*#+}} ymm2 = ymm2[0],mem[0],ymm2[1],mem[1],ymm2[4],mem[4],ymm2[5],mem[5]
-; AVX1-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,0],ymm2[4,5],ymm3[6,4]
+; AVX1-NEXT: vmovups 128(%rdi), %ymm1
+; AVX1-NEXT: vmovups 224(%rdi), %ymm2
+; AVX1-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],mem[0],ymm2[2],mem[2]
; AVX1-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[4],mem[4],ymm1[5],mem[5]
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,0],ymm1[4,5],ymm2[6,4]
+; AVX1-NEXT: vmovdqa 16(%rdi), %xmm2
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
; AVX1-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2]
; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpsrld $16, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,1,3]
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,1,3,3]
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
-; AVX1-NEXT: vmovshdup {{.*#+}} ymm3 = ymm1[1,1,3,3,5,5,7,7]
-; AVX1-NEXT: vshufps {{.*#+}} ymm4 = ymm1[3,3,3,3,7,7,7,7]
-; AVX1-NEXT: vshufpd {{.*#+}} ymm5 = ymm1[0,0,3,2]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[1,1,1,1]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[3,3,3,3]
+; AVX1-NEXT: vshufps {{.*#+}} ymm3 = ymm1[3,3,3,3,7,7,7,7]
+; AVX1-NEXT: vshufpd {{.*#+}} ymm4 = ymm1[0,0,3,2]
+; AVX1-NEXT: vmovshdup {{.*#+}} ymm5 = ymm1[1,1,3,3,5,5,7,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[3,3,3,3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[1,1,1,1]
; AVX1-NEXT: vmovdqa %xmm0, 16(%rsi)
-; AVX1-NEXT: vmovdqa %xmm7, 112(%rsi)
-; AVX1-NEXT: vmovdqa %xmm6, 48(%rsi)
+; AVX1-NEXT: vmovdqa %xmm7, 48(%rsi)
+; AVX1-NEXT: vmovdqa %xmm6, 112(%rsi)
; AVX1-NEXT: vmovups %ymm1, 128(%rsi)
-; AVX1-NEXT: vmovupd %ymm5, 192(%rsi)
-; AVX1-NEXT: vmovups %ymm4, 224(%rsi)
-; AVX1-NEXT: vmovups %ymm3, 160(%rsi)
+; AVX1-NEXT: vmovups %ymm5, 160(%rsi)
+; AVX1-NEXT: vmovupd %ymm4, 192(%rsi)
+; AVX1-NEXT: vmovupd %ymm3, 224(%rsi)
; AVX1-NEXT: vmovups %ymm2, 64(%rsi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: D107009:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqu (%rdi), %ymm0
-; AVX2-NEXT: vmovdqu 64(%rdi), %ymm1
-; AVX2-NEXT: vmovdqu 128(%rdi), %ymm2
-; AVX2-NEXT: vmovdqu 192(%rdi), %ymm3
-; AVX2-NEXT: vpunpckldq {{.*#+}} ymm3 = ymm3[0],mem[0],ymm3[1],mem[1],ymm3[4],mem[4],ymm3[5],mem[5]
+; AVX2-NEXT: vmovdqu 64(%rdi), %ymm0
+; AVX2-NEXT: vmovdqu 128(%rdi), %ymm1
+; AVX2-NEXT: vmovdqu 192(%rdi), %ymm2
; AVX2-NEXT: vpunpckldq {{.*#+}} ymm2 = ymm2[0],mem[0],ymm2[1],mem[1],ymm2[4],mem[4],ymm2[5],mem[5]
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
-; AVX2-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX2-NEXT: vpunpckldq {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[4],mem[4],ymm1[5],mem[5]
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2]
+; AVX2-NEXT: vmovdqa 16(%rdi), %xmm2
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
+; AVX2-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; AVX2-NEXT: vpbroadcastd %xmm0, %ymm2
-; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
-; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX2-NEXT: vpshufd {{.*#+}} ymm5 = ymm0[1,1,1,1,5,5,5,5]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm6 = ymm0[3,3,3,3,7,7,7,7]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm7 = ymm0[2,3,2,3,6,7,6,7]
+; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[3,3,3,3]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
+; AVX2-NEXT: vpbroadcastd %xmm0, %ymm4
+; AVX2-NEXT: vpshufd {{.*#+}} ymm5 = ymm0[3,3,3,3,7,7,7,7]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm6 = ymm0[2,3,2,3,6,7,6,7]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm7 = ymm0[1,1,1,1,5,5,5,5]
; AVX2-NEXT: vmovdqu %ymm0, 128(%rsi)
-; AVX2-NEXT: vmovdqu %ymm7, 192(%rsi)
-; AVX2-NEXT: vmovdqu %ymm6, 224(%rsi)
-; AVX2-NEXT: vmovdqu %ymm5, 160(%rsi)
-; AVX2-NEXT: vmovdqu %ymm4, 64(%rsi)
-; AVX2-NEXT: vmovdqa %xmm3, 112(%rsi)
-; AVX2-NEXT: vmovdqu %ymm2, (%rsi)
-; AVX2-NEXT: vmovdqa %xmm1, 48(%rsi)
+; AVX2-NEXT: vmovdqu %ymm7, 160(%rsi)
+; AVX2-NEXT: vmovdqu %ymm6, 192(%rsi)
+; AVX2-NEXT: vmovdqu %ymm5, 224(%rsi)
+; AVX2-NEXT: vmovdqu %ymm4, (%rsi)
+; AVX2-NEXT: vmovdqa %xmm3, 48(%rsi)
+; AVX2-NEXT: vmovdqa %xmm2, 112(%rsi)
+; AVX2-NEXT: vmovdqu %ymm1, 64(%rsi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; XOP-LABEL: D107009:
; XOP: # %bb.0:
; XOP-NEXT: vmovups 96(%rdi), %ymm0
-; XOP-NEXT: vmovups (%rdi), %ymm1
-; XOP-NEXT: vmovups 128(%rdi), %ymm2
-; XOP-NEXT: vmovups 224(%rdi), %ymm3
-; XOP-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],mem[0],ymm3[2],mem[2]
-; XOP-NEXT: vunpcklps {{.*#+}} ymm2 = ymm2[0],mem[0],ymm2[1],mem[1],ymm2[4],mem[4],ymm2[5],mem[5]
-; XOP-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,0],ymm2[4,5],ymm3[6,4]
+; XOP-NEXT: vmovups 128(%rdi), %ymm1
+; XOP-NEXT: vmovups 224(%rdi), %ymm2
+; XOP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],mem[0],ymm2[2],mem[2]
; XOP-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[4],mem[4],ymm1[5],mem[5]
-; XOP-NEXT: vextractf128 $1, %ymm1, %xmm1
+; XOP-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,0],ymm1[4,5],ymm2[6,4]
+; XOP-NEXT: vmovdqa 16(%rdi), %xmm2
+; XOP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
; XOP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2]
; XOP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4]
; XOP-NEXT: vextractf128 $1, %ymm0, %xmm0
-; XOP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; XOP-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
; XOP-NEXT: vpsrld $16, %xmm0, %xmm0
-; XOP-NEXT: vextractf128 $1, %ymm2, %xmm1
+; XOP-NEXT: vextractf128 $1, %ymm1, %xmm1
; XOP-NEXT: vpsrld $16, %xmm1, %xmm1
; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; XOP-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,1,3]
; XOP-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,1,3,3]
; XOP-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
-; XOP-NEXT: vmovshdup {{.*#+}} ymm3 = ymm1[1,1,3,3,5,5,7,7]
-; XOP-NEXT: vshufps {{.*#+}} ymm4 = ymm1[3,3,3,3,7,7,7,7]
-; XOP-NEXT: vshufpd {{.*#+}} ymm5 = ymm1[0,0,3,2]
-; XOP-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[1,1,1,1]
-; XOP-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[3,3,3,3]
+; XOP-NEXT: vshufps {{.*#+}} ymm3 = ymm1[3,3,3,3,7,7,7,7]
+; XOP-NEXT: vshufpd {{.*#+}} ymm4 = ymm1[0,0,3,2]
+; XOP-NEXT: vmovshdup {{.*#+}} ymm5 = ymm1[1,1,3,3,5,5,7,7]
+; XOP-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[3,3,3,3]
+; XOP-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[1,1,1,1]
; XOP-NEXT: vmovdqa %xmm0, 16(%rsi)
-; XOP-NEXT: vmovdqa %xmm7, 112(%rsi)
-; XOP-NEXT: vmovdqa %xmm6, 48(%rsi)
+; XOP-NEXT: vmovdqa %xmm7, 48(%rsi)
+; XOP-NEXT: vmovdqa %xmm6, 112(%rsi)
; XOP-NEXT: vmovups %ymm1, 128(%rsi)
-; XOP-NEXT: vmovupd %ymm5, 192(%rsi)
-; XOP-NEXT: vmovups %ymm4, 224(%rsi)
-; XOP-NEXT: vmovups %ymm3, 160(%rsi)
+; XOP-NEXT: vmovups %ymm5, 160(%rsi)
+; XOP-NEXT: vmovupd %ymm4, 192(%rsi)
+; XOP-NEXT: vmovupd %ymm3, 224(%rsi)
; XOP-NEXT: vmovups %ymm2, 64(%rsi)
; XOP-NEXT: vzeroupper
; XOP-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-5.ll
index f27619738a0eab..70164cff890729 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-5.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-5.ll
@@ -57,222 +57,211 @@ define void @load_i64_stride5_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
;
; AVX2-LABEL: load_i64_stride5_vf2:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovaps (%rdi), %ymm0
-; AVX2-NEXT: vmovdqa (%rdi), %xmm1
+; AVX2-NEXT: vmovdqa (%rdi), %xmm0
+; AVX2-NEXT: vmovaps 16(%rdi), %xmm1
; AVX2-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX2-NEXT: vmovdqa 48(%rdi), %xmm3
-; AVX2-NEXT: vmovdqa 64(%rdi), %xmm4
-; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm1[0,1],xmm2[2,3]
-; AVX2-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
-; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5],mem[6,7]
-; AVX2-NEXT: vpalignr {{.*#+}} xmm3 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3]
-; AVX2-NEXT: vmovdqa %xmm5, (%rsi)
-; AVX2-NEXT: vmovdqa %xmm1, (%rdx)
-; AVX2-NEXT: vextractf128 $1, %ymm0, (%rcx)
-; AVX2-NEXT: vmovdqa %xmm3, (%r8)
+; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm0[0,1],xmm2[2,3]
+; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
+; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
+; AVX2-NEXT: vmovdqa 64(%rdi), %xmm3
+; AVX2-NEXT: vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3]
+; AVX2-NEXT: vmovdqa %xmm4, (%rsi)
+; AVX2-NEXT: vmovdqa %xmm0, (%rdx)
+; AVX2-NEXT: vmovaps %xmm1, (%rcx)
+; AVX2-NEXT: vmovdqa %xmm5, (%r8)
; AVX2-NEXT: vmovdqa %xmm2, (%r9)
-; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX2-FP-LABEL: load_i64_stride5_vf2:
; AVX2-FP: # %bb.0:
-; AVX2-FP-NEXT: vmovaps (%rdi), %ymm0
-; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm1
+; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm0
+; AVX2-FP-NEXT: vmovaps 16(%rdi), %xmm1
; AVX2-FP-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX2-FP-NEXT: vmovdqa 48(%rdi), %xmm3
-; AVX2-FP-NEXT: vmovdqa 64(%rdi), %xmm4
-; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm5 = xmm1[0,1],xmm2[2,3]
-; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
-; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5],mem[6,7]
-; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm3 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3]
-; AVX2-FP-NEXT: vmovdqa %xmm5, (%rsi)
-; AVX2-FP-NEXT: vmovdqa %xmm1, (%rdx)
-; AVX2-FP-NEXT: vextractf128 $1, %ymm0, (%rcx)
-; AVX2-FP-NEXT: vmovdqa %xmm3, (%r8)
+; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm0[0,1],xmm2[2,3]
+; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
+; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
+; AVX2-FP-NEXT: vmovdqa 64(%rdi), %xmm3
+; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3]
+; AVX2-FP-NEXT: vmovdqa %xmm4, (%rsi)
+; AVX2-FP-NEXT: vmovdqa %xmm0, (%rdx)
+; AVX2-FP-NEXT: vmovaps %xmm1, (%rcx)
+; AVX2-FP-NEXT: vmovdqa %xmm5, (%r8)
; AVX2-FP-NEXT: vmovdqa %xmm2, (%r9)
-; AVX2-FP-NEXT: vzeroupper
; AVX2-FP-NEXT: retq
;
; AVX2-FCP-LABEL: load_i64_stride5_vf2:
; AVX2-FCP: # %bb.0:
-; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm0
-; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm1
+; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm0
+; AVX2-FCP-NEXT: vmovaps 16(%rdi), %xmm1
; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX2-FCP-NEXT: vmovdqa 48(%rdi), %xmm3
-; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %xmm4
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm1[0,1],xmm2[2,3]
-; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
-; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5],mem[6,7]
-; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm3 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3]
-; AVX2-FCP-NEXT: vmovdqa %xmm5, (%rsi)
-; AVX2-FCP-NEXT: vmovdqa %xmm1, (%rdx)
-; AVX2-FCP-NEXT: vextractf128 $1, %ymm0, (%rcx)
-; AVX2-FCP-NEXT: vmovdqa %xmm3, (%r8)
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm0[0,1],xmm2[2,3]
+; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
+; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
+; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %xmm3
+; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3]
+; AVX2-FCP-NEXT: vmovdqa %xmm4, (%rsi)
+; AVX2-FCP-NEXT: vmovdqa %xmm0, (%rdx)
+; AVX2-FCP-NEXT: vmovaps %xmm1, (%rcx)
+; AVX2-FCP-NEXT: vmovdqa %xmm5, (%r8)
; AVX2-FCP-NEXT: vmovdqa %xmm2, (%r9)
-; AVX2-FCP-NEXT: vzeroupper
; AVX2-FCP-NEXT: retq
;
; AVX512-LABEL: load_i64_stride5_vf2:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512-NEXT: vmovdqa 32(%rdi), %xmm1
-; AVX512-NEXT: vmovdqa 48(%rdi), %xmm2
-; AVX512-NEXT: vmovdqa 64(%rdi), %xmm3
-; AVX512-NEXT: vpblendd {{.*#+}} xmm4 = xmm0[0,1],xmm1[2,3]
-; AVX512-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
-; AVX512-NEXT: vmovaps (%rdi), %ymm2
-; AVX512-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],mem[2,3],ymm2[4,5],mem[6,7]
-; AVX512-NEXT: vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3]
+; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512-NEXT: vmovdqa 32(%rdi), %xmm2
+; AVX512-NEXT: vmovdqa 48(%rdi), %xmm3
+; AVX512-NEXT: vpblendd {{.*#+}} xmm4 = xmm0[0,1],xmm2[2,3]
+; AVX512-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm1[0,1],xmm3[2,3]
+; AVX512-NEXT: vmovdqa 64(%rdi), %xmm5
+; AVX512-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3]
; AVX512-NEXT: vmovdqa %xmm4, (%rsi)
; AVX512-NEXT: vmovdqa %xmm0, (%rdx)
-; AVX512-NEXT: vextractf128 $1, %ymm2, (%rcx)
-; AVX512-NEXT: vmovdqa %xmm5, (%r8)
-; AVX512-NEXT: vmovdqa %xmm1, (%r9)
-; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: vmovdqa %xmm3, (%rcx)
+; AVX512-NEXT: vmovdqa %xmm1, (%r8)
+; AVX512-NEXT: vmovdqa %xmm2, (%r9)
; AVX512-NEXT: retq
;
; AVX512-FCP-LABEL: load_i64_stride5_vf2:
; AVX512-FCP: # %bb.0:
; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm1
-; AVX512-FCP-NEXT: vmovdqa 48(%rdi), %xmm2
-; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm3
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm0[0,1],xmm1[2,3]
-; AVX512-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
-; AVX512-FCP-NEXT: vmovaps (%rdi), %ymm2
-; AVX512-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],mem[2,3],ymm2[4,5],mem[6,7]
-; AVX512-FCP-NEXT: vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3]
+; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm2
+; AVX512-FCP-NEXT: vmovdqa 48(%rdi), %xmm3
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm0[0,1],xmm2[2,3]
+; AVX512-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm1[0,1],xmm3[2,3]
+; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm5
+; AVX512-FCP-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3]
; AVX512-FCP-NEXT: vmovdqa %xmm4, (%rsi)
; AVX512-FCP-NEXT: vmovdqa %xmm0, (%rdx)
-; AVX512-FCP-NEXT: vextractf128 $1, %ymm2, (%rcx)
-; AVX512-FCP-NEXT: vmovdqa %xmm5, (%r8)
-; AVX512-FCP-NEXT: vmovdqa %xmm1, (%r9)
-; AVX512-FCP-NEXT: vzeroupper
+; AVX512-FCP-NEXT: vmovdqa %xmm3, (%rcx)
+; AVX512-FCP-NEXT: vmovdqa %xmm1, (%r8)
+; AVX512-FCP-NEXT: vmovdqa %xmm2, (%r9)
; AVX512-FCP-NEXT: retq
;
; AVX512DQ-LABEL: load_i64_stride5_vf2:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm1
-; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm2
-; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm3
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm4 = xmm0[0,1],xmm1[2,3]
-; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
-; AVX512DQ-NEXT: vmovaps (%rdi), %ymm2
-; AVX512DQ-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],mem[2,3],ymm2[4,5],mem[6,7]
-; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3]
+; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm2
+; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm3
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm4 = xmm0[0,1],xmm2[2,3]
+; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm3 = xmm1[0,1],xmm3[2,3]
+; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm5
+; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3]
; AVX512DQ-NEXT: vmovdqa %xmm4, (%rsi)
; AVX512DQ-NEXT: vmovdqa %xmm0, (%rdx)
-; AVX512DQ-NEXT: vextractf128 $1, %ymm2, (%rcx)
-; AVX512DQ-NEXT: vmovdqa %xmm5, (%r8)
-; AVX512DQ-NEXT: vmovdqa %xmm1, (%r9)
-; AVX512DQ-NEXT: vzeroupper
+; AVX512DQ-NEXT: vmovdqa %xmm3, (%rcx)
+; AVX512DQ-NEXT: vmovdqa %xmm1, (%r8)
+; AVX512DQ-NEXT: vmovdqa %xmm2, (%r9)
; AVX512DQ-NEXT: retq
;
; AVX512DQ-FCP-LABEL: load_i64_stride5_vf2:
; AVX512DQ-FCP: # %bb.0:
; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm1
-; AVX512DQ-FCP-NEXT: vmovdqa 48(%rdi), %xmm2
-; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm3
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm0[0,1],xmm1[2,3]
-; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
-; AVX512DQ-FCP-NEXT: vmovaps (%rdi), %ymm2
-; AVX512DQ-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],mem[2,3],ymm2[4,5],mem[6,7]
-; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3]
+; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm2
+; AVX512DQ-FCP-NEXT: vmovdqa 48(%rdi), %xmm3
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm0[0,1],xmm2[2,3]
+; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm1[0,1],xmm3[2,3]
+; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm5
+; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3]
; AVX512DQ-FCP-NEXT: vmovdqa %xmm4, (%rsi)
; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, (%rdx)
-; AVX512DQ-FCP-NEXT: vextractf128 $1, %ymm2, (%rcx)
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm5, (%r8)
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, (%r9)
-; AVX512DQ-FCP-NEXT: vzeroupper
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, (%rcx)
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, (%r8)
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, (%r9)
; AVX512DQ-FCP-NEXT: retq
;
; AVX512BW-LABEL: load_i64_stride5_vf2:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm1
-; AVX512BW-NEXT: vmovdqa 48(%rdi), %xmm2
-; AVX512BW-NEXT: vmovdqa 64(%rdi), %xmm3
-; AVX512BW-NEXT: vpblendd {{.*#+}} xmm4 = xmm0[0,1],xmm1[2,3]
-; AVX512BW-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
-; AVX512BW-NEXT: vmovaps (%rdi), %ymm2
-; AVX512BW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],mem[2,3],ymm2[4,5],mem[6,7]
-; AVX512BW-NEXT: vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
-; AVX512BW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3]
+; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm2
+; AVX512BW-NEXT: vmovdqa 48(%rdi), %xmm3
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm4 = xmm0[0,1],xmm2[2,3]
+; AVX512BW-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm3 = xmm1[0,1],xmm3[2,3]
+; AVX512BW-NEXT: vmovdqa 64(%rdi), %xmm5
+; AVX512BW-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7]
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3]
; AVX512BW-NEXT: vmovdqa %xmm4, (%rsi)
; AVX512BW-NEXT: vmovdqa %xmm0, (%rdx)
-; AVX512BW-NEXT: vextractf128 $1, %ymm2, (%rcx)
-; AVX512BW-NEXT: vmovdqa %xmm5, (%r8)
-; AVX512BW-NEXT: vmovdqa %xmm1, (%r9)
-; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: vmovdqa %xmm3, (%rcx)
+; AVX512BW-NEXT: vmovdqa %xmm1, (%r8)
+; AVX512BW-NEXT: vmovdqa %xmm2, (%r9)
; AVX512BW-NEXT: retq
;
; AVX512BW-FCP-LABEL: load_i64_stride5_vf2:
; AVX512BW-FCP: # %bb.0:
; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm1
-; AVX512BW-FCP-NEXT: vmovdqa 48(%rdi), %xmm2
-; AVX512BW-FCP-NEXT: vmovdqa 64(%rdi), %xmm3
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm0[0,1],xmm1[2,3]
-; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
-; AVX512BW-FCP-NEXT: vmovaps (%rdi), %ymm2
-; AVX512BW-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],mem[2,3],ymm2[4,5],mem[6,7]
-; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3]
+; AVX512BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm2
+; AVX512BW-FCP-NEXT: vmovdqa 48(%rdi), %xmm3
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm0[0,1],xmm2[2,3]
+; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm1[0,1],xmm3[2,3]
+; AVX512BW-FCP-NEXT: vmovdqa 64(%rdi), %xmm5
+; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7]
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3]
; AVX512BW-FCP-NEXT: vmovdqa %xmm4, (%rsi)
; AVX512BW-FCP-NEXT: vmovdqa %xmm0, (%rdx)
-; AVX512BW-FCP-NEXT: vextractf128 $1, %ymm2, (%rcx)
-; AVX512BW-FCP-NEXT: vmovdqa %xmm5, (%r8)
-; AVX512BW-FCP-NEXT: vmovdqa %xmm1, (%r9)
-; AVX512BW-FCP-NEXT: vzeroupper
+; AVX512BW-FCP-NEXT: vmovdqa %xmm3, (%rcx)
+; AVX512BW-FCP-NEXT: vmovdqa %xmm1, (%r8)
+; AVX512BW-FCP-NEXT: vmovdqa %xmm2, (%r9)
; AVX512BW-FCP-NEXT: retq
;
; AVX512DQ-BW-LABEL: load_i64_stride5_vf2:
; AVX512DQ-BW: # %bb.0:
; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %xmm1
-; AVX512DQ-BW-NEXT: vmovdqa 48(%rdi), %xmm2
-; AVX512DQ-BW-NEXT: vmovdqa 64(%rdi), %xmm3
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm4 = xmm0[0,1],xmm1[2,3]
-; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
-; AVX512DQ-BW-NEXT: vmovaps (%rdi), %ymm2
-; AVX512DQ-BW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],mem[2,3],ymm2[4,5],mem[6,7]
-; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3]
+; AVX512DQ-BW-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %xmm2
+; AVX512DQ-BW-NEXT: vmovdqa 48(%rdi), %xmm3
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm4 = xmm0[0,1],xmm2[2,3]
+; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm3 = xmm1[0,1],xmm3[2,3]
+; AVX512DQ-BW-NEXT: vmovdqa 64(%rdi), %xmm5
+; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7]
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3]
; AVX512DQ-BW-NEXT: vmovdqa %xmm4, (%rsi)
; AVX512DQ-BW-NEXT: vmovdqa %xmm0, (%rdx)
-; AVX512DQ-BW-NEXT: vextractf128 $1, %ymm2, (%rcx)
-; AVX512DQ-BW-NEXT: vmovdqa %xmm5, (%r8)
-; AVX512DQ-BW-NEXT: vmovdqa %xmm1, (%r9)
-; AVX512DQ-BW-NEXT: vzeroupper
+; AVX512DQ-BW-NEXT: vmovdqa %xmm3, (%rcx)
+; AVX512DQ-BW-NEXT: vmovdqa %xmm1, (%r8)
+; AVX512DQ-BW-NEXT: vmovdqa %xmm2, (%r9)
; AVX512DQ-BW-NEXT: retq
;
; AVX512DQ-BW-FCP-LABEL: load_i64_stride5_vf2:
; AVX512DQ-BW-FCP: # %bb.0:
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm1
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 48(%rdi), %xmm2
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rdi), %xmm3
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm0[0,1],xmm1[2,3]
-; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT: vmovaps (%rdi), %ymm2
-; AVX512DQ-BW-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],mem[2,3],ymm2[4,5],mem[6,7]
-; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm2
+; AVX512DQ-BW-FCP-NEXT: vmovdqa 48(%rdi), %xmm3
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm0[0,1],xmm2[2,3]
+; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm1[0,1],xmm3[2,3]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rdi), %xmm5
+; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3]
; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm4, (%rsi)
; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm0, (%rdx)
-; AVX512DQ-BW-FCP-NEXT: vextractf128 $1, %ymm2, (%rcx)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm5, (%r8)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm1, (%r9)
-; AVX512DQ-BW-FCP-NEXT: vzeroupper
+; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm3, (%rcx)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm1, (%r8)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm2, (%r9)
; AVX512DQ-BW-FCP-NEXT: retq
%wide.vec = load <10 x i64>, ptr %in.vec, align 64
%strided.vec0 = shufflevector <10 x i64> %wide.vec, <10 x i64> poison, <2 x i32> <i32 0, i32 5>
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-7.ll
index 8ca0e0cb971861..89642492f83a85 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-7.ll
@@ -6487,7 +6487,7 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-NEXT: vmovaps (%rdi), %ymm0
; AVX2-NEXT: vmovaps (%rsi), %ymm1
; AVX2-NEXT: vmovaps (%rdx), %ymm2
-; AVX2-NEXT: vmovaps (%r8), %ymm13
+; AVX2-NEXT: vmovaps (%r8), %ymm12
; AVX2-NEXT: vmovaps (%r9), %ymm9
; AVX2-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps (%rsi), %xmm3
@@ -6517,9 +6517,10 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm6[1],xmm8[1]
; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7]
; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm13[0],ymm9[0],ymm13[2],ymm9[2]
-; AVX2-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm2[0],mem[0],ymm2[2],mem[2]
-; AVX2-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm6[2,3],ymm4[2,3]
+; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm12[0],ymm9[0],ymm12[2],ymm9[2]
+; AVX2-NEXT: vmovaps 16(%rdx), %xmm6
+; AVX2-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],mem[0]
+; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7]
; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps 32(%rax), %xmm4
; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
@@ -6686,8 +6687,8 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0]
; AVX2-NEXT: vmovaps 160(%rax), %xmm2
; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
-; AVX2-NEXT: vmovaps 160(%rdi), %xmm12
-; AVX2-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm3
+; AVX2-NEXT: vmovaps 160(%rdi), %xmm13
+; AVX2-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm3
; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7]
; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps 160(%rdx), %xmm11
@@ -6706,9 +6707,9 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps 160(%rdx), %ymm0
-; AVX2-NEXT: vmovaps 160(%r8), %ymm9
-; AVX2-NEXT: vmovaps 160(%r9), %ymm8
-; AVX2-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm9[0],ymm8[0],ymm9[2],ymm8[2]
+; AVX2-NEXT: vmovaps 160(%r8), %ymm10
+; AVX2-NEXT: vmovaps 160(%r9), %ymm9
+; AVX2-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm10[0],ymm9[0],ymm10[2],ymm9[2]
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],mem[0],ymm0[2],mem[2]
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3]
; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -6743,8 +6744,9 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps 192(%r8), %ymm1
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[2],mem[2]
-; AVX2-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm0[0],mem[0],ymm0[2],mem[2]
-; AVX2-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm6[2,3],ymm1[2,3]
+; AVX2-NEXT: vmovaps 208(%rdx), %xmm6
+; AVX2-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],mem[0]
+; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3]
@@ -6770,8 +6772,8 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps 224(%rsi), %ymm3
; AVX2-NEXT: vmovddup {{.*#+}} xmm6 = mem[0,0]
-; AVX2-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
-; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm10[4,5,6,7]
+; AVX2-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
+; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm3[1],ymm0[3],ymm3[3]
; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[0,2,3,3]
@@ -6785,8 +6787,8 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 16-byte Folded Reload
; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5],ymm3[6,7]
; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm3 # 32-byte Folded Reload
-; AVX2-NEXT: # ymm3 = ymm13[1],mem[1],ymm13[3],mem[3]
+; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm3 # 32-byte Folded Reload
+; AVX2-NEXT: # ymm3 = ymm12[1],mem[1],ymm12[3],mem[3]
; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,3,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7]
; AVX2-NEXT: vbroadcastsd 24(%rcx), %ymm6
@@ -6824,14 +6826,14 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],mem[0]
; AVX2-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 16-byte Folded Reload
; AVX2-NEXT: vbroadcastsd 96(%rcx), %ymm6
-; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm3[0,1,2,3,4,5],ymm6[6,7]
+; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm3[0,1,2,3,4,5],ymm6[6,7]
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
; AVX2-NEXT: # ymm3 = ymm3[1],mem[1],ymm3[3],mem[3]
; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,3,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7]
; AVX2-NEXT: vbroadcastsd 120(%rcx), %ymm6
-; AVX2-NEXT: vblendps {{.*#+}} ymm10 = ymm6[0,1],ymm3[2,3,4,5,6,7]
+; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm6[0,1],ymm3[2,3,4,5,6,7]
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; AVX2-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],mem[0]
; AVX2-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 16-byte Folded Reload
@@ -6843,15 +6845,15 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7]
; AVX2-NEXT: vbroadcastsd 152(%rcx), %ymm15
; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm15[0,1],ymm3[2,3,4,5,6,7]
-; AVX2-NEXT: vunpcklpd {{.*#+}} xmm12 = xmm12[0],mem[0]
-; AVX2-NEXT: vinsertf128 $1, %xmm11, %ymm12, %ymm11
-; AVX2-NEXT: vbroadcastsd 160(%rcx), %ymm12
-; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm12[6,7]
-; AVX2-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm9[1],ymm8[1],ymm9[3],ymm8[3]
-; AVX2-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,2,3,3]
-; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7]
-; AVX2-NEXT: vbroadcastsd 184(%rcx), %ymm9
-; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3,4,5,6,7]
+; AVX2-NEXT: vunpcklpd {{.*#+}} xmm13 = xmm13[0],mem[0]
+; AVX2-NEXT: vinsertf128 $1, %xmm11, %ymm13, %ymm11
+; AVX2-NEXT: vbroadcastsd 160(%rcx), %ymm13
+; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm13[6,7]
+; AVX2-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm10[1],ymm9[1],ymm10[3],ymm9[3]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,2,3,3]
+; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7]
+; AVX2-NEXT: vbroadcastsd 184(%rcx), %ymm10
+; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1],ymm9[2,3,4,5,6,7]
; AVX2-NEXT: vunpcklpd {{.*#+}} xmm5 = xmm5[0],mem[0]
; AVX2-NEXT: vinsertf128 $1, %xmm7, %ymm5, %ymm5
; AVX2-NEXT: vbroadcastsd %xmm4, %ymm4
@@ -6874,15 +6876,15 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-NEXT: vbroadcastsd 248(%r9), %ymm7
; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm7[4,5,6,7]
; AVX2-NEXT: vmovaps 224(%rax), %ymm7
-; AVX2-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm9 # 32-byte Folded Reload
-; AVX2-NEXT: # ymm9 = mem[0,1],ymm7[2,3],mem[4,5,6,7]
+; AVX2-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm10 # 32-byte Folded Reload
+; AVX2-NEXT: # ymm10 = mem[0,1],ymm7[2,3],mem[4,5,6,7]
; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm7[6,7]
; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rcx
; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],mem[6,7]
; AVX2-NEXT: vmovaps %ymm2, 1760(%rcx)
; AVX2-NEXT: vmovaps %ymm14, 1728(%rcx)
; AVX2-NEXT: vmovaps %ymm0, 1696(%rcx)
-; AVX2-NEXT: vmovaps %ymm9, 1664(%rcx)
+; AVX2-NEXT: vmovaps %ymm10, 1664(%rcx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 1632(%rcx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
@@ -6900,7 +6902,7 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-NEXT: vmovaps %ymm4, 1376(%rcx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 1344(%rcx)
-; AVX2-NEXT: vmovaps %ymm8, 1312(%rcx)
+; AVX2-NEXT: vmovaps %ymm9, 1312(%rcx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 1280(%rcx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
@@ -6924,7 +6926,7 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-NEXT: vmovaps %ymm6, 928(%rcx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 896(%rcx)
-; AVX2-NEXT: vmovaps %ymm10, 864(%rcx)
+; AVX2-NEXT: vmovaps %ymm8, 864(%rcx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 832(%rcx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
@@ -6935,7 +6937,7 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-NEXT: vmovaps %ymm0, 736(%rcx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 704(%rcx)
-; AVX2-NEXT: vmovaps %ymm13, 672(%rcx)
+; AVX2-NEXT: vmovaps %ymm12, 672(%rcx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 640(%rcx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
@@ -6989,7 +6991,7 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-FP-NEXT: vmovaps (%rdi), %ymm0
; AVX2-FP-NEXT: vmovaps (%rsi), %ymm1
; AVX2-FP-NEXT: vmovaps (%rdx), %ymm2
-; AVX2-FP-NEXT: vmovaps (%r8), %ymm13
+; AVX2-FP-NEXT: vmovaps (%r8), %ymm12
; AVX2-FP-NEXT: vmovaps (%r9), %ymm9
; AVX2-FP-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps (%rsi), %xmm3
@@ -7019,9 +7021,10 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm6[1],xmm8[1]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7]
; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm13[0],ymm9[0],ymm13[2],ymm9[2]
-; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm2[0],mem[0],ymm2[2],mem[2]
-; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm6[2,3],ymm4[2,3]
+; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm12[0],ymm9[0],ymm12[2],ymm9[2]
+; AVX2-FP-NEXT: vmovaps 16(%rdx), %xmm6
+; AVX2-FP-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],mem[0]
+; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7]
; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps 32(%rax), %xmm4
; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
@@ -7188,8 +7191,8 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-FP-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0]
; AVX2-FP-NEXT: vmovaps 160(%rax), %xmm2
; AVX2-FP-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
-; AVX2-FP-NEXT: vmovaps 160(%rdi), %xmm12
-; AVX2-FP-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm3
+; AVX2-FP-NEXT: vmovaps 160(%rdi), %xmm13
+; AVX2-FP-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm3
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7]
; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps 160(%rdx), %xmm11
@@ -7208,9 +7211,9 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps 160(%rdx), %ymm0
-; AVX2-FP-NEXT: vmovaps 160(%r8), %ymm9
-; AVX2-FP-NEXT: vmovaps 160(%r9), %ymm8
-; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm9[0],ymm8[0],ymm9[2],ymm8[2]
+; AVX2-FP-NEXT: vmovaps 160(%r8), %ymm10
+; AVX2-FP-NEXT: vmovaps 160(%r9), %ymm9
+; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm10[0],ymm9[0],ymm10[2],ymm9[2]
; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],mem[0],ymm0[2],mem[2]
; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3]
; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -7245,8 +7248,9 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps 192(%r8), %ymm1
; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[2],mem[2]
-; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm0[0],mem[0],ymm0[2],mem[2]
-; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm6[2,3],ymm1[2,3]
+; AVX2-FP-NEXT: vmovaps 208(%rdx), %xmm6
+; AVX2-FP-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],mem[0]
+; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3]
@@ -7272,8 +7276,8 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps 224(%rsi), %ymm3
; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm6 = mem[0,0]
-; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
-; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm10[4,5,6,7]
+; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
+; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm3[1],ymm0[3],ymm3[3]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[0,2,3,3]
@@ -7287,8 +7291,8 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 16-byte Folded Reload
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5],ymm3[6,7]
; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm3 # 32-byte Folded Reload
-; AVX2-FP-NEXT: # ymm3 = ymm13[1],mem[1],ymm13[3],mem[3]
+; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm3 # 32-byte Folded Reload
+; AVX2-FP-NEXT: # ymm3 = ymm12[1],mem[1],ymm12[3],mem[3]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,3,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7]
; AVX2-FP-NEXT: vbroadcastsd 24(%rcx), %ymm6
@@ -7326,14 +7330,14 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-FP-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],mem[0]
; AVX2-FP-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 16-byte Folded Reload
; AVX2-FP-NEXT: vbroadcastsd 96(%rcx), %ymm6
-; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm3[0,1,2,3,4,5],ymm6[6,7]
+; AVX2-FP-NEXT: vblendps {{.*#+}} ymm12 = ymm3[0,1,2,3,4,5],ymm6[6,7]
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm3 = ymm3[1],mem[1],ymm3[3],mem[3]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,3,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7]
; AVX2-FP-NEXT: vbroadcastsd 120(%rcx), %ymm6
-; AVX2-FP-NEXT: vblendps {{.*#+}} ymm10 = ymm6[0,1],ymm3[2,3,4,5,6,7]
+; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm6[0,1],ymm3[2,3,4,5,6,7]
; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; AVX2-FP-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],mem[0]
; AVX2-FP-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 16-byte Folded Reload
@@ -7345,15 +7349,15 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7]
; AVX2-FP-NEXT: vbroadcastsd 152(%rcx), %ymm15
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm15[0,1],ymm3[2,3,4,5,6,7]
-; AVX2-FP-NEXT: vunpcklpd {{.*#+}} xmm12 = xmm12[0],mem[0]
-; AVX2-FP-NEXT: vinsertf128 $1, %xmm11, %ymm12, %ymm11
-; AVX2-FP-NEXT: vbroadcastsd 160(%rcx), %ymm12
-; AVX2-FP-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm12[6,7]
-; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm9[1],ymm8[1],ymm9[3],ymm8[3]
-; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,2,3,3]
-; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7]
-; AVX2-FP-NEXT: vbroadcastsd 184(%rcx), %ymm9
-; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3,4,5,6,7]
+; AVX2-FP-NEXT: vunpcklpd {{.*#+}} xmm13 = xmm13[0],mem[0]
+; AVX2-FP-NEXT: vinsertf128 $1, %xmm11, %ymm13, %ymm11
+; AVX2-FP-NEXT: vbroadcastsd 160(%rcx), %ymm13
+; AVX2-FP-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm13[6,7]
+; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm10[1],ymm9[1],ymm10[3],ymm9[3]
+; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,2,3,3]
+; AVX2-FP-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7]
+; AVX2-FP-NEXT: vbroadcastsd 184(%rcx), %ymm10
+; AVX2-FP-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1],ymm9[2,3,4,5,6,7]
; AVX2-FP-NEXT: vunpcklpd {{.*#+}} xmm5 = xmm5[0],mem[0]
; AVX2-FP-NEXT: vinsertf128 $1, %xmm7, %ymm5, %ymm5
; AVX2-FP-NEXT: vbroadcastsd %xmm4, %ymm4
@@ -7376,15 +7380,15 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-FP-NEXT: vbroadcastsd 248(%r9), %ymm7
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm7[4,5,6,7]
; AVX2-FP-NEXT: vmovaps 224(%rax), %ymm7
-; AVX2-FP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm9 # 32-byte Folded Reload
-; AVX2-FP-NEXT: # ymm9 = mem[0,1],ymm7[2,3],mem[4,5,6,7]
+; AVX2-FP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm10 # 32-byte Folded Reload
+; AVX2-FP-NEXT: # ymm10 = mem[0,1],ymm7[2,3],mem[4,5,6,7]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm7[6,7]
; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rcx
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],mem[6,7]
; AVX2-FP-NEXT: vmovaps %ymm2, 1760(%rcx)
; AVX2-FP-NEXT: vmovaps %ymm14, 1728(%rcx)
; AVX2-FP-NEXT: vmovaps %ymm0, 1696(%rcx)
-; AVX2-FP-NEXT: vmovaps %ymm9, 1664(%rcx)
+; AVX2-FP-NEXT: vmovaps %ymm10, 1664(%rcx)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm0, 1632(%rcx)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
@@ -7402,7 +7406,7 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-FP-NEXT: vmovaps %ymm4, 1376(%rcx)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm0, 1344(%rcx)
-; AVX2-FP-NEXT: vmovaps %ymm8, 1312(%rcx)
+; AVX2-FP-NEXT: vmovaps %ymm9, 1312(%rcx)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm0, 1280(%rcx)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
@@ -7426,7 +7430,7 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-FP-NEXT: vmovaps %ymm6, 928(%rcx)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm0, 896(%rcx)
-; AVX2-FP-NEXT: vmovaps %ymm10, 864(%rcx)
+; AVX2-FP-NEXT: vmovaps %ymm8, 864(%rcx)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm0, 832(%rcx)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
@@ -7437,7 +7441,7 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-FP-NEXT: vmovaps %ymm0, 736(%rcx)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm0, 704(%rcx)
-; AVX2-FP-NEXT: vmovaps %ymm13, 672(%rcx)
+; AVX2-FP-NEXT: vmovaps %ymm12, 672(%rcx)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm0, 640(%rcx)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
@@ -7491,7 +7495,7 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm0
; AVX2-FCP-NEXT: vmovaps (%rsi), %ymm1
; AVX2-FCP-NEXT: vmovaps (%rdx), %ymm2
-; AVX2-FCP-NEXT: vmovaps (%r8), %ymm13
+; AVX2-FCP-NEXT: vmovaps (%r8), %ymm12
; AVX2-FCP-NEXT: vmovaps (%r9), %ymm9
; AVX2-FCP-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps (%rsi), %xmm3
@@ -7521,9 +7525,10 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm6[1],xmm8[1]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm13[0],ymm9[0],ymm13[2],ymm9[2]
-; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm2[0],mem[0],ymm2[2],mem[2]
-; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm6[2,3],ymm4[2,3]
+; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm12[0],ymm9[0],ymm12[2],ymm9[2]
+; AVX2-FCP-NEXT: vmovaps 16(%rdx), %xmm6
+; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],mem[0]
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 32(%rax), %xmm4
; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
@@ -7690,8 +7695,8 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0]
; AVX2-FCP-NEXT: vmovaps 160(%rax), %xmm2
; AVX2-FCP-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
-; AVX2-FCP-NEXT: vmovaps 160(%rdi), %xmm12
-; AVX2-FCP-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm3
+; AVX2-FCP-NEXT: vmovaps 160(%rdi), %xmm13
+; AVX2-FCP-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm3
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7]
; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 160(%rdx), %xmm11
@@ -7710,9 +7715,9 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 160(%rdx), %ymm0
-; AVX2-FCP-NEXT: vmovaps 160(%r8), %ymm9
-; AVX2-FCP-NEXT: vmovaps 160(%r9), %ymm8
-; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm9[0],ymm8[0],ymm9[2],ymm8[2]
+; AVX2-FCP-NEXT: vmovaps 160(%r8), %ymm10
+; AVX2-FCP-NEXT: vmovaps 160(%r9), %ymm9
+; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm10[0],ymm9[0],ymm10[2],ymm9[2]
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],mem[0],ymm0[2],mem[2]
; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3]
; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -7747,8 +7752,9 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 192(%r8), %ymm1
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[2],mem[2]
-; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm0[0],mem[0],ymm0[2],mem[2]
-; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm6[2,3],ymm1[2,3]
+; AVX2-FCP-NEXT: vmovaps 208(%rdx), %xmm6
+; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],mem[0]
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3]
@@ -7774,8 +7780,8 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 224(%rsi), %ymm3
; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm6 = mem[0,0]
-; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
-; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm10[4,5,6,7]
+; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm3[1],ymm0[3],ymm3[3]
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[0,2,3,3]
@@ -7789,8 +7795,8 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 16-byte Folded Reload
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5],ymm3[6,7]
; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm3 # 32-byte Folded Reload
-; AVX2-FCP-NEXT: # ymm3 = ymm13[1],mem[1],ymm13[3],mem[3]
+; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm3 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: # ymm3 = ymm12[1],mem[1],ymm12[3],mem[3]
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,3,3]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7]
; AVX2-FCP-NEXT: vbroadcastsd 24(%rcx), %ymm6
@@ -7828,14 +7834,14 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],mem[0]
; AVX2-FCP-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 16-byte Folded Reload
; AVX2-FCP-NEXT: vbroadcastsd 96(%rcx), %ymm6
-; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm13 = ymm3[0,1,2,3,4,5],ymm6[6,7]
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm12 = ymm3[0,1,2,3,4,5],ymm6[6,7]
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm3 = ymm3[1],mem[1],ymm3[3],mem[3]
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,3,3]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7]
; AVX2-FCP-NEXT: vbroadcastsd 120(%rcx), %ymm6
-; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm6[0,1],ymm3[2,3,4,5,6,7]
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm6[0,1],ymm3[2,3,4,5,6,7]
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],mem[0]
; AVX2-FCP-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 16-byte Folded Reload
@@ -7847,15 +7853,15 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7]
; AVX2-FCP-NEXT: vbroadcastsd 152(%rcx), %ymm15
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm15[0,1],ymm3[2,3,4,5,6,7]
-; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} xmm12 = xmm12[0],mem[0]
-; AVX2-FCP-NEXT: vinsertf128 $1, %xmm11, %ymm12, %ymm11
-; AVX2-FCP-NEXT: vbroadcastsd 160(%rcx), %ymm12
-; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm12[6,7]
-; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm9[1],ymm8[1],ymm9[3],ymm8[3]
-; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,2,3,3]
-; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7]
-; AVX2-FCP-NEXT: vbroadcastsd 184(%rcx), %ymm9
-; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3,4,5,6,7]
+; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} xmm13 = xmm13[0],mem[0]
+; AVX2-FCP-NEXT: vinsertf128 $1, %xmm11, %ymm13, %ymm11
+; AVX2-FCP-NEXT: vbroadcastsd 160(%rcx), %ymm13
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm13[6,7]
+; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm10[1],ymm9[1],ymm10[3],ymm9[3]
+; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,2,3,3]
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7]
+; AVX2-FCP-NEXT: vbroadcastsd 184(%rcx), %ymm10
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1],ymm9[2,3,4,5,6,7]
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} xmm5 = xmm5[0],mem[0]
; AVX2-FCP-NEXT: vinsertf128 $1, %xmm7, %ymm5, %ymm5
; AVX2-FCP-NEXT: vbroadcastsd %xmm4, %ymm4
@@ -7878,15 +7884,15 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-FCP-NEXT: vbroadcastsd 248(%r9), %ymm7
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm7[4,5,6,7]
; AVX2-FCP-NEXT: vmovaps 224(%rax), %ymm7
-; AVX2-FCP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm9 # 32-byte Folded Reload
-; AVX2-FCP-NEXT: # ymm9 = mem[0,1],ymm7[2,3],mem[4,5,6,7]
+; AVX2-FCP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm10 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: # ymm10 = mem[0,1],ymm7[2,3],mem[4,5,6,7]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm7[6,7]
; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rcx
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],mem[6,7]
; AVX2-FCP-NEXT: vmovaps %ymm2, 1760(%rcx)
; AVX2-FCP-NEXT: vmovaps %ymm14, 1728(%rcx)
; AVX2-FCP-NEXT: vmovaps %ymm0, 1696(%rcx)
-; AVX2-FCP-NEXT: vmovaps %ymm9, 1664(%rcx)
+; AVX2-FCP-NEXT: vmovaps %ymm10, 1664(%rcx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 1632(%rcx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
@@ -7904,7 +7910,7 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-FCP-NEXT: vmovaps %ymm4, 1376(%rcx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 1344(%rcx)
-; AVX2-FCP-NEXT: vmovaps %ymm8, 1312(%rcx)
+; AVX2-FCP-NEXT: vmovaps %ymm9, 1312(%rcx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 1280(%rcx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
@@ -7928,7 +7934,7 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-FCP-NEXT: vmovaps %ymm6, 928(%rcx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 896(%rcx)
-; AVX2-FCP-NEXT: vmovaps %ymm10, 864(%rcx)
+; AVX2-FCP-NEXT: vmovaps %ymm8, 864(%rcx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 832(%rcx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
@@ -7939,7 +7945,7 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-FCP-NEXT: vmovaps %ymm0, 736(%rcx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 704(%rcx)
-; AVX2-FCP-NEXT: vmovaps %ymm13, 672(%rcx)
+; AVX2-FCP-NEXT: vmovaps %ymm12, 672(%rcx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 640(%rcx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
@@ -13822,8 +13828,9 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm5[0],ymm6[0],ymm5[2],ymm6[2]
-; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],mem[0],ymm2[2],mem[2]
-; AVX2-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3]
+; AVX2-NEXT: vmovaps 16(%rdx), %xmm4
+; AVX2-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm4[0],mem[0]
+; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps 16(%rax), %xmm3
; AVX2-NEXT: vmovaps 32(%rax), %xmm4
@@ -14055,13 +14062,14 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7]
; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps 192(%r8), %ymm4
+; AVX2-NEXT: vmovaps 208(%rdx), %xmm1
+; AVX2-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; AVX2-NEXT: vmovaps 192(%r8), %ymm5
+; AVX2-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps 192(%r9), %ymm4
; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps 192(%r9), %ymm1
-; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm4[0],ymm1[0],ymm4[2],ymm1[2]
-; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],mem[0],ymm0[2],mem[2]
-; AVX2-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3]
+; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm5[0],ymm4[0],ymm5[2],ymm4[2]
+; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7]
; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3]
@@ -14135,13 +14143,14 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7]
; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps 256(%r8), %ymm4
+; AVX2-NEXT: vmovaps 272(%rdx), %xmm1
+; AVX2-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; AVX2-NEXT: vmovaps 256(%r8), %ymm5
+; AVX2-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps 256(%r9), %ymm4
; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps 256(%r9), %ymm1
-; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm4[0],ymm1[0],ymm4[2],ymm1[2]
-; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],mem[0],ymm0[2],mem[2]
-; AVX2-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3]
+; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm5[0],ymm4[0],ymm5[2],ymm4[2]
+; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7]
; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3]
@@ -14214,12 +14223,13 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7]
; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps 336(%rdx), %xmm1
+; AVX2-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0]
; AVX2-NEXT: vmovaps 320(%r8), %ymm9
-; AVX2-NEXT: vmovaps 320(%r9), %ymm1
-; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm9[0],ymm1[0],ymm9[2],ymm1[2]
-; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],mem[0],ymm0[2],mem[2]
-; AVX2-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3]
+; AVX2-NEXT: vmovaps 320(%r9), %ymm4
+; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm9[0],ymm4[0],ymm9[2],ymm4[2]
+; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7]
; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3]
@@ -14291,12 +14301,13 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7]
; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps 400(%rdx), %xmm1
+; AVX2-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0]
; AVX2-NEXT: vmovaps 384(%r8), %ymm15
-; AVX2-NEXT: vmovaps 384(%r9), %ymm1
-; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm15[0],ymm1[0],ymm15[2],ymm1[2]
-; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],mem[0],ymm0[2],mem[2]
-; AVX2-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3]
+; AVX2-NEXT: vmovaps 384(%r9), %ymm4
+; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm15[0],ymm4[0],ymm15[2],ymm4[2]
+; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7]
; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3]
@@ -14862,8 +14873,9 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm5[0],ymm6[0],ymm5[2],ymm6[2]
-; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],mem[0],ymm2[2],mem[2]
-; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3]
+; AVX2-FP-NEXT: vmovaps 16(%rdx), %xmm4
+; AVX2-FP-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm4[0],mem[0]
+; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovaps 16(%rax), %xmm3
; AVX2-FP-NEXT: vmovaps 32(%rax), %xmm4
@@ -15095,13 +15107,14 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7]
; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT: vmovaps 192(%r8), %ymm4
+; AVX2-FP-NEXT: vmovaps 208(%rdx), %xmm1
+; AVX2-FP-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; AVX2-FP-NEXT: vmovaps 192(%r8), %ymm5
+; AVX2-FP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT: vmovaps 192(%r9), %ymm4
; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT: vmovaps 192(%r9), %ymm1
-; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm4[0],ymm1[0],ymm4[2],ymm1[2]
-; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],mem[0],ymm0[2],mem[2]
-; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3]
+; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm5[0],ymm4[0],ymm5[2],ymm4[2]
+; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7]
; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3]
@@ -15175,13 +15188,14 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7]
; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT: vmovaps 256(%r8), %ymm4
+; AVX2-FP-NEXT: vmovaps 272(%rdx), %xmm1
+; AVX2-FP-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; AVX2-FP-NEXT: vmovaps 256(%r8), %ymm5
+; AVX2-FP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT: vmovaps 256(%r9), %ymm4
; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT: vmovaps 256(%r9), %ymm1
-; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm4[0],ymm1[0],ymm4[2],ymm1[2]
-; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],mem[0],ymm0[2],mem[2]
-; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3]
+; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm5[0],ymm4[0],ymm5[2],ymm4[2]
+; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7]
; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3]
@@ -15254,12 +15268,13 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7]
; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT: vmovaps 336(%rdx), %xmm1
+; AVX2-FP-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0]
; AVX2-FP-NEXT: vmovaps 320(%r8), %ymm9
-; AVX2-FP-NEXT: vmovaps 320(%r9), %ymm1
-; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm9[0],ymm1[0],ymm9[2],ymm1[2]
-; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],mem[0],ymm0[2],mem[2]
-; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3]
+; AVX2-FP-NEXT: vmovaps 320(%r9), %ymm4
+; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm9[0],ymm4[0],ymm9[2],ymm4[2]
+; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7]
; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3]
@@ -15331,12 +15346,13 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7]
; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT: vmovaps 400(%rdx), %xmm1
+; AVX2-FP-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0]
; AVX2-FP-NEXT: vmovaps 384(%r8), %ymm15
-; AVX2-FP-NEXT: vmovaps 384(%r9), %ymm1
-; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm15[0],ymm1[0],ymm15[2],ymm1[2]
-; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],mem[0],ymm0[2],mem[2]
-; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3]
+; AVX2-FP-NEXT: vmovaps 384(%r9), %ymm4
+; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm15[0],ymm4[0],ymm15[2],ymm4[2]
+; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7]
; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3]
@@ -15902,8 +15918,9 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm5[0],ymm6[0],ymm5[2],ymm6[2]
-; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],mem[0],ymm2[2],mem[2]
-; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3]
+; AVX2-FCP-NEXT: vmovaps 16(%rdx), %xmm4
+; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm4[0],mem[0]
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 16(%rax), %xmm3
; AVX2-FCP-NEXT: vmovaps 32(%rax), %xmm4
@@ -16135,13 +16152,14 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vmovaps 192(%r8), %ymm4
+; AVX2-FCP-NEXT: vmovaps 208(%rdx), %xmm1
+; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; AVX2-FCP-NEXT: vmovaps 192(%r8), %ymm5
+; AVX2-FCP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vmovaps 192(%r9), %ymm4
; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vmovaps 192(%r9), %ymm1
-; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm4[0],ymm1[0],ymm4[2],ymm1[2]
-; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],mem[0],ymm0[2],mem[2]
-; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3]
+; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm5[0],ymm4[0],ymm5[2],ymm4[2]
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3]
@@ -16215,13 +16233,14 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vmovaps 256(%r8), %ymm4
+; AVX2-FCP-NEXT: vmovaps 272(%rdx), %xmm1
+; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; AVX2-FCP-NEXT: vmovaps 256(%r8), %ymm5
+; AVX2-FCP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vmovaps 256(%r9), %ymm4
; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vmovaps 256(%r9), %ymm1
-; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm4[0],ymm1[0],ymm4[2],ymm1[2]
-; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],mem[0],ymm0[2],mem[2]
-; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3]
+; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm5[0],ymm4[0],ymm5[2],ymm4[2]
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3]
@@ -16294,12 +16313,13 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vmovaps 336(%rdx), %xmm1
+; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0]
; AVX2-FCP-NEXT: vmovaps 320(%r8), %ymm9
-; AVX2-FCP-NEXT: vmovaps 320(%r9), %ymm1
-; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm9[0],ymm1[0],ymm9[2],ymm1[2]
-; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],mem[0],ymm0[2],mem[2]
-; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3]
+; AVX2-FCP-NEXT: vmovaps 320(%r9), %ymm4
+; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm9[0],ymm4[0],ymm9[2],ymm4[2]
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3]
@@ -16371,12 +16391,13 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vmovaps 400(%rdx), %xmm1
+; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0]
; AVX2-FCP-NEXT: vmovaps 384(%r8), %ymm15
-; AVX2-FCP-NEXT: vmovaps 384(%r9), %ymm1
-; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm15[0],ymm1[0],ymm15[2],ymm1[2]
-; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],mem[0],ymm0[2],mem[2]
-; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3]
+; AVX2-FCP-NEXT: vmovaps 384(%r9), %ymm4
+; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm15[0],ymm4[0],ymm15[2],ymm4[2]
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3]
diff --git a/llvm/test/CodeGen/X86/x86-interleaved-access.ll b/llvm/test/CodeGen/X86/x86-interleaved-access.ll
index cb038b3211abd1..49947eddc61b9d 100644
--- a/llvm/test/CodeGen/X86/x86-interleaved-access.ll
+++ b/llvm/test/CodeGen/X86/x86-interleaved-access.ll
@@ -1890,19 +1890,11 @@ define void @splat4_v4i64_load_store(ptr %s, ptr %d) nounwind {
}
define <2 x i64> @PR37616(ptr %a0) nounwind {
-; AVX1-LABEL: PR37616:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps 16(%rdi), %xmm0
-; AVX1-NEXT: vunpcklpd 48(%rdi), %xmm0, %xmm0 # xmm0 = xmm0[0],mem[0]
-; AVX1-NEXT: retq
-;
-; AVX2OR512-LABEL: PR37616:
-; AVX2OR512: # %bb.0:
-; AVX2OR512-NEXT: vmovaps (%rdi), %ymm0
-; AVX2OR512-NEXT: vunpcklpd 32(%rdi), %ymm0, %ymm0 # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2]
-; AVX2OR512-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX2OR512-NEXT: vzeroupper
-; AVX2OR512-NEXT: retq
+; AVX-LABEL: PR37616:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovaps 16(%rdi), %xmm0
+; AVX-NEXT: vunpcklpd 48(%rdi), %xmm0, %xmm0 # xmm0 = xmm0[0],mem[0]
+; AVX-NEXT: retq
%load = load <16 x i64>, ptr %a0, align 128
%shuffle = shufflevector <16 x i64> %load, <16 x i64> undef, <2 x i32> <i32 2, i32 6>
ret <2 x i64> %shuffle