[llvm] [DAG] Retain original alignment in bitcast(load(x)) -> load(x) fold (PR #75922)
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Tue Dec 19 03:48:10 PST 2023
https://github.com/RKSimon created https://github.com/llvm/llvm-project/pull/75922
Pulled out of #75626 to allow it to focus on atomic loads
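The distinction being relied on here: on a MemSDNode, getAlign() is the alignment of the particular access (the memory operand's base alignment reduced to account for the operand's offset), while getOriginalAlign() is that base alignment itself, so the rebuilt load keeps the alignment the original load was created with - the "retain original alignment" in the title. A minimal standalone sketch of the alignment arithmetic, roughly mirroring llvm::MinAlign/commonAlignment (BaseAlign and Offset below are made-up example values, not taken from the patch):

  #include <cstdint>
  #include <iostream>

  // Largest power of two dividing both the base alignment and the offset,
  // i.e. the alignment that can be guaranteed for an access at base + offset.
  static uint64_t accessAlignment(uint64_t BaseAlign, uint64_t Offset) {
    if (Offset == 0)
      return BaseAlign;
    uint64_t OffsetAlign = Offset & (0 - Offset); // lowest set bit of Offset
    return BaseAlign < OffsetAlign ? BaseAlign : OffsetAlign;
  }

  int main() {
    uint64_t BaseAlign = 16; // alignment recorded on the original load
    uint64_t Offset = 8;     // offset carried by the memory operand
    std::cout << "access alignment:   " << accessAlignment(BaseAlign, Offset) << '\n'; // 8
    std::cout << "original alignment: " << BaseAlign << '\n';                          // 16
    return 0;
  }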
From 1f6f2f89a17f3a5037feaee5a31640645c92e3f8 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Tue, 19 Dec 2023 11:46:56 +0000
Subject: [PATCH] [DAG] Retain original alignment in bitcast(load(x)) ->
load(x) fold
Pulled out of #75626 to allow it to focus on atomic loads
---
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 2 +-
...d_vector_inreg_of_broadcast_from_memory.ll | 8 +-
.../vector-interleaved-load-i16-stride-2.ll | 160 +++++++-------
.../vector-interleaved-load-i16-stride-4.ll | 204 +++++++++---------
.../vector-interleaved-load-i32-stride-2.ll | 90 ++++----
.../vector-interleaved-store-i64-stride-4.ll | 32 +--
.../vector-interleaved-store-i64-stride-5.ll | 12 +-
.../vector-interleaved-store-i64-stride-8.ll | 10 +-
8 files changed, 260 insertions(+), 258 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 5b7629fc8cbe83..3f73eb3a3424a3 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -15165,7 +15165,7 @@ SDValue DAGCombiner::visitBITCAST(SDNode *N) {
*LN0->getMemOperand())) {
SDValue Load =
DAG.getLoad(VT, SDLoc(N), LN0->getChain(), LN0->getBasePtr(),
- LN0->getPointerInfo(), LN0->getAlign(),
+ LN0->getPointerInfo(), LN0->getOriginalAlign(),
LN0->getMemOperand()->getFlags(), LN0->getAAInfo());
DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Load.getValue(1));
return Load;
diff --git a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll
index 6d5f8a78cb1d70..99d9f6b41e70dd 100644
--- a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll
+++ b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll
@@ -3647,10 +3647,10 @@ define void @vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3(ptr %i
; AVX-NEXT: vmovdqa (%rdi), %xmm2
; AVX-NEXT: vmovdqa 16(%rdi), %xmm3
; AVX-NEXT: vpaddb 48(%rsi), %xmm3, %xmm3
-; AVX-NEXT: vpaddb 32(%rsi), %xmm2, %xmm2
; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
-; AVX-NEXT: vmovdqa %xmm0, 16(%rdx)
+; AVX-NEXT: vpaddb 32(%rsi), %xmm2, %xmm2
; AVX-NEXT: vmovdqa %xmm2, 32(%rdx)
+; AVX-NEXT: vmovdqa %xmm0, 16(%rdx)
; AVX-NEXT: vmovdqa %xmm3, 48(%rdx)
; AVX-NEXT: vmovdqa %xmm1, (%rdx)
; AVX-NEXT: vzeroupper
@@ -3833,10 +3833,10 @@ define void @vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3(ptr %i
; AVX-NEXT: vmovdqa (%rdi), %xmm2
; AVX-NEXT: vmovdqa 16(%rdi), %xmm3
; AVX-NEXT: vpaddb 48(%rsi), %xmm3, %xmm3
-; AVX-NEXT: vpaddb 32(%rsi), %xmm2, %xmm2
; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
-; AVX-NEXT: vmovdqa %xmm0, 16(%rdx)
+; AVX-NEXT: vpaddb 32(%rsi), %xmm2, %xmm2
; AVX-NEXT: vmovdqa %xmm2, 32(%rdx)
+; AVX-NEXT: vmovdqa %xmm0, 16(%rdx)
; AVX-NEXT: vmovdqa %xmm3, 48(%rdx)
; AVX-NEXT: vmovdqa %xmm1, (%rdx)
; AVX-NEXT: vzeroupper
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll
index eeea912a56a69a..04fd6d9300c18d 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll
@@ -501,136 +501,138 @@ define void @load_i16_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no
; SSE-LABEL: load_i16_stride2_vf64:
; SSE: # %bb.0:
; SSE-NEXT: subq $40, %rsp
-; SSE-NEXT: movdqa 96(%rdi), %xmm13
-; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa 112(%rdi), %xmm3
-; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa 128(%rdi), %xmm11
-; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa 144(%rdi), %xmm2
+; SSE-NEXT: movdqa 160(%rdi), %xmm14
+; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa 176(%rdi), %xmm2
; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa 160(%rdi), %xmm10
-; SSE-NEXT: movdqa %xmm10, (%rsp) # 16-byte Spill
-; SSE-NEXT: movdqa 176(%rdi), %xmm4
-; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa (%rdi), %xmm9
-; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa 16(%rdi), %xmm1
+; SSE-NEXT: movdqa 64(%rdi), %xmm11
+; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa 80(%rdi), %xmm1
; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa 32(%rdi), %xmm12
-; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa 48(%rdi), %xmm14
-; SSE-NEXT: movdqa %xmm14, %xmm0
+; SSE-NEXT: movdqa 96(%rdi), %xmm9
+; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa 112(%rdi), %xmm4
+; SSE-NEXT: movdqa %xmm4, (%rsp) # 16-byte Spill
+; SSE-NEXT: movdqa (%rdi), %xmm10
+; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa 16(%rdi), %xmm7
+; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa 32(%rdi), %xmm13
+; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa 48(%rdi), %xmm0
+; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pslld $16, %xmm0
; SSE-NEXT: psrad $16, %xmm0
-; SSE-NEXT: pslld $16, %xmm12
-; SSE-NEXT: psrad $16, %xmm12
-; SSE-NEXT: packssdw %xmm0, %xmm12
-; SSE-NEXT: movdqa %xmm4, %xmm0
+; SSE-NEXT: pslld $16, %xmm13
+; SSE-NEXT: psrad $16, %xmm13
+; SSE-NEXT: packssdw %xmm0, %xmm13
+; SSE-NEXT: movdqa %xmm7, %xmm0
; SSE-NEXT: pslld $16, %xmm0
; SSE-NEXT: psrad $16, %xmm0
; SSE-NEXT: pslld $16, %xmm10
; SSE-NEXT: psrad $16, %xmm10
; SSE-NEXT: packssdw %xmm0, %xmm10
-; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: movdqa %xmm4, %xmm0
; SSE-NEXT: pslld $16, %xmm0
; SSE-NEXT: psrad $16, %xmm0
; SSE-NEXT: pslld $16, %xmm9
; SSE-NEXT: psrad $16, %xmm9
; SSE-NEXT: packssdw %xmm0, %xmm9
-; SSE-NEXT: movdqa %xmm2, %xmm0
+; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: pslld $16, %xmm0
; SSE-NEXT: psrad $16, %xmm0
; SSE-NEXT: pslld $16, %xmm11
; SSE-NEXT: psrad $16, %xmm11
; SSE-NEXT: packssdw %xmm0, %xmm11
-; SSE-NEXT: movdqa %xmm3, %xmm0
+; SSE-NEXT: movdqa %xmm2, %xmm0
; SSE-NEXT: pslld $16, %xmm0
; SSE-NEXT: psrad $16, %xmm0
-; SSE-NEXT: pslld $16, %xmm13
-; SSE-NEXT: psrad $16, %xmm13
-; SSE-NEXT: packssdw %xmm0, %xmm13
-; SSE-NEXT: movdqa 240(%rdi), %xmm0
-; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pslld $16, %xmm14
+; SSE-NEXT: psrad $16, %xmm14
+; SSE-NEXT: packssdw %xmm0, %xmm14
+; SSE-NEXT: movdqa 144(%rdi), %xmm7
+; SSE-NEXT: movdqa %xmm7, %xmm0
; SSE-NEXT: pslld $16, %xmm0
; SSE-NEXT: psrad $16, %xmm0
-; SSE-NEXT: movdqa 224(%rdi), %xmm7
-; SSE-NEXT: movdqa %xmm7, %xmm15
+; SSE-NEXT: movdqa 128(%rdi), %xmm8
+; SSE-NEXT: movdqa %xmm8, %xmm15
; SSE-NEXT: pslld $16, %xmm15
; SSE-NEXT: psrad $16, %xmm15
; SSE-NEXT: packssdw %xmm0, %xmm15
-; SSE-NEXT: movdqa 80(%rdi), %xmm3
-; SSE-NEXT: movdqa %xmm3, %xmm1
+; SSE-NEXT: movdqa 240(%rdi), %xmm12
+; SSE-NEXT: movdqa %xmm12, %xmm1
; SSE-NEXT: pslld $16, %xmm1
; SSE-NEXT: psrad $16, %xmm1
-; SSE-NEXT: movdqa 64(%rdi), %xmm5
-; SSE-NEXT: movdqa %xmm5, %xmm4
+; SSE-NEXT: movdqa 224(%rdi), %xmm5
+; SSE-NEXT: movdqa %xmm5, %xmm3
+; SSE-NEXT: pslld $16, %xmm3
+; SSE-NEXT: psrad $16, %xmm3
+; SSE-NEXT: packssdw %xmm1, %xmm3
+; SSE-NEXT: movdqa 208(%rdi), %xmm6
+; SSE-NEXT: movdqa %xmm6, %xmm4
; SSE-NEXT: pslld $16, %xmm4
; SSE-NEXT: psrad $16, %xmm4
-; SSE-NEXT: packssdw %xmm1, %xmm4
-; SSE-NEXT: movdqa 208(%rdi), %xmm8
-; SSE-NEXT: movdqa %xmm8, %xmm6
-; SSE-NEXT: pslld $16, %xmm6
-; SSE-NEXT: psrad $16, %xmm6
; SSE-NEXT: movdqa 192(%rdi), %xmm2
; SSE-NEXT: movdqa %xmm2, %xmm1
; SSE-NEXT: pslld $16, %xmm1
; SSE-NEXT: psrad $16, %xmm1
-; SSE-NEXT: packssdw %xmm6, %xmm1
-; SSE-NEXT: psrad $16, %xmm14
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: psrad $16, %xmm0
-; SSE-NEXT: packssdw %xmm14, %xmm0
-; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: packssdw %xmm4, %xmm1
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: psrad $16, %xmm0
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; SSE-NEXT: psrad $16, %xmm6
-; SSE-NEXT: packssdw %xmm0, %xmm6
-; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; SSE-NEXT: psrad $16, %xmm4
+; SSE-NEXT: packssdw %xmm0, %xmm4
+; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: psrad $16, %xmm0
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
-; SSE-NEXT: psrad $16, %xmm14
-; SSE-NEXT: packssdw %xmm0, %xmm14
-; SSE-NEXT: psrad $16, %xmm3
-; SSE-NEXT: psrad $16, %xmm5
-; SSE-NEXT: packssdw %xmm3, %xmm5
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; SSE-NEXT: psrad $16, %xmm4
+; SSE-NEXT: packssdw %xmm0, %xmm4
+; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: psrad $16, %xmm0
-; SSE-NEXT: movdqa (%rsp), %xmm6 # 16-byte Reload
-; SSE-NEXT: psrad $16, %xmm6
-; SSE-NEXT: packssdw %xmm0, %xmm6
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; SSE-NEXT: psrad $16, %xmm4
+; SSE-NEXT: packssdw %xmm0, %xmm4
+; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
; SSE-NEXT: psrad $16, %xmm0
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; SSE-NEXT: psrad $16, %xmm3
-; SSE-NEXT: packssdw %xmm0, %xmm3
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; SSE-NEXT: psrad $16, %xmm4
+; SSE-NEXT: packssdw %xmm0, %xmm4
+; SSE-NEXT: psrad $16, %xmm7
+; SSE-NEXT: psrad $16, %xmm8
+; SSE-NEXT: packssdw %xmm7, %xmm8
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: psrad $16, %xmm0
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
; SSE-NEXT: psrad $16, %xmm7
; SSE-NEXT: packssdw %xmm0, %xmm7
-; SSE-NEXT: psrad $16, %xmm8
+; SSE-NEXT: psrad $16, %xmm6
; SSE-NEXT: psrad $16, %xmm2
-; SSE-NEXT: packssdw %xmm8, %xmm2
+; SSE-NEXT: packssdw %xmm6, %xmm2
+; SSE-NEXT: psrad $16, %xmm12
+; SSE-NEXT: psrad $16, %xmm5
+; SSE-NEXT: packssdw %xmm12, %xmm5
; SSE-NEXT: movdqa %xmm1, 96(%rsi)
-; SSE-NEXT: movdqa %xmm4, 32(%rsi)
-; SSE-NEXT: movdqa %xmm15, 112(%rsi)
-; SSE-NEXT: movdqa %xmm13, 48(%rsi)
-; SSE-NEXT: movdqa %xmm11, 64(%rsi)
-; SSE-NEXT: movdqa %xmm9, (%rsi)
-; SSE-NEXT: movdqa %xmm10, 80(%rsi)
-; SSE-NEXT: movdqa %xmm12, 16(%rsi)
+; SSE-NEXT: movdqa %xmm3, 112(%rsi)
+; SSE-NEXT: movdqa %xmm15, 64(%rsi)
+; SSE-NEXT: movdqa %xmm14, 80(%rsi)
+; SSE-NEXT: movdqa %xmm11, 32(%rsi)
+; SSE-NEXT: movdqa %xmm9, 48(%rsi)
+; SSE-NEXT: movdqa %xmm10, (%rsi)
+; SSE-NEXT: movdqa %xmm13, 16(%rsi)
+; SSE-NEXT: movdqa %xmm5, 112(%rdx)
; SSE-NEXT: movdqa %xmm2, 96(%rdx)
-; SSE-NEXT: movdqa %xmm7, 112(%rdx)
-; SSE-NEXT: movdqa %xmm3, 64(%rdx)
-; SSE-NEXT: movdqa %xmm6, 80(%rdx)
-; SSE-NEXT: movdqa %xmm5, 32(%rdx)
-; SSE-NEXT: movdqa %xmm14, 48(%rdx)
+; SSE-NEXT: movdqa %xmm7, 80(%rdx)
+; SSE-NEXT: movdqa %xmm8, 64(%rdx)
+; SSE-NEXT: movdqa %xmm4, 48(%rdx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movaps %xmm0, (%rdx)
+; SSE-NEXT: movaps %xmm0, 32(%rdx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 16(%rdx)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE-NEXT: movaps %xmm0, (%rdx)
; SSE-NEXT: addq $40, %rsp
; SSE-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll
index 8eb26687600404..22e353f2502d66 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll
@@ -1138,11 +1138,11 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; SSE-LABEL: load_i16_stride4_vf32:
; SSE: # %bb.0:
; SSE-NEXT: subq $248, %rsp
-; SSE-NEXT: movdqa 224(%rdi), %xmm3
+; SSE-NEXT: movdqa 160(%rdi), %xmm3
; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa 192(%rdi), %xmm4
+; SSE-NEXT: movdqa 128(%rdi), %xmm4
; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa 208(%rdi), %xmm5
+; SSE-NEXT: movdqa 144(%rdi), %xmm5
; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa 96(%rdi), %xmm2
; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -1162,22 +1162,22 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,2,2,3]
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm2[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm10[0,1,0,2,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm2[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm13[0,1,0,2,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm5[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[0,2,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm4[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm8[0,2,2,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm5[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm15[0,2,2,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm4[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm11[0,2,2,3,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE-NEXT: movdqa 240(%rdi), %xmm0
+; SSE-NEXT: movdqa 176(%rdi), %xmm0
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[0,1,0,2,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm3[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm7[0,1,0,2,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm10[0,1,0,2,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm3[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm9[0,1,0,2,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -1185,8 +1185,8 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa 16(%rdi), %xmm0
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[0,2,2,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[0,2,2,3,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm1[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm12[0,2,2,3,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
@@ -1194,33 +1194,33 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa 48(%rdi), %xmm0
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm0[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[0,1,0,2,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm2[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm14[0,1,0,2,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[0,1,0,2,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm6[0,1,0,2,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa 128(%rdi), %xmm0
+; SSE-NEXT: movdqa 192(%rdi), %xmm0
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa 144(%rdi), %xmm1
+; SSE-NEXT: movdqa 208(%rdi), %xmm1
; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm1[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm13[0,2,2,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm5[0,2,2,3,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[0,2,2,3,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE-NEXT: movdqa 160(%rdi), %xmm2
+; SSE-NEXT: movdqa 224(%rdi), %xmm2
; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa 176(%rdi), %xmm1
+; SSE-NEXT: movdqa 240(%rdi), %xmm1
; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[0,1,0,2,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm2[0,1,0,2,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1]
-; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm0[0],xmm15[1]
-; SSE-NEXT: movapd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm2[0,1,0,2,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1]
+; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm0[0],xmm14[1]
+; SSE-NEXT: movapd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshuflw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT: # xmm0 = mem[1,3,2,3,4,5,6,7]
; SSE-NEXT: pshuflw $237, (%rsp), %xmm1 # 16-byte Folded Reload
@@ -1228,33 +1228,32 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT: pshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT: # xmm0 = mem[0,1,1,3,4,5,6,7]
-; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[0,1,1,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1]
-; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm1[0],xmm10[1]
-; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[1,3,2,3,4,5,6,7]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm8[1,3,2,3,4,5,6,7]
+; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm13[0,1,1,3,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1]
+; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm1[0],xmm13[1]
+; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm15[1,3,2,3,4,5,6,7]
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm11[1,3,2,3,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[0,1,1,3,4,5,6,7]
-; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm7[0,1,1,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1]
-; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm1[0],xmm6[1]
-; SSE-NEXT: movapd %xmm6, (%rsp) # 16-byte Spill
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[1,3,2,3,4,5,6,7]
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm10[0,1,1,3,4,5,6,7]
+; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[0,1,1,3,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1]
+; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm1[0],xmm9[1]
+; SSE-NEXT: movapd %xmm9, (%rsp) # 16-byte Spill
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[1,3,2,3,4,5,6,7]
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm12[1,3,2,3,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[0,1,1,3,4,5,6,7]
-; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm14[0,1,1,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1]
-; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm1[0],xmm5[1]
-; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[1,3,2,3,4,5,6,7]
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[0,1,1,3,4,5,6,7]
+; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm6[0,1,1,3,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1]
+; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm1[0],xmm13[1]
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[1,3,2,3,4,5,6,7]
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[1,3,2,3,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,1,1,3,4,5,6,7]
-; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm2[0,1,1,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1]
-; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm1[0],xmm12[1]
+; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm2[0,1,1,3,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1]
+; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm1[0],xmm15[1]
; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT: # xmm0 = mem[3,1,2,3]
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -1267,12 +1266,12 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT: # xmm0 = mem[3,1,2,3]
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
-; SSE-NEXT: # xmm11 = mem[3,1,2,3]
+; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
+; SSE-NEXT: # xmm9 = mem[3,1,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7]
-; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm11[0,1,2,0,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1]
-; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm1[0],xmm10[1]
+; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm9[0,1,2,0,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1]
+; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm1[0],xmm5[1]
; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT: # xmm0 = mem[3,1,2,3]
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -1288,37 +1287,38 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
; SSE-NEXT: # xmm14 = mem[3,1,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7]
-; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm14[0,1,2,0,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1]
-; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm1[0],xmm15[1]
+; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm14[0,1,2,0,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1]
+; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm1[0],xmm11[1]
; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT: # xmm0 = mem[3,1,2,3]
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
-; SSE-NEXT: # xmm13 = mem[3,1,2,3]
+; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; SSE-NEXT: # xmm1 = mem[3,1,2,3]
+; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm13[2,0,2,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
-; SSE-NEXT: # xmm9 = mem[3,1,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,0,2,3,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
+; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
+; SSE-NEXT: # xmm12 = mem[3,1,2,3]
+; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
+; SSE-NEXT: # xmm10 = mem[3,1,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm12[0,1,2,0,4,5,6,7]
+; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm10[0,1,2,0,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
+; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm3[0],xmm4[1]
+; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
+; SSE-NEXT: # xmm8 = mem[3,1,2,3]
; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
; SSE-NEXT: # xmm7 = mem[3,1,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[0,1,2,0,4,5,6,7]
-; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm7[0,1,2,0,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1]
-; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm1[0],xmm8[1]
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm8[2,0,2,3,4,5,6,7]
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[2,0,2,3,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
; SSE-NEXT: # xmm6 = mem[3,1,2,3]
-; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
-; SSE-NEXT: # xmm5 = mem[3,1,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm6[2,0,2,3,4,5,6,7]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[2,0,2,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
-; SSE-NEXT: # xmm4 = mem[3,1,2,3]
; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
; SSE-NEXT: # xmm3 = mem[3,1,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[0,1,2,0,4,5,6,7]
+; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm6[0,1,2,0,4,5,6,7]
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[0,1,2,0,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
@@ -1329,9 +1329,9 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT: # xmm0 = mem[0,1,3,1,4,5,6,7]
-; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[0,1,3,1,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1]
-; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm2[0],xmm11[1]
+; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[0,1,3,1,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1]
+; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm2[0],xmm9[1]
; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT: # xmm0 = mem[3,1,2,3,4,5,6,7]
; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
@@ -1344,42 +1344,42 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm2[0],xmm14[1]
; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT: # xmm0 = mem[3,1,2,3,4,5,6,7]
-; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm13[3,1,2,3,4,5,6,7]
+; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
+; SSE-NEXT: # xmm2 = mem[3,1,2,3,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[0,1,3,1,4,5,6,7]
-; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,3,1,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1]
-; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm2[0],xmm7[1]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[3,1,2,3,4,5,6,7]
-; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm5[3,1,2,3,4,5,6,7]
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm12[0,1,3,1,4,5,6,7]
+; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[0,1,3,1,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1]
+; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm2[0],xmm10[1]
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[3,1,2,3,4,5,6,7]
+; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm7[3,1,2,3,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[0,1,3,1,4,5,6,7]
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[0,1,3,1,4,5,6,7]
; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm2[0],xmm3[1]
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movaps %xmm0, 32(%rsi)
+; SSE-NEXT: movaps %xmm0, 48(%rsi)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, (%rsi)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movaps %xmm0, 48(%rsi)
+; SSE-NEXT: movaps %xmm0, 32(%rsi)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 16(%rsi)
-; SSE-NEXT: movapd %xmm12, 32(%rdx)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movaps %xmm0, (%rdx)
+; SSE-NEXT: movapd %xmm15, 48(%rdx)
+; SSE-NEXT: movapd %xmm13, (%rdx)
; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
-; SSE-NEXT: movaps %xmm0, 48(%rdx)
+; SSE-NEXT: movaps %xmm0, 32(%rdx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 16(%rdx)
-; SSE-NEXT: movapd %xmm1, 32(%rcx)
-; SSE-NEXT: movapd %xmm8, (%rcx)
-; SSE-NEXT: movapd %xmm15, 48(%rcx)
-; SSE-NEXT: movapd %xmm10, 16(%rcx)
-; SSE-NEXT: movapd %xmm3, 32(%r8)
-; SSE-NEXT: movapd %xmm7, (%r8)
-; SSE-NEXT: movapd %xmm14, 48(%r8)
-; SSE-NEXT: movapd %xmm11, 16(%r8)
+; SSE-NEXT: movapd %xmm1, 48(%rcx)
+; SSE-NEXT: movapd %xmm4, 32(%rcx)
+; SSE-NEXT: movapd %xmm11, 16(%rcx)
+; SSE-NEXT: movapd %xmm5, (%rcx)
+; SSE-NEXT: movapd %xmm3, 48(%r8)
+; SSE-NEXT: movapd %xmm10, 32(%r8)
+; SSE-NEXT: movapd %xmm14, 16(%r8)
+; SSE-NEXT: movapd %xmm9, (%r8)
; SSE-NEXT: addq $248, %rsp
; SSE-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-2.ll
index 7bb842df05dd4d..53e01cc2f58099 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-2.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-2.ll
@@ -249,64 +249,64 @@ define void @load_i32_stride2_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no
define void @load_i32_stride2_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nounwind {
; SSE-LABEL: load_i32_stride2_vf32:
; SSE: # %bb.0:
-; SSE-NEXT: movaps (%rdi), %xmm0
-; SSE-NEXT: movaps 32(%rdi), %xmm1
-; SSE-NEXT: movaps 48(%rdi), %xmm8
-; SSE-NEXT: movaps 208(%rdi), %xmm9
-; SSE-NEXT: movaps 192(%rdi), %xmm3
+; SSE-NEXT: movaps (%rdi), %xmm1
+; SSE-NEXT: movaps 16(%rdi), %xmm8
+; SSE-NEXT: movaps 32(%rdi), %xmm0
+; SSE-NEXT: movaps 208(%rdi), %xmm11
+; SSE-NEXT: movaps 192(%rdi), %xmm2
+; SSE-NEXT: movaps 240(%rdi), %xmm10
+; SSE-NEXT: movaps 224(%rdi), %xmm4
+; SSE-NEXT: movaps 144(%rdi), %xmm14
+; SSE-NEXT: movaps 128(%rdi), %xmm3
+; SSE-NEXT: movaps 176(%rdi), %xmm12
+; SSE-NEXT: movaps 160(%rdi), %xmm6
; SSE-NEXT: movaps 80(%rdi), %xmm13
-; SSE-NEXT: movaps 64(%rdi), %xmm2
-; SSE-NEXT: movaps 240(%rdi), %xmm11
-; SSE-NEXT: movaps 224(%rdi), %xmm5
-; SSE-NEXT: movaps 112(%rdi), %xmm14
-; SSE-NEXT: movaps 96(%rdi), %xmm4
-; SSE-NEXT: movaps 144(%rdi), %xmm12
-; SSE-NEXT: movaps 128(%rdi), %xmm6
-; SSE-NEXT: movaps 176(%rdi), %xmm15
-; SSE-NEXT: movaps 160(%rdi), %xmm7
-; SSE-NEXT: movaps %xmm4, %xmm10
-; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm14[0,2]
-; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,3],xmm14[1,3]
-; SSE-NEXT: movaps %xmm2, %xmm14
-; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,2],xmm13[0,2]
-; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm13[1,3]
+; SSE-NEXT: movaps 64(%rdi), %xmm5
+; SSE-NEXT: movaps 112(%rdi), %xmm15
+; SSE-NEXT: movaps 96(%rdi), %xmm7
+; SSE-NEXT: movaps %xmm5, %xmm9
+; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,2],xmm13[0,2]
+; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,3],xmm13[1,3]
; SSE-NEXT: movaps %xmm7, %xmm13
; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,2],xmm15[0,2]
; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,3],xmm15[1,3]
-; SSE-NEXT: movaps %xmm6, %xmm15
-; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,2],xmm12[0,2]
+; SSE-NEXT: movaps %xmm3, %xmm15
+; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,2],xmm14[0,2]
+; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,3],xmm14[1,3]
+; SSE-NEXT: movaps %xmm6, %xmm14
+; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,2],xmm12[0,2]
; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,3],xmm12[1,3]
-; SSE-NEXT: movaps %xmm5, %xmm12
+; SSE-NEXT: movaps %xmm2, %xmm12
; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,2],xmm11[0,2]
-; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,3],xmm11[1,3]
-; SSE-NEXT: movaps %xmm3, %xmm11
-; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,2],xmm9[0,2]
-; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,3],xmm9[1,3]
-; SSE-NEXT: movaps %xmm1, %xmm9
-; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,2],xmm8[0,2]
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm11[1,3]
+; SSE-NEXT: movaps %xmm4, %xmm11
+; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,2],xmm10[0,2]
+; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,3],xmm10[1,3]
+; SSE-NEXT: movaps %xmm1, %xmm10
+; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm8[0,2]
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm8[1,3]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps 16(%rdi), %xmm8
+; SSE-NEXT: movaps 48(%rdi), %xmm8
; SSE-NEXT: movaps %xmm0, %xmm1
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm8[0,2]
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm8[1,3]
-; SSE-NEXT: movaps %xmm11, 96(%rsi)
-; SSE-NEXT: movaps %xmm14, 32(%rsi)
-; SSE-NEXT: movaps %xmm12, 112(%rsi)
-; SSE-NEXT: movaps %xmm10, 48(%rsi)
+; SSE-NEXT: movaps %xmm12, 96(%rsi)
+; SSE-NEXT: movaps %xmm11, 112(%rsi)
; SSE-NEXT: movaps %xmm15, 64(%rsi)
-; SSE-NEXT: movaps %xmm1, (%rsi)
-; SSE-NEXT: movaps %xmm13, 80(%rsi)
-; SSE-NEXT: movaps %xmm9, 16(%rsi)
-; SSE-NEXT: movaps %xmm3, 96(%rdx)
-; SSE-NEXT: movaps %xmm5, 112(%rdx)
-; SSE-NEXT: movaps %xmm6, 64(%rdx)
-; SSE-NEXT: movaps %xmm7, 80(%rdx)
-; SSE-NEXT: movaps %xmm2, 32(%rdx)
-; SSE-NEXT: movaps %xmm4, 48(%rdx)
-; SSE-NEXT: movaps %xmm0, (%rdx)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE-NEXT: movaps %xmm14, 80(%rsi)
+; SSE-NEXT: movaps %xmm9, 32(%rsi)
+; SSE-NEXT: movaps %xmm13, 48(%rsi)
+; SSE-NEXT: movaps %xmm10, (%rsi)
+; SSE-NEXT: movaps %xmm1, 16(%rsi)
+; SSE-NEXT: movaps %xmm4, 112(%rdx)
+; SSE-NEXT: movaps %xmm2, 96(%rdx)
+; SSE-NEXT: movaps %xmm6, 80(%rdx)
+; SSE-NEXT: movaps %xmm3, 64(%rdx)
+; SSE-NEXT: movaps %xmm7, 48(%rdx)
+; SSE-NEXT: movaps %xmm5, 32(%rdx)
; SSE-NEXT: movaps %xmm0, 16(%rdx)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE-NEXT: movaps %xmm0, (%rdx)
; SSE-NEXT: retq
;
; AVX1-ONLY-LABEL: load_i32_stride2_vf32:
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-4.ll
index 5d7df755956295..a2e7633f69554f 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-4.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-4.ll
@@ -294,30 +294,30 @@ define void @store_i64_stride4_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm6[1],xmm5[1]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
-; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm3
-; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm5
+; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm3
+; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm5
; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm5[0],xmm3[0]
-; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm7
-; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm8
+; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm7
+; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm8
; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm8[0],xmm7[0]
; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm5[1],xmm3[1]
; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm8[1],xmm7[1]
-; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm7
-; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm8
+; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm7
+; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm8
; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm10 = xmm8[0],xmm7[0]
-; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm11
-; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm12
+; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm11
+; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm12
; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm12[0],xmm11[0]
; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm8[1],xmm7[1]
; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm12[1],xmm11[1]
-; AVX1-ONLY-NEXT: vmovaps %xmm8, 48(%r8)
-; AVX1-ONLY-NEXT: vmovaps %xmm7, 32(%r8)
-; AVX1-ONLY-NEXT: vmovaps %xmm13, 16(%r8)
-; AVX1-ONLY-NEXT: vmovaps %xmm10, (%r8)
-; AVX1-ONLY-NEXT: vmovaps %xmm5, 176(%r8)
-; AVX1-ONLY-NEXT: vmovaps %xmm3, 160(%r8)
-; AVX1-ONLY-NEXT: vmovaps %xmm9, 144(%r8)
-; AVX1-ONLY-NEXT: vmovaps %xmm6, 128(%r8)
+; AVX1-ONLY-NEXT: vmovaps %xmm8, 32(%r8)
+; AVX1-ONLY-NEXT: vmovaps %xmm7, 48(%r8)
+; AVX1-ONLY-NEXT: vmovaps %xmm13, (%r8)
+; AVX1-ONLY-NEXT: vmovaps %xmm10, 16(%r8)
+; AVX1-ONLY-NEXT: vmovaps %xmm5, 160(%r8)
+; AVX1-ONLY-NEXT: vmovaps %xmm3, 176(%r8)
+; AVX1-ONLY-NEXT: vmovaps %xmm9, 128(%r8)
+; AVX1-ONLY-NEXT: vmovaps %xmm6, 144(%r8)
; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%r8)
; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%r8)
; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%r8)
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-5.ll
index 99da0be38c2b49..e52fd4013bf466 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-5.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-5.ll
@@ -164,11 +164,11 @@ define void @store_i64_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX1-ONLY-NEXT: vbroadcastsd 8(%rsi), %ymm6
; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm6[2],ymm0[3]
; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3]
-; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm1
-; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm5[0],mem[0]
+; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm5
; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm5 = xmm5[0],mem[0]
-; AVX1-ONLY-NEXT: vmovaps %xmm5, 16(%r9)
-; AVX1-ONLY-NEXT: vmovaps %xmm1, (%r9)
+; AVX1-ONLY-NEXT: vmovaps %xmm5, (%r9)
+; AVX1-ONLY-NEXT: vmovaps %xmm1, 16(%r9)
; AVX1-ONLY-NEXT: vmovapd %ymm4, 64(%r9)
; AVX1-ONLY-NEXT: vmovapd %ymm0, 32(%r9)
; AVX1-ONLY-NEXT: vmovapd %ymm2, 96(%r9)
@@ -378,10 +378,10 @@ define void @store_i64_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm7
; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm10
; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm10 = xmm10[0],mem[0]
-; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm7 = xmm7[0],mem[0]
; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm4[0],mem[0]
-; AVX1-ONLY-NEXT: vmovaps %xmm4, 16(%r9)
+; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm7 = xmm7[0],mem[0]
; AVX1-ONLY-NEXT: vmovaps %xmm7, (%r9)
+; AVX1-ONLY-NEXT: vmovaps %xmm4, 16(%r9)
; AVX1-ONLY-NEXT: vmovaps %xmm10, 160(%r9)
; AVX1-ONLY-NEXT: vmovaps %xmm3, 176(%r9)
; AVX1-ONLY-NEXT: vmovaps %ymm12, 64(%r9)
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-8.ll
index 87b5732cc1aa32..440d3b96d2b0f0 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-8.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-8.ll
@@ -248,14 +248,14 @@ define void @store_i64_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm7
; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm8
; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm10 = xmm8[1],xmm7[1]
+; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm11
+; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm12
+; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm13 = xmm12[1],xmm11[1]
; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm8[0],xmm7[0]
-; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm8
-; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm11
-; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm12 = xmm11[1],xmm8[1]
-; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm8 = xmm11[0],xmm8[0]
+; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm8 = xmm12[0],xmm11[0]
; AVX1-ONLY-NEXT: vmovaps %xmm8, 16(%rax)
-; AVX1-ONLY-NEXT: vmovaps %xmm12, 80(%rax)
; AVX1-ONLY-NEXT: vmovaps %xmm7, (%rax)
+; AVX1-ONLY-NEXT: vmovaps %xmm13, 80(%rax)
; AVX1-ONLY-NEXT: vmovaps %xmm10, 64(%rax)
; AVX1-ONLY-NEXT: vmovaps %xmm5, 48(%rax)
; AVX1-ONLY-NEXT: vmovaps %xmm3, 32(%rax)