[llvm] [DAGCombiner] Don't drop atomic property of original load. (PR #75626)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Dec 15 09:04:56 PST 2023
llvmbot wrote:
@llvm/pr-subscribers-llvm-selectiondag
@llvm/pr-subscribers-backend-x86
Author: Jonas Paulsson (JonPsson1)
It seems that visitBITCAST() fails to pass on the atomic property of the load it is converting:
```
Legalized selection DAG: %bb.0 'f1:'
SelectionDAG has 12 nodes:
t0: ch,glue = EntryToken
t6: f32 = bitcast t12
t4: f32,ch = CopyFromReg t0, Register:f32 %1
t7: f32 = fadd t6, t4
t9: ch,glue = CopyToReg t12:1, Register:f32 $f0s, t7
t2: i64,ch = CopyFromReg t0, Register:i64 %0
t12: i32,ch = load<(volatile load seq_cst (s32) from %ir.src)> t0, t2, undef:i64
t10: ch = SystemZISD::RET_GLUE t9, Register:f32 $f0s, t9:1
Optimized legalized selection DAG: %bb.0 'f1:'
SelectionDAG has 11 nodes:
t0: ch,glue = EntryToken
t4: f32,ch = CopyFromReg t0, Register:f32 %1
t7: f32 = fadd t13, t4
t9: ch,glue = CopyToReg t13:1, Register:f32 $f0s, t7
t2: i64,ch = CopyFromReg t0, Register:i64 %0
t13: f32,ch = load<(volatile load (s32) from %ir.src)> t0, t2, undef:i64
t10: ch = SystemZISD::RET_GLUE t9, Register:f32 $f0s, t9:1
```
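The underlying reason is that the atomic ordering is recorded on the MachineMemOperand itself and has no representation in the MachineMemOperand::Flags bitmask, so a load rebuilt from only the pointer info, alignment, flags and AA info ends up with a fresh, non-atomic MMO. A minimal sketch of where that information lives, assuming the usual MachineMemOperand API (the helper and its name are only illustrative):
```
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"

using namespace llvm;

// Returns true if rebuilding this load from PointerInfo/Align/Flags/AAInfo
// alone would lose an atomic ordering. Flags has bits such as MOLoad and
// MOVolatile, but the success/failure orderings are separate MMO fields.
static bool rebuildWouldDropOrdering(const LoadSDNode *LN0) {
  const MachineMemOperand &MMO = *LN0->getMemOperand();
  return MMO.isAtomic(); // getSuccessOrdering() != AtomicOrdering::NotAtomic
}
```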
This patch simply passes the original MMO instead of the individual values derived from it, which looks OK given that those values all come from the MMO anyway. This way, the seq_cst ordering is no longer dropped.
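For reference, a sketch of the two SelectionDAG::getLoad forms involved; it mirrors the DAGCombiner hunk further down, but the wrapper function and its name are illustrative only, not part of the patch:
```
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"

using namespace llvm;

// Illustrative wrapper contrasting the getLoad overloads used before and
// after the change.
static SDValue recreateLoad(SelectionDAG &DAG, SDNode *N, LoadSDNode *LN0,
                            EVT VT, bool ReuseMMO) {
  if (!ReuseMMO)
    // Old form: the memory operand is reassembled from individual pieces.
    // Flags carries volatility but has no encoding of the atomic ordering,
    // so the rebuilt load comes out non-atomic.
    return DAG.getLoad(VT, SDLoc(N), LN0->getChain(), LN0->getBasePtr(),
                       LN0->getPointerInfo(), LN0->getAlign(),
                       LN0->getMemOperand()->getFlags(), LN0->getAAInfo());
  // New form: hand the original MachineMemOperand to the new load so every
  // property recorded on it (alignment, volatility, AA info, seq_cst) is kept.
  return DAG.getLoad(VT, SDLoc(N), LN0->getChain(), LN0->getBasePtr(),
                     LN0->getMemOperand());
}
```
With the MMO reused, the new load keeps the seq_cst marker that is missing from the "Optimized legalized" dump above.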
I expected this to be NFC, but got some X86 test failures, which I auto-updated. The update_llc_test_checks.py script emitted warnings for these tests:
```
X86/vector-interleaved-load-i16-stride-4.ll
X86/vector-interleaved-store-i64-stride-4.ll
X86/vector-interleaved-store-i64-stride-5.ll
X86/vector-interleaved-store-i64-stride-8.ll
WARNING: Prefix AVX had conflicting output from different RUN lines for all functions in test ...
```
---
Patch is 41.99 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/75626.diff
8 Files Affected:
- (modified) llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp (+1-2)
- (modified) llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll (+4-4)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll (+81-79)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll (+102-102)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-2.ll (+45-45)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-4.ll (+16-16)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-5.ll (+6-6)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-8.ll (+5-5)
``````````diff
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 1d4bfa6fde0352..5934e3ca1b7219 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -15165,8 +15165,7 @@ SDValue DAGCombiner::visitBITCAST(SDNode *N) {
*LN0->getMemOperand())) {
SDValue Load =
DAG.getLoad(VT, SDLoc(N), LN0->getChain(), LN0->getBasePtr(),
- LN0->getPointerInfo(), LN0->getAlign(),
- LN0->getMemOperand()->getFlags(), LN0->getAAInfo());
+ LN0->getMemOperand());
DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Load.getValue(1));
return Load;
}
diff --git a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll
index 6d5f8a78cb1d70..99d9f6b41e70dd 100644
--- a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll
+++ b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll
@@ -3647,10 +3647,10 @@ define void @vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3(ptr %i
; AVX-NEXT: vmovdqa (%rdi), %xmm2
; AVX-NEXT: vmovdqa 16(%rdi), %xmm3
; AVX-NEXT: vpaddb 48(%rsi), %xmm3, %xmm3
-; AVX-NEXT: vpaddb 32(%rsi), %xmm2, %xmm2
; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
-; AVX-NEXT: vmovdqa %xmm0, 16(%rdx)
+; AVX-NEXT: vpaddb 32(%rsi), %xmm2, %xmm2
; AVX-NEXT: vmovdqa %xmm2, 32(%rdx)
+; AVX-NEXT: vmovdqa %xmm0, 16(%rdx)
; AVX-NEXT: vmovdqa %xmm3, 48(%rdx)
; AVX-NEXT: vmovdqa %xmm1, (%rdx)
; AVX-NEXT: vzeroupper
@@ -3833,10 +3833,10 @@ define void @vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3(ptr %i
; AVX-NEXT: vmovdqa (%rdi), %xmm2
; AVX-NEXT: vmovdqa 16(%rdi), %xmm3
; AVX-NEXT: vpaddb 48(%rsi), %xmm3, %xmm3
-; AVX-NEXT: vpaddb 32(%rsi), %xmm2, %xmm2
; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
-; AVX-NEXT: vmovdqa %xmm0, 16(%rdx)
+; AVX-NEXT: vpaddb 32(%rsi), %xmm2, %xmm2
; AVX-NEXT: vmovdqa %xmm2, 32(%rdx)
+; AVX-NEXT: vmovdqa %xmm0, 16(%rdx)
; AVX-NEXT: vmovdqa %xmm3, 48(%rdx)
; AVX-NEXT: vmovdqa %xmm1, (%rdx)
; AVX-NEXT: vzeroupper
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll
index eeea912a56a69a..04fd6d9300c18d 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll
@@ -501,136 +501,138 @@ define void @load_i16_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no
; SSE-LABEL: load_i16_stride2_vf64:
; SSE: # %bb.0:
; SSE-NEXT: subq $40, %rsp
-; SSE-NEXT: movdqa 96(%rdi), %xmm13
-; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa 112(%rdi), %xmm3
-; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa 128(%rdi), %xmm11
-; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa 144(%rdi), %xmm2
+; SSE-NEXT: movdqa 160(%rdi), %xmm14
+; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa 176(%rdi), %xmm2
; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa 160(%rdi), %xmm10
-; SSE-NEXT: movdqa %xmm10, (%rsp) # 16-byte Spill
-; SSE-NEXT: movdqa 176(%rdi), %xmm4
-; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa (%rdi), %xmm9
-; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa 16(%rdi), %xmm1
+; SSE-NEXT: movdqa 64(%rdi), %xmm11
+; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa 80(%rdi), %xmm1
; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa 32(%rdi), %xmm12
-; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa 48(%rdi), %xmm14
-; SSE-NEXT: movdqa %xmm14, %xmm0
+; SSE-NEXT: movdqa 96(%rdi), %xmm9
+; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa 112(%rdi), %xmm4
+; SSE-NEXT: movdqa %xmm4, (%rsp) # 16-byte Spill
+; SSE-NEXT: movdqa (%rdi), %xmm10
+; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa 16(%rdi), %xmm7
+; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa 32(%rdi), %xmm13
+; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa 48(%rdi), %xmm0
+; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pslld $16, %xmm0
; SSE-NEXT: psrad $16, %xmm0
-; SSE-NEXT: pslld $16, %xmm12
-; SSE-NEXT: psrad $16, %xmm12
-; SSE-NEXT: packssdw %xmm0, %xmm12
-; SSE-NEXT: movdqa %xmm4, %xmm0
+; SSE-NEXT: pslld $16, %xmm13
+; SSE-NEXT: psrad $16, %xmm13
+; SSE-NEXT: packssdw %xmm0, %xmm13
+; SSE-NEXT: movdqa %xmm7, %xmm0
; SSE-NEXT: pslld $16, %xmm0
; SSE-NEXT: psrad $16, %xmm0
; SSE-NEXT: pslld $16, %xmm10
; SSE-NEXT: psrad $16, %xmm10
; SSE-NEXT: packssdw %xmm0, %xmm10
-; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: movdqa %xmm4, %xmm0
; SSE-NEXT: pslld $16, %xmm0
; SSE-NEXT: psrad $16, %xmm0
; SSE-NEXT: pslld $16, %xmm9
; SSE-NEXT: psrad $16, %xmm9
; SSE-NEXT: packssdw %xmm0, %xmm9
-; SSE-NEXT: movdqa %xmm2, %xmm0
+; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: pslld $16, %xmm0
; SSE-NEXT: psrad $16, %xmm0
; SSE-NEXT: pslld $16, %xmm11
; SSE-NEXT: psrad $16, %xmm11
; SSE-NEXT: packssdw %xmm0, %xmm11
-; SSE-NEXT: movdqa %xmm3, %xmm0
+; SSE-NEXT: movdqa %xmm2, %xmm0
; SSE-NEXT: pslld $16, %xmm0
; SSE-NEXT: psrad $16, %xmm0
-; SSE-NEXT: pslld $16, %xmm13
-; SSE-NEXT: psrad $16, %xmm13
-; SSE-NEXT: packssdw %xmm0, %xmm13
-; SSE-NEXT: movdqa 240(%rdi), %xmm0
-; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pslld $16, %xmm14
+; SSE-NEXT: psrad $16, %xmm14
+; SSE-NEXT: packssdw %xmm0, %xmm14
+; SSE-NEXT: movdqa 144(%rdi), %xmm7
+; SSE-NEXT: movdqa %xmm7, %xmm0
; SSE-NEXT: pslld $16, %xmm0
; SSE-NEXT: psrad $16, %xmm0
-; SSE-NEXT: movdqa 224(%rdi), %xmm7
-; SSE-NEXT: movdqa %xmm7, %xmm15
+; SSE-NEXT: movdqa 128(%rdi), %xmm8
+; SSE-NEXT: movdqa %xmm8, %xmm15
; SSE-NEXT: pslld $16, %xmm15
; SSE-NEXT: psrad $16, %xmm15
; SSE-NEXT: packssdw %xmm0, %xmm15
-; SSE-NEXT: movdqa 80(%rdi), %xmm3
-; SSE-NEXT: movdqa %xmm3, %xmm1
+; SSE-NEXT: movdqa 240(%rdi), %xmm12
+; SSE-NEXT: movdqa %xmm12, %xmm1
; SSE-NEXT: pslld $16, %xmm1
; SSE-NEXT: psrad $16, %xmm1
-; SSE-NEXT: movdqa 64(%rdi), %xmm5
-; SSE-NEXT: movdqa %xmm5, %xmm4
+; SSE-NEXT: movdqa 224(%rdi), %xmm5
+; SSE-NEXT: movdqa %xmm5, %xmm3
+; SSE-NEXT: pslld $16, %xmm3
+; SSE-NEXT: psrad $16, %xmm3
+; SSE-NEXT: packssdw %xmm1, %xmm3
+; SSE-NEXT: movdqa 208(%rdi), %xmm6
+; SSE-NEXT: movdqa %xmm6, %xmm4
; SSE-NEXT: pslld $16, %xmm4
; SSE-NEXT: psrad $16, %xmm4
-; SSE-NEXT: packssdw %xmm1, %xmm4
-; SSE-NEXT: movdqa 208(%rdi), %xmm8
-; SSE-NEXT: movdqa %xmm8, %xmm6
-; SSE-NEXT: pslld $16, %xmm6
-; SSE-NEXT: psrad $16, %xmm6
; SSE-NEXT: movdqa 192(%rdi), %xmm2
; SSE-NEXT: movdqa %xmm2, %xmm1
; SSE-NEXT: pslld $16, %xmm1
; SSE-NEXT: psrad $16, %xmm1
-; SSE-NEXT: packssdw %xmm6, %xmm1
-; SSE-NEXT: psrad $16, %xmm14
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: psrad $16, %xmm0
-; SSE-NEXT: packssdw %xmm14, %xmm0
-; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: packssdw %xmm4, %xmm1
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: psrad $16, %xmm0
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; SSE-NEXT: psrad $16, %xmm6
-; SSE-NEXT: packssdw %xmm0, %xmm6
-; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; SSE-NEXT: psrad $16, %xmm4
+; SSE-NEXT: packssdw %xmm0, %xmm4
+; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: psrad $16, %xmm0
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
-; SSE-NEXT: psrad $16, %xmm14
-; SSE-NEXT: packssdw %xmm0, %xmm14
-; SSE-NEXT: psrad $16, %xmm3
-; SSE-NEXT: psrad $16, %xmm5
-; SSE-NEXT: packssdw %xmm3, %xmm5
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; SSE-NEXT: psrad $16, %xmm4
+; SSE-NEXT: packssdw %xmm0, %xmm4
+; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: psrad $16, %xmm0
-; SSE-NEXT: movdqa (%rsp), %xmm6 # 16-byte Reload
-; SSE-NEXT: psrad $16, %xmm6
-; SSE-NEXT: packssdw %xmm0, %xmm6
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; SSE-NEXT: psrad $16, %xmm4
+; SSE-NEXT: packssdw %xmm0, %xmm4
+; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
; SSE-NEXT: psrad $16, %xmm0
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; SSE-NEXT: psrad $16, %xmm3
-; SSE-NEXT: packssdw %xmm0, %xmm3
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; SSE-NEXT: psrad $16, %xmm4
+; SSE-NEXT: packssdw %xmm0, %xmm4
+; SSE-NEXT: psrad $16, %xmm7
+; SSE-NEXT: psrad $16, %xmm8
+; SSE-NEXT: packssdw %xmm7, %xmm8
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: psrad $16, %xmm0
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
; SSE-NEXT: psrad $16, %xmm7
; SSE-NEXT: packssdw %xmm0, %xmm7
-; SSE-NEXT: psrad $16, %xmm8
+; SSE-NEXT: psrad $16, %xmm6
; SSE-NEXT: psrad $16, %xmm2
-; SSE-NEXT: packssdw %xmm8, %xmm2
+; SSE-NEXT: packssdw %xmm6, %xmm2
+; SSE-NEXT: psrad $16, %xmm12
+; SSE-NEXT: psrad $16, %xmm5
+; SSE-NEXT: packssdw %xmm12, %xmm5
; SSE-NEXT: movdqa %xmm1, 96(%rsi)
-; SSE-NEXT: movdqa %xmm4, 32(%rsi)
-; SSE-NEXT: movdqa %xmm15, 112(%rsi)
-; SSE-NEXT: movdqa %xmm13, 48(%rsi)
-; SSE-NEXT: movdqa %xmm11, 64(%rsi)
-; SSE-NEXT: movdqa %xmm9, (%rsi)
-; SSE-NEXT: movdqa %xmm10, 80(%rsi)
-; SSE-NEXT: movdqa %xmm12, 16(%rsi)
+; SSE-NEXT: movdqa %xmm3, 112(%rsi)
+; SSE-NEXT: movdqa %xmm15, 64(%rsi)
+; SSE-NEXT: movdqa %xmm14, 80(%rsi)
+; SSE-NEXT: movdqa %xmm11, 32(%rsi)
+; SSE-NEXT: movdqa %xmm9, 48(%rsi)
+; SSE-NEXT: movdqa %xmm10, (%rsi)
+; SSE-NEXT: movdqa %xmm13, 16(%rsi)
+; SSE-NEXT: movdqa %xmm5, 112(%rdx)
; SSE-NEXT: movdqa %xmm2, 96(%rdx)
-; SSE-NEXT: movdqa %xmm7, 112(%rdx)
-; SSE-NEXT: movdqa %xmm3, 64(%rdx)
-; SSE-NEXT: movdqa %xmm6, 80(%rdx)
-; SSE-NEXT: movdqa %xmm5, 32(%rdx)
-; SSE-NEXT: movdqa %xmm14, 48(%rdx)
+; SSE-NEXT: movdqa %xmm7, 80(%rdx)
+; SSE-NEXT: movdqa %xmm8, 64(%rdx)
+; SSE-NEXT: movdqa %xmm4, 48(%rdx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movaps %xmm0, (%rdx)
+; SSE-NEXT: movaps %xmm0, 32(%rdx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 16(%rdx)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE-NEXT: movaps %xmm0, (%rdx)
; SSE-NEXT: addq $40, %rsp
; SSE-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll
index 8eb26687600404..22e353f2502d66 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll
@@ -1138,11 +1138,11 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; SSE-LABEL: load_i16_stride4_vf32:
; SSE: # %bb.0:
; SSE-NEXT: subq $248, %rsp
-; SSE-NEXT: movdqa 224(%rdi), %xmm3
+; SSE-NEXT: movdqa 160(%rdi), %xmm3
; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa 192(%rdi), %xmm4
+; SSE-NEXT: movdqa 128(%rdi), %xmm4
; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa 208(%rdi), %xmm5
+; SSE-NEXT: movdqa 144(%rdi), %xmm5
; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa 96(%rdi), %xmm2
; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -1162,22 +1162,22 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,2,2,3]
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm2[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm10[0,1,0,2,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm2[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm13[0,1,0,2,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm5[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[0,2,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm4[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm8[0,2,2,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm5[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm15[0,2,2,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm4[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm11[0,2,2,3,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE-NEXT: movdqa 240(%rdi), %xmm0
+; SSE-NEXT: movdqa 176(%rdi), %xmm0
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[0,1,0,2,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm3[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm7[0,1,0,2,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm10[0,1,0,2,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm3[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm9[0,1,0,2,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -1185,8 +1185,8 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa 16(%rdi), %xmm0
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[0,2,2,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[0,2,2,3,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm1[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm12[0,2,2,3,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
@@ -1194,33 +1194,33 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa 48(%rdi), %xmm0
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm0[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[0,1,0,2,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm2[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm14[0,1,0,2,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[0,1,0,2,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm6[0,1,0,2,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa 128(%rdi), %xmm0
+; SSE-NEXT: movdqa 192(%rdi), %xmm0
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa 144(%rdi), %xmm1
+; SSE-NEXT: movdqa 208(%rdi), %xmm1
; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm1[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm13[0,2,2,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm5[0,2,2,3,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[0,2,2,3,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE-NEXT: movdqa 160(%rdi), %xmm2
+; SSE-NEXT: movdqa 224(%rdi), %xmm2
; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa 176(%rdi), %xmm1
+; SSE-NEXT: movdqa 240(%rdi), %xmm1
; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[0,1,0,2,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm2[0,1,0,2,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1]
-; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm0[0],xmm15[1]
-; SSE-NEXT: movapd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm2[0,1,0,2,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1]
+; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm0[0],xmm14[1]
+; SSE-NEXT: movapd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshuflw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT: # xmm0 = mem[1,3,2,3,4,5,6,7]
; SSE-NEXT: pshuflw $237, (%rsp), %xmm1 # 16-byte Folded Reload
@@ -1228,33 +1228,32 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT: pshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT: # xmm0 = mem[0,1,1,3,4,5,6,7]
-; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[0,1,1,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1]
-; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm1[0],xmm10[1]
-; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[1,3,2,3,4,5,6,7]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm8[1,3,2,3,4,5,6,7]
+; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm13[0,1,1,3,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1]
+; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm1[0],xmm13[1]
+; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm15[1,3,2,3,4,5,6,7]
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm11[1,3,2,3,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[0,1,1,3,4,5,6,7]
-; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm7[0,1,1,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1]
-; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm1[0],xmm6[1]
-; SSE-NEXT: movapd %xmm6, (%rsp) # 16-byte Spill
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[1,3,2,3,4,5,6,7]
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm10[0,1,1,3,4,5,6,7]
+; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[0,1,1,3,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9...
[truncated]
``````````
https://github.com/llvm/llvm-project/pull/75626